适配deepseekv3\v2 moe awq的推理支持

47bd229c · yangql · 4a734b9d · 47bd229c · 47bd229c · 47bd229c
Commit 47bd229c authored Feb 20, 2025 by yangql
20 changed files
--- a/benchmarks/kernels/benchmark_moe_int4.py
+++ b/benchmarks/kernels/benchmark_moe_int4.py
+# SPDX-License-Identifier: Apache-2.0
+import argparse
+import time
+from datetime import datetime
+from itertools import product
+from typing import Any, Dict, List, Tuple, TypedDict
+import ray
+import torch
+import triton
+from ray.experimental.tqdm_ray import tqdm
+from transformers import AutoConfig
+from vllm.model_executor.layers.fused_moe.fused_moe import *
+from vllm.platforms import current_platform
+from vllm.utils import FlexibleArgumentParser
+# FP8 dtype used when casting weights for the fp8_w8a8 path: the "fnuz"
+# variant is selected on ROCm, the plain e4m3fn variant on every other platform.
+FP8_DTYPE = torch.float8_e4m3fnuz if current_platform.is_rocm(
+) else torch.float8_e4m3fn
+class BenchmarkConfig(TypedDict):
+    """One candidate kernel configuration passed to ``override_config``.
+
+    The keys mirror the entries written to the tuned-config JSON files.
+    """
+    # Block/group sizes and launch parameters swept by the tuning spaces
+    # defined in this file.
+    BLOCK_SIZE_M: int
+    BLOCK_SIZE_N: int
+    BLOCK_SIZE_K: int
+    GROUP_SIZE_M: int
+    num_warps: int
+    num_stages: int
+    # Only populated by the int4 and nn_moe ROCm tuning spaces in this file;
+    # NOTE(review): exact kernel semantics are not visible here - confirm.
+    num_ldmatrixes: Optional[int]
+def benchmark_config(
+    config: BenchmarkConfig,
+    num_tokens: int,
+    num_experts: int,
+    shard_intermediate_size: int,
+    hidden_size: int,
+    topk: int,
+    dtype: torch.dtype,
+    use_fp8_w8a8: bool,
+    use_int8_w8a16: bool,
+    use_int4_w4a16: bool,
+    group_size: int,
+    num_iters: int = 100,
+    nn_moe: Optional[bool] = False
+) -> float:
+    """Time ``fused_moe`` under one kernel config and return the mean latency.
+
+    Random activations, expert weights, scales and (for int4) zero points are
+    created for the requested quantization mode. The kernel is warmed up and
+    then ``num_iters`` individual calls are timed with CUDA events.
+
+    Args:
+        config: Kernel configuration applied through ``override_config``.
+        num_tokens: Number of rows of the activation tensor.
+        num_experts: Size of the leading (expert) dimension of the weights.
+        shard_intermediate_size: Per-shard size of the fused w1 output.
+        hidden_size: Model hidden size.
+        topk: Number of experts selected per token.
+        dtype: Activation dtype (also the weight dtype when unquantized).
+        use_fp8_w8a8: Benchmark the fp8 weight/activation path.
+        use_int8_w8a16: Benchmark the int8-weight path.
+        use_int4_w4a16: Benchmark the packed int4-weight path.
+        group_size: Quantization group size; only read on the int4 path.
+        num_iters: Number of timed invocations.
+        nn_moe: Swap the last two weight dimensions; forced off for int4.
+
+    Returns:
+        Mean latency of a single ``fused_moe`` call, in microseconds.
+    """
+    # Imported once up front so the import machinery is not part of the
+    # timed region inside run().
+    from vllm.model_executor.layers.fused_moe import override_config
+    init_dtype = torch.float16 if use_fp8_w8a8 else dtype
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
+    # nn_moe swaps the last two dimensions of both weight tensors.
+    if nn_moe:
+        w1_shape = (num_experts, hidden_size, shard_intermediate_size)
+        w2_shape = (num_experts, shard_intermediate_size // 2, hidden_size)
+    else:
+        w1_shape = (num_experts, shard_intermediate_size, hidden_size)
+        w2_shape = (num_experts, hidden_size, shard_intermediate_size // 2)
+    # The three initialisations are mutually exclusive. They must form one
+    # if/elif/else chain, otherwise the float fallback overwrites the int8
+    # weights created in the first branch.
+    if use_int8_w8a16:
+        w1 = torch.randint(-127, 127, w1_shape, dtype=torch.int8)
+        w2 = torch.randint(-127, 127, w2_shape, dtype=torch.int8)
+    elif use_int4_w4a16:
+        # Packed uint8 storage; the nn_moe layout is never used for int4.
+        w1 = torch.randint(0,
+                           255, (
+                               num_experts,
+                               shard_intermediate_size,
+                               hidden_size // 2,
+                           ),
+                           dtype=torch.uint8)
+        w2 = torch.randint(0,
+                           255, (
+                               num_experts,
+                               hidden_size,
+                               shard_intermediate_size // 4,
+                           ),
+                           dtype=torch.uint8)
+    else:
+        w1 = torch.randn(w1_shape, dtype=init_dtype)
+        w2 = torch.randn(w2_shape, dtype=init_dtype)
+    # One independent set of router logits per timed iteration.
+    gating_output = torch.randn(num_iters,
+                                num_tokens,
+                                num_experts,
+                                dtype=torch.float32)
+    w1_scale = None
+    w2_scale = None
+    a1_scale = None
+    a2_scale = None
+    w1_zp = None
+    w2_zp = None
+    block_shape = None
+    if use_int8_w8a16:
+        w1_scale = torch.randn((num_experts, 2 * shard_intermediate_size),
+                               dtype=torch.float32)
+        w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32)
+    if use_fp8_w8a8:
+        w1_scale = torch.randn(num_experts, dtype=torch.float32)
+        w2_scale = torch.randn(num_experts, dtype=torch.float32)
+        a1_scale = torch.randn(1, dtype=torch.float32)
+        a2_scale = torch.randn(1, dtype=torch.float32)
+        w1 = w1.to(FP8_DTYPE)
+        w2 = w2.to(FP8_DTYPE)
+    if use_int4_w4a16:
+        # Group-wise scales and packed zero points, one per `group_size`
+        # slice of the reduction dimension.
+        w1_scale = torch.randn(
+            (num_experts, shard_intermediate_size, hidden_size // group_size),
+            dtype=torch.float16)
+        w2_scale = torch.randn(
+            (num_experts, hidden_size,
+             shard_intermediate_size // (2 * group_size)),
+            dtype=torch.float16)
+        w1_zp = torch.randint(0,
+                              255, (
+                                  num_experts,
+                                  shard_intermediate_size // 2,
+                                  hidden_size // group_size,
+                              ),
+                              dtype=torch.uint8)
+        w2_zp = torch.randint(0,
+                              255, (
+                                  num_experts,
+                                  hidden_size // 2,
+                                  shard_intermediate_size // (2 * group_size),
+                              ),
+                              dtype=torch.uint8)
+        nn_moe = False
+        block_shape = [0, group_size]
+    input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32)
+    print(f"shape: {x.shape[0]} | config: {config}")
+    def prepare(i: int):
+        # Refresh the router logits in place so every iteration routes
+        # tokens differently.
+        input_gating.copy_(gating_output[i])
+    def run():
+        with override_config(config):
+            fused_moe(
+                x,
+                w1,
+                w2,
+                input_gating,
+                topk,
+                renormalize=True,
+                inplace=True,
+                use_fp8_w8a8=use_fp8_w8a8,
+                use_int8_w8a16=use_int8_w8a16,
+                use_int4_w4a16=use_int4_w4a16,
+                w1_scale=w1_scale,
+                w2_scale=w2_scale,
+                w1_zp=w1_zp,
+                w2_zp=w2_zp,
+                a1_scale=a1_scale,
+                a2_scale=a2_scale,
+                use_nn_moe=nn_moe,
+                block_shape=block_shape,
+            )
+    # JIT compilation & warmup
+    run()
+    torch.cuda.synchronize()
+    # NOTE: CUDA graph capture is not used here, so every timed iteration
+    # below is exactly one fused_moe call.
+    # Warmup
+    for _ in range(5):
+        run()
+    torch.cuda.synchronize()
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    latencies: List[float] = []
+    for i in range(num_iters):
+        prepare(i)
+        torch.cuda.synchronize()
+        start_event.record()
+        run()
+        end_event.record()
+        end_event.synchronize()
+        latencies.append(start_event.elapsed_time(end_event))
+    # elapsed_time() reports milliseconds. Each sample covers a single call,
+    # so divide by num_iters only (not num_iters * 10) before converting.
+    avg = sum(latencies) / num_iters * 1000  # us
+    print(f"avg: {avg}")
+    return avg
+def get_rocm_tuning_space(use_fp16, use_int4_w4a16, nn_moe: Optional[bool] = False):
+    """Return the ROCm tuning space as a mapping of parameter -> candidates.
+
+    Every value is a list so the caller can take the cartesian product of
+    all values to enumerate concrete configurations.
+
+    Args:
+        use_fp16: True when no quantized path is selected; enables the
+            ``matrix_instr_nonkdim`` and ``kpack`` parameters.
+        use_int4_w4a16: Return the dedicated (smaller) int4 search space.
+        nn_moe: Add ``num_ldmatrixes`` to the non-int4 space.
+    """
+    if use_int4_w4a16:
+        return {
+            "BLOCK_SIZE_M": [16, 32, 64],
+            "BLOCK_SIZE_N": [32, 64, 128],
+            "BLOCK_SIZE_K": [16, 32, 64],
+            "GROUP_SIZE_M": [1, 4, 8, 16],
+            "num_warps": [1, 2, 4, 8],
+            "num_stages": [2, 4],
+            "num_ldmatrixes": [0],
+        }
+    block_mn_range = [16, 32, 64, 128, 256]
+    block_k_range = [16, 32, 64, 128, 256]
+    if not use_fp16:
+        block_k_range.remove(16)  # BLOCK_K=16 not supported for fp8
+    param_ranges = {
+        "BLOCK_SIZE_M": block_mn_range,
+        "BLOCK_SIZE_N": block_mn_range,
+        "BLOCK_SIZE_K": block_k_range,
+        "GROUP_SIZE_M": [1, 4, 8, 16, 32],
+        "num_warps": [1, 2, 4, 8],
+        "num_stages": [2],
+        "waves_per_eu": [0],
+    }
+    if nn_moe:
+        # Must be a one-element list, not a bare int: the caller unpacks
+        # every value into itertools.product(), which needs iterables.
+        param_ranges["num_ldmatrixes"] = [1]
+    if use_fp16:
+        param_ranges["matrix_instr_nonkdim"] = [16, 32]
+        param_ranges["kpack"] = [1, 2]
+    return param_ranges
+def get_configs_compute_bound(use_fp16, use_int4_w4a16, nn_moe: Optional[bool] = False) -> List[Dict[str, int]]:
+    """Enumerate every kernel configuration in the platform's search space.
+
+    On ROCm the space comes from ``get_rocm_tuning_space``; on other
+    platforms a fixed, reduced space is used. The result is the cartesian
+    product of all parameter ranges, one dict per combination.
+    """
+    if current_platform.is_rocm():
+        param_ranges = get_rocm_tuning_space(use_fp16, use_int4_w4a16, nn_moe)
+    else:
+        # Reduced search space for faster tuning.
+        # TODO(woosuk): Increase the search space and use a performance model to
+        # prune the search space.
+        param_ranges = {
+            "BLOCK_SIZE_M": [16, 32, 64, 128, 256],
+            "BLOCK_SIZE_N": [32, 64, 128, 256],
+            "BLOCK_SIZE_K": [64, 128, 256],
+            "GROUP_SIZE_M": [1, 16, 32, 64],
+            "num_warps": [4, 8],
+            "num_stages": [2, 3, 4, 5],
+        }
+    names, candidates = zip(*param_ranges.items())
+    configs: List[BenchmarkConfig] = [
+        dict(zip(names, combination)) for combination in product(*candidates)
+    ]
+    return configs
+def prune_rocm_search_space(num_tokens, shard_intermediate_size, hidden_size,
+                            search_space, is_fp16):
+    """Prune ``search_space`` for both MoE GEMMs and merge the survivors.
+
+    The first GEMM uses N=shard_intermediate_size, K=hidden_size; the second
+    uses N=hidden_size, K=shard_intermediate_size // 2. Both are pruned with
+    M = num_tokens * 2.
+    """
+    rows = num_tokens * 2
+    kept_for_gemm1 = prune_rocm_configs(rows, shard_intermediate_size,
+                                        hidden_size, search_space, is_fp16)
+    kept_for_gemm2 = prune_rocm_configs(rows, hidden_size,
+                                        shard_intermediate_size // 2,
+                                        search_space, is_fp16)
+    return merge_unique_dicts(kept_for_gemm1, kept_for_gemm2)
+# The following code is inspired by ROCm/Triton GEMM tuning script:
+# https://github.com/ROCm/triton/blob/triton-mlir/scripts/amd/gemm/tune_gemm.py#L89
+def prune_rocm_configs(M, N, K, configs, is_fp16=True):
+    """Return the configs from ``configs`` that pass all pruning rules.
+
+    Rules are evaluated per config, in a fixed order, for a GEMM of shape
+    (M, N, K); the first failing rule discards the config. Input order is
+    preserved in the result.
+    """
+    bytes_per_elem = 2 if is_fp16 else 1
+    mfma = 32 if (M >= 32 and N >= 32) else 16
+    # TODO (zhanglx): figure out the boundary between large and small gemms
+    large_gemm = M >= 2048 and N >= 2048
+    def _is_viable(config):
+        block_m = config.get("BLOCK_SIZE_M")
+        block_n = config.get("BLOCK_SIZE_N")
+        block_k = config.get("BLOCK_SIZE_K")
+        num_warps = config.get("num_warps")
+        if is_fp16:
+            nonkdim = config.get("matrix_instr_nonkdim")
+            if nonkdim > mfma:
+                return False
+        if mfma == 4 and block_k < 64:
+            return False
+        # some layouts could not work properly in case
+        # number elements per thread is less 1
+        if block_m * block_n < 64:
+            return False
+        split_k = config.get("SPLIT_K", 1)
+        group_m = config.get("GROUP_SIZE_M")
+        if is_fp16:
+            if nonkdim > block_m or nonkdim > block_n:
+                return False
+            if nonkdim >= M and nonkdim != block_m:
+                return False
+            if nonkdim >= N and nonkdim != block_n:
+                return False
+        # Skip BLOCK_SIZE that is too large compare to M/N
+        # unless BLOCK_SIZE is already small enough
+        if M * 2 < block_m and block_m != 16:
+            return False
+        if N * 2 < block_n and block_n != 16:
+            return False
+        # skip large split_k when not necessary
+        if split_k != 1 and not need_split_k(M, N, K):
+            return False
+        # skip split_k that leads to EVEN_K = false
+        if K % (split_k * block_k) != 0:
+            return False
+        # skip large GROUP_M
+        if group_m * block_m > M and group_m != 1:
+            return False
+        # out of shared memory resource
+        # TODO (zhanglx): This does not consider the LDS usage in the epilogue
+        lds_bytes = block_k * (block_m + block_n) * bytes_per_elem
+        if lds_bytes > 65536:
+            return False
+        # Skip small block sizes and num_warps for large gemm
+        # For fp16 and f8, we want to only use BLOCK_SIZE >= 64
+        if large_gemm:
+            if block_m < 64 or block_n < 64:
+                return False
+            if block_k < 64:
+                return False
+            if num_warps < 4:
+                return False
+        return True
+    return [config for config in configs if _is_viable(config)]
+def need_split_k(SIZE_M, SIZE_N, SIZE_K):
+    """Return True when K exceeds 1024 and M or N is below 64."""
+    if SIZE_K <= 1024:
+        return False
+    return SIZE_M < 64 or SIZE_N < 64
+def merge_unique_dicts(list1, list2):
+    """Concatenate two lists of dicts, dropping duplicates.
+
+    The first occurrence of each dict is kept and order is preserved.
+    Dicts are unhashable, so membership is checked by equality.
+    """
+    unique = []
+    for source in (list1, list2):
+        for entry in source:
+            if entry not in unique:
+                unique.append(entry)
+    return unique
+@ray.remote(num_gpus=1)
+class BenchmarkWorker:
+    """Ray actor bound to one GPU that benchmarks or tunes the MoE kernel."""
+    def __init__(self, seed: int) -> None:
+        torch.set_default_device("cuda")
+        current_platform.seed_everything(seed)
+        self.seed = seed
+        # Get the device ID to allocate tensors and kernels
+        # on the respective GPU. This is required for Ray to work
+        # correctly with multi-GPU tuning on the ROCm platform.
+        self.device_id = int(ray.get_gpu_ids()[0])
+    def benchmark(
+        self,
+        num_tokens: int,
+        num_experts: int,
+        shard_intermediate_size: int,
+        hidden_size: int,
+        topk: int,
+        dtype: torch.dtype,
+        use_fp8_w8a8: bool,
+        use_int8_w8a16: bool,
+        use_int4_w4a16: bool,
+        group_size:int,
+    ) -> Tuple[Dict[str, int], float]:
+        """Benchmark the stored (or default) config for one batch size.
+
+        Returns the config that was used and the latency reported by
+        ``benchmark_config``.
+        """
+        current_platform.seed_everything(self.seed)
+        dtype_str = get_config_dtype_str(dtype,
+                                         use_int4_w4a16=use_int4_w4a16,
+                                         use_int8_w8a16=use_int8_w8a16,
+                                         use_fp8_w8a8=use_fp8_w8a8)
+        # NOTE(woosuk): The current naming convention uses w2.shape[2], which
+        # is the intermediate size after silu_and_mul.
+        # The int4 path halves the size once more before the lookup.
+        config_shard_intermediate_size = (shard_intermediate_size // 2
+                                          if use_int4_w4a16 else
+                                          shard_intermediate_size)
+        tuned_configs = get_moe_configs(num_experts,
+                                        config_shard_intermediate_size // 2,
+                                        dtype_str)
+        if tuned_configs is not None:
+            # Use the entry tuned for the batch size closest to num_tokens.
+            nearest = min(tuned_configs.keys(),
+                          key=lambda m: abs(m - num_tokens))
+            config = tuned_configs[nearest]
+        else:
+            config = get_default_config(num_tokens,
+                                        num_experts,
+                                        config_shard_intermediate_size,
+                                        hidden_size,
+                                        topk,
+                                        dtype_str,
+                                        is_marlin=False)
+        kernel_time = benchmark_config(config, num_tokens, num_experts,
+                                       shard_intermediate_size, hidden_size,
+                                       topk, dtype, use_fp8_w8a8,
+                                       use_int8_w8a16, use_int4_w4a16,
+                                       group_size)
+        return config, kernel_time
+    def tune(
+        self,
+        num_tokens: int,
+        num_experts: int,
+        shard_intermediate_size: int,
+        hidden_size: int,
+        topk: int,
+        dtype: torch.dtype,
+        use_fp8_w8a8: bool,
+        use_int8_w8a16: bool,
+        use_int4_w4a16: bool,
+        group_size: int,
+        search_space: List[Dict[str, int]],
+        nn_moe: Optional[bool] = False
+    ) -> Dict[str, int]:
+        """Benchmark every config in ``search_space`` and return the fastest.
+
+        On ROCm the search space is pruned first. Configs that raise
+        Triton's ``OutOfResources`` are skipped.
+        """
+        if current_platform.is_rocm():
+            is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16 or use_int4_w4a16)
+            search_space = prune_rocm_search_space(num_tokens,
+                                                   shard_intermediate_size,
+                                                   hidden_size, search_space,
+                                                   is_fp16)
+        best_config = None
+        best_time = float("inf")
+        with torch.cuda.device(self.device_id):
+            for candidate in tqdm(search_space):
+                try:
+                    elapsed = benchmark_config(candidate,
+                                               num_tokens,
+                                               num_experts,
+                                               shard_intermediate_size,
+                                               hidden_size,
+                                               topk,
+                                               dtype,
+                                               use_fp8_w8a8,
+                                               use_int8_w8a16,
+                                               use_int4_w4a16,
+                                               group_size,
+                                               num_iters=20,
+                                               nn_moe=nn_moe)
+                except triton.runtime.autotuner.OutOfResources:
+                    # Some configurations may be invalid and fail to compile.
+                    continue
+                else:
+                    if elapsed < best_time:
+                        best_time, best_config = elapsed, candidate
+        now = datetime.now()
+        print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
+        assert best_config is not None
+        return best_config
+def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
+    """Return a copy of ``config`` with its keys in canonical order.
+
+    The six core keys always come first (a missing one raises ``KeyError``),
+    followed by whichever optional keys are present. Any other key in
+    ``config`` is dropped.
+    """
+    required_keys = (
+        "BLOCK_SIZE_M",
+        "BLOCK_SIZE_N",
+        "BLOCK_SIZE_K",
+        "GROUP_SIZE_M",
+        "num_warps",
+        "num_stages",
+    )
+    optional_keys = (
+        "num_ldmatrixes",
+        "waves_per_eu",
+        "matrix_instr_nonkdim",
+        "kpack",
+    )
+    ordered = {key: config[key] for key in required_keys}
+    for key in optional_keys:
+        if key in config:
+            ordered[key] = config[key]
+    return ordered
+def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int,
+                 shard_intermediate_size: int, hidden_size: int, topk: int,
+                 dtype: torch.dtype, use_fp8_w8a8: bool,
+                 use_int8_w8a16: bool, use_int4_w4a16: bool, use_nn_moe: Optional[bool] = False) -> None:
+    """Write the tuned configs to the JSON file named after this setup.
+
+    The file name is produced by ``get_config_file_name`` from the expert
+    count, half of ``shard_intermediate_size``, the dtype string and the
+    ``use_nn_moe`` flag. ``hidden_size`` and ``topk`` are accepted but not
+    used here.
+    """
+    quant_tag = get_config_dtype_str(dtype,
+                                     use_int8_w8a16=use_int8_w8a16,
+                                     use_int4_w4a16=use_int4_w4a16,
+                                     use_fp8_w8a8=use_fp8_w8a8)
+    # NOTE(woosuk): The current naming convention uses w2.shape[2], which
+    # is the intermediate size after silu_and_mul.
+    intermediate_after_act = shard_intermediate_size // 2
+    filename = get_config_file_name(num_experts, intermediate_after_act,
+                                    quant_tag, use_nn_moe=use_nn_moe)
+    print(f"Writing best config to {filename}...")
+    with open(filename, "w") as out_file:
+        json.dump(configs, out_file, indent=4)
+        out_file.write("\n")
+def main(args: argparse.Namespace):
+    """Benchmark or tune the fused MoE kernel for the model in ``args``.
+
+    Reads the MoE dimensions from the model config, spawns one
+    ``BenchmarkWorker`` per available GPU and distributes the batch sizes
+    across them. With ``args.tune`` the best config per batch size is
+    written via ``save_configs``; otherwise the latency of the current
+    config is printed.
+
+    Raises:
+        ValueError: if int4_w4a16 is requested but no AWQ group size could
+            be read from the model config.
+    """
+    print(args)
+    config = AutoConfig.from_pretrained(
+        args.model, trust_remote_code=args.trust_remote_code)
+    group_size = None
+    architecture = config.architectures[0]
+    if architecture == "DbrxForCausalLM":
+        E = config.ffn_config.moe_num_experts
+        topk = config.ffn_config.moe_top_k
+        intermediate_size = config.ffn_config.ffn_hidden_size
+    elif architecture == "JambaForCausalLM":
+        E = config.num_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.intermediate_size
+    # Membership test: `x == "A" or "B"` is always truthy and would make the
+    # Mixtral default branch below unreachable.
+    elif architecture in ("DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"):
+        E = config.n_routed_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.moe_intermediate_size
+        # Unquantized checkpoints carry no quantization_config at all.
+        quant_config = getattr(config, "quantization_config", None) or {}
+        if quant_config.get("quant_method") == "awq":
+            group_size = quant_config["group_size"]
+    else:
+        # Default: Mixtral.
+        E = config.num_local_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.intermediate_size
+    shard_intermediate_size = 2 * intermediate_size // args.tp_size
+    hidden_size = config.hidden_size
+    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
+    use_int8_w8a16 = args.dtype == "int8_w8a16"
+    use_int4_w4a16 = args.dtype == "int4_w4a16"
+    if use_int4_w4a16 and group_size is None:
+        # Fail here with a clear message instead of a TypeError on
+        # `hidden_size // None` inside a remote worker.
+        raise ValueError(
+            "--dtype int4_w4a16 requires an AWQ-quantized model whose "
+            "quantization_config provides 'group_size'.")
+    if args.batch_size is None:
+        batch_sizes = [
+            1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
+            2048, 3072, 4096
+        ]
+    else:
+        batch_sizes = [args.batch_size]
+    ray.init(address=None,
+                 ignore_reinit_error=True,
+                 num_gpus=1)
+    num_gpus = int(ray.available_resources()["GPU"])
+    workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
+    def _distribute(method: str, inputs: List[Any]) -> List[Any]:
+        # Round-robin the work items over the workers, then block until all
+        # remote calls have finished.
+        pending = [
+            getattr(workers[idx % num_gpus], method).remote(*input_args)
+            for idx, input_args in enumerate(inputs)
+        ]
+        return ray.get(pending)
+    if args.tune:
+        is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16 or use_int4_w4a16)
+        search_space = get_configs_compute_bound(is_fp16, use_int4_w4a16, args.nn_moe)
+        print(f"Start tuning over {len(search_space)} configurations...")
+        start = time.time()
+        configs = _distribute(
+            "tune", [(batch_size, E, shard_intermediate_size, hidden_size,
+                      topk, dtype, use_fp8_w8a8, use_int8_w8a16, use_int4_w4a16, group_size, search_space, args.nn_moe)
+                     for batch_size in batch_sizes])
+        best_configs = {
+            M: sort_config(config)
+            for M, config in zip(batch_sizes, configs)
+        }
+        # The int4 path names its config file after half the shard size.
+        save_size = (shard_intermediate_size // 2
+                     if use_int4_w4a16 else shard_intermediate_size)
+        # use_int4_w4a16 is a required positional parameter of save_configs
+        # and must be passed on both paths.
+        save_configs(best_configs, E, save_size, hidden_size,
+                     topk, dtype, use_fp8_w8a8, use_int8_w8a16,
+                     use_int4_w4a16, use_nn_moe=args.nn_moe)
+        end = time.time()
+        print(f"Tuning took {end - start:.2f} seconds")
+    else:
+        outputs = _distribute(
+            "benchmark", [(batch_size, E, shard_intermediate_size, hidden_size,
+                           topk, dtype, use_fp8_w8a8, use_int8_w8a16, use_int4_w4a16, group_size)
+                          for batch_size in batch_sizes])
+        for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
+            print(f"Batch size: {batch_size}, config: {config}")
+            print(f"Kernel time: {kernel_time:.2f} us")
+if __name__ == "__main__":
+    def _parse_bool_flag(value: str) -> bool:
+        # argparse's `type=bool` calls bool() on the raw string, so any
+        # non-empty value - including "False" - would become True.
+        lowered = value.strip().lower()
+        if lowered in ("1", "true", "t", "yes", "y"):
+            return True
+        if lowered in ("0", "false", "f", "no", "n"):
+            return False
+        raise argparse.ArgumentTypeError(
+            f"expected a boolean value, got {value!r}")
+    parser = FlexibleArgumentParser()
+    # NOTE(review): the default below is a machine-specific path; pass
+    # --model explicitly on any other host.
+    parser.add_argument("--model",
+                        type=str,
+                        default="/home/yang/llm-models/vllm-awq-models/DeepSeek-R1-AWQ/")
+    parser.add_argument("--tp-size",
+                        "-tp",
+                        "--tensor-parallel-size",
+                        type=int,
+                        default=8)
+    parser.add_argument("--dtype",
+                        type=str,
+                        choices=["auto", "fp8_w8a8", "int8_w8a16", "int4_w4a16"],
+                        default="int4_w4a16")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--batch-size", type=int, required=False)
+    parser.add_argument("--tune", action="store_true",default=False)
+    # Accepts `--nn_moe`, `--nn_moe True` and `--nn_moe False`; a dashed
+    # alias is registered to match the other options' spelling.
+    parser.add_argument("--nn-moe",
+                        "--nn_moe",
+                        dest="nn_moe",
+                        type=_parse_bool_flag,
+                        nargs="?",
+                        const=True,
+                        default=False)
+    parser.add_argument("--trust-remote-code", action="store_true", default=True)
+    args = parser.parse_args()
+    main(args)
--- a/setup.py
+++ b/setup.py
@@ -688,7 +688,8 @@ package_data = {
        "model_executor/layers/fused_moe/configs/*.json",
        "model_executor/layers/quantization/utils/configs/*.json",
        "benchmarks/*.py",
-        "model_executor/layers/quantization/configs/w8a8/*.json"
+        "model_executor/layers/quantization/configs/w8a8/*.json",
+        "model_executor/layers/quantization/configs/awq/*.json"
    ]
 }

--- a/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=BW200,dtype=int4_w4a16.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=BW200,dtype=int4_w4a16.json
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4,
+        "num_ldmatrixes": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    }
+}
--- a/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=K100_AI,dtype=int4_w4a16.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=K100_AI,dtype=int4_w4a16.json
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4,
+        "num_ldmatrixes": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 0
+    }
+}
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1066,7 +1066,7 @@ def get_config_dtype_str(dtype: torch.dtype,
    elif use_int8_w8a16:
        return "int8_w8a16"
    elif use_int4_w4a16:
-        return "int4_w8a16"
+        return "int4_w4a16"
    elif dtype == torch.float:
        # avoiding cases where kernel fails when float32 MoE
        # use fp16/bfloat16 configs

--- a/vllm/model_executor/layers/quantization/awq.py
+++ b/vllm/model_executor/layers/quantization/awq.py
@@ -5,7 +5,10 @@ from typing import Any, Dict, List, Optional
 import torch
 import os
 import torch.nn.functional as F
+import vllm.envs as envs
+import json
+import math
+from vllm.platforms import current_platform
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                               UnquantizedLinearMethod)
@@ -13,6 +16,57 @@ from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig)
 from vllm.model_executor.parameter import (GroupQuantScaleParameter,
                                           PackedvLLMParameter)
+from vllm.model_executor.layers.quantization.awq_triton import awq_gemm_triton
+from vllm.logger import init_logger
+# Module-level logger used by the config-loading helpers below.
+logger = init_logger(__name__)
+# Process-wide cache of tuned Triton AWQ GEMM configs. Keys are the strings
+# "{M}_{N}_{K}" and values are dicts of kernel launch parameters; it is filled
+# by get_triton_cache() and read by getspec_config().
+triton_configs_dict={}
+def get_triton_cache(file_path):
+    """Load tuned Triton AWQ GEMM configs from a JSON file into the cache.
+
+    The file maps an outer key (e.g. "1536_7168", matching the "N_K" part of
+    the AWQ_{n}_{k}_*.json file names) to a mapping whose keys are M values
+    and whose values hold the kernel parameters. Each entry is stored in
+    ``triton_configs_dict`` under the key "{M}_{outer key}".
+
+    Args:
+        file_path: Path of the JSON config file to read.
+
+    Returns:
+        None. A missing file is reported with a warning and otherwise ignored.
+
+    Raises:
+        KeyError: If an entry lacks one of the required parameters.
+        json.JSONDecodeError: If the file is not valid JSON.
+    """
+    # Previously a missing file left `cachedata` unbound and the loop below
+    # failed with UnboundLocalError; bail out early instead.
+    if not os.path.exists(file_path):
+        logger.warning("AWQ triton config file %s not found, skipped.",
+                       file_path)
+        return
+    with open(file_path, 'r', encoding="utf-8") as file:
+        cachedata = json.load(file)
+    # Parameters every entry must provide; all are normalised to int.
+    required_fields = ('SPLIT_K', 'BLOCK_SIZE_M', 'BLOCK_SIZE_N',
+                       'BLOCK_SIZE_K', 'GROUP_SIZE_M', 'num_stages',
+                       'num_warps')
+    # Flatten the nested mapping into "{M}_{N}_{K}" -> config entries.
+    for key, value in cachedata.items():
+        for sub_key, sub_value in value.items():
+            configs_value = {
+                field: int(sub_value[field]) for field in required_fields
+            }
+            # num_ldmatrixes is optional and only forwarded when present.
+            if 'num_ldmatrixes' in sub_value:
+                configs_value["num_ldmatrixes"] = int(sub_value['num_ldmatrixes'])
+            triton_configs_dict[f"{sub_key}_{key}"] = configs_value
+    logger.info("%s have loaded!", file_path)
+def default_execution(k, n):
+    """Load the tuned config file for an (n, k) weight shape once.
+
+    The M=1 entry ("1_{n}_{k}") is used as the marker that the shape is
+    already cached. Otherwise the file
+    configs/awq/AWQ_{n}_{k}_{device_name}.json next to this module is loaded
+    when it exists; if it does not exist nothing happens.
+    """
+    if f"1_{n}_{k}" not in triton_configs_dict:
+        module_dir = os.path.dirname(os.path.abspath(__file__))
+        config_dir = f"{module_dir}/configs/awq/"
+        # Spaces in the device name are replaced so it can be used in a
+        # file name.
+        device = current_platform.get_device_name().replace(" ", "_")
+        candidate = os.path.join(config_dir, f"AWQ_{n}_{k}_{device}.json")
+        # Only load when the path is a regular file; the name always ends in
+        # ".json" by construction.
+        if os.path.isfile(candidate):
+            get_triton_cache(candidate)
+def getspec_config(M, N, K):
+    """Return the cached config stored under "{M}_{N}_{K}", or None."""
+    return triton_configs_dict.get(f"{M}_{N}_{K}")
 class AWQShareWorkSpace:
@@ -111,7 +165,6 @@ class AWQLinearMethod(LinearMethodBase):
        self.quant_config = quant_config
        self.awqsingleton= AWQShareWorkSpace()
        self.use_awq_pad = os.environ.get('AWQ_PAD') == '1'
-        self.AWQ_CK_GEMMBS =int(os.getenv('AWQ_CK_GEMMBS', '20000'))
    def create_weights(self, layer: torch.nn.Module,
                       input_size_per_partition: int,
@@ -178,7 +231,9 @@ class AWQLinearMethod(LinearMethodBase):
        layer.register_parameter("qzeros", qzeros)
        layer.register_parameter("scales", scales)
        layer.register_parameter("zeros_and_scales", zeros_and_scales)
+        # 加载triton_config
+        if envs.VLLM_USE_TRITON_AWQ:
+            default_execution(input_size_per_partition,output_size_per_partition)
    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        layer.qweight = torch.nn.Parameter(layer.qweight.data,
@@ -196,7 +251,9 @@ class AWQLinearMethod(LinearMethodBase):
              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        qweight = layer.qweight
        zeros_and_scales = layer.zeros_and_scales
+        qzeros = layer.qzeros
+        scales = layer.scales
+        pack_factor = self.quant_config.pack_factor   
        out_shape = (x.shape[:-1] + (qweight.shape[0] * 1, ))
        reshaped_x = x.reshape(-1, x.shape[-1])
@@ -212,7 +269,14 @@ class AWQLinearMethod(LinearMethodBase):
        else:
            padding_group=0
-        if m <= self.AWQ_CK_GEMMBS: 
+        if envs.VLLM_USE_TRITON_AWQ:
+            if m>16:
+                m = 2 ** math.ceil(math.log2(m))
+            best_config=getspec_config(m,n,k)
+            out = awq_gemm_triton(reshaped_x, qweight, scales, qzeros, pack_factor, best_config)    
+            out_shape = (x.shape[:-1] + (qweight.shape[1] * 8, ))
+        else:
            out = ops.awq_gemm(reshaped_x,
                            qweight,
                            zeros_and_scales,
@@ -223,15 +287,6 @@ class AWQLinearMethod(LinearMethodBase):
                            padding_group,
                            self.awqsingleton.awqworkshapce,
                            self.awqsingleton.awqworkshapcesize)
-        else: 
-            #下面是采用rocblas的做法
-            deqweight=ops.dequant_w4_gemm_colmajor(    # shape[n, k/8] ---> [n,k]
-                            qweight, 
-                            zeros_and_scales,
-                            k+padding_group*self.quant_config.group_size,
-                            n,
-                            self.quant_config.group_size)
-            out=F.linear(reshaped_x, deqweight[:,0:k])    
        if bias is not None:
            out.add_(bias)

--- a/vllm/model_executor/layers/quantization/awq_triton.py
+++ b/vllm/model_executor/layers/quantization/awq_triton.py
@@ -44,10 +44,14 @@ def awq_dequantize_kernel(
    result_masks = result_masks_y[:, None] & result_masks_x[None, :]
    # Load the weights.
-    iweights = tl.load(qweight_ptr + offsets, masks, 0.0)
+    iweights = tl.load(qweight_ptr + offsets, masks)
-    iweights = tl.interleave(iweights, iweights)
-    iweights = tl.interleave(iweights, iweights)
+    iweights =tl.join(iweights, iweights).reshape(iweights.shape[:-1] + [2 * iweights.shape[-1]])
-    iweights = tl.interleave(iweights, iweights)
+    iweights =tl.join(iweights, iweights).reshape(iweights.shape[:-1] + [2 * iweights.shape[-1]])
+    iweights =tl.join(iweights, iweights).reshape(iweights.shape[:-1] + [2 * iweights.shape[-1]])
+    # iweights = tl.interleave(iweights, iweights)
+    # iweights = tl.interleave(iweights, iweights)
+    # iweights = tl.interleave(iweights, iweights)
    # Create reverse AWQ order as tensor: [0, 4, 1, 5, 2, 6, 3, 7]
    # that will map given indices to the correct order.
@@ -73,10 +77,14 @@ def awq_dequantize_kernel(
    zero_masks = zero_masks_y[:, None] & zero_masks_x[None, :]
    # Load the zeros.
-    zeros = tl.load(zeros_ptr + zero_offsets, zero_masks, 0.0)
+    zeros = tl.load(zeros_ptr + zero_offsets, zero_masks)
-    zeros = tl.interleave(zeros, zeros)
+    # zeros = tl.interleave(zeros, zeros)
-    zeros = tl.interleave(zeros, zeros)
+    # zeros = tl.interleave(zeros, zeros)
-    zeros = tl.interleave(zeros, zeros)
+    # zeros = tl.interleave(zeros, zeros)
+    zeros =tl.join(zeros, zeros).reshape(zeros.shape[:-1] + [2 * zeros.shape[-1]])
+    zeros =tl.join(zeros, zeros).reshape(zeros.shape[:-1] + [2 * zeros.shape[-1]])
+    zeros =tl.join(zeros, zeros).reshape(zeros.shape[:-1] + [2 * zeros.shape[-1]])
    zeros = tl.broadcast_to(zeros, (BLOCK_SIZE_Y, BLOCK_SIZE_X * 8))
    # Unpack and reorder: shift out the correct 4-bit value and mask.
@@ -93,7 +101,7 @@ def awq_dequantize_kernel(
    scale_masks = scale_masks_y[:, None] & scale_masks_x[None, :]
    # Load the scales.
-    scales = tl.load(scales_ptr + scale_offsets, scale_masks, 0.0)
+    scales = tl.load(scales_ptr + scale_offsets, scale_masks)
    scales = tl.broadcast_to(scales, (BLOCK_SIZE_Y, BLOCK_SIZE_X * 8))
    # Dequantize.
@@ -108,19 +116,26 @@ def awq_dequantize_kernel(
 def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
                    group_size, BLOCK_SIZE_M: tl.constexpr,
                    BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
-                    SPLIT_K: tl.constexpr):
+                    GROUP_SIZE_M: tl.constexpr,SPLIT_K: tl.constexpr):
    pid = tl.program_id(axis=0)
    pid_z = tl.program_id(1)
-    # NOTE: This doesn't work in TRITON_INTERPRET=1 mode.  Use below instead.
+    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    # num_pid_n = (N + BLOCK_SIZE_N - 1) // BLOCK_SIZE_N
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    if GROUP_SIZE_M == 1:
        pid_m = pid // num_pid_n
        pid_n = pid % num_pid_n
+    else:
-    accumulator_dtype = c_ptr.type.element_ty
+        num_pid_in_group = GROUP_SIZE_M * num_pid_n
+        group_id = pid // num_pid_in_group
+        first_pid_m = group_id * GROUP_SIZE_M
+        group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+        pid_m = first_pid_m + (pid % group_size_m)
+        pid_n = (pid % num_pid_in_group) // group_size_m
+    # accumulator_dtype = c_ptr.type.element_ty
+    BLOCK_SIZE_N_8 = BLOCK_SIZE_N // 8
+    N_8 = N // 8
    # NOTE: This doesn't work in TRITON_INTERPRET=1 mode.  Use below instead.
    # accumulator = tl.arange(0, BLOCK_SIZE_N)
    # accumulator = tl.broadcast_to(accumulator[None, :],
@@ -128,16 +143,16 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
    # accumulator = accumulator & 0x0
    # accumulator = accumulator.to(accumulator_dtype)
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N),
-                           dtype=accumulator_dtype)
+                           dtype=tl.float32)
    # Create reverse AWQ order as tensor: [0, 4, 1, 5, 2, 6, 3, 7]
    # that will map given indices to the correct order.
-    reverse_awq_order_tensor = ((tl.arange(0, 2) * 4)[None, :] +
+    shifts = ((tl.arange(0, 2) * 16)[None, :] +
-                                tl.arange(0, 4)[:, None]).reshape(8)
+                                (tl.arange(0, 4) * 4)[:, None]).reshape(1,8)
    # Create the necessary shifts to use to unpack.
-    shifts = reverse_awq_order_tensor * 4
+    # shifts = reverse_awq_order_tensor * 4
-    shifts = tl.broadcast_to(shifts[None, :],
+    shifts = tl.broadcast_to(shifts,
                             (BLOCK_SIZE_K * (BLOCK_SIZE_N // 8), 8))
    shifts = tl.reshape(shifts, (BLOCK_SIZE_K, BLOCK_SIZE_N))
@@ -145,18 +160,15 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
    offsets_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    masks_am = offsets_am < M
-    offsets_bn = pid_n * (BLOCK_SIZE_N // 8) + tl.arange(0, BLOCK_SIZE_N // 8)
+    offsets_bzn = pid_n * (BLOCK_SIZE_N_8) + tl.arange(0, BLOCK_SIZE_N // 8)
-    masks_bn = offsets_bn < N // 8
+    masks_bzn = offsets_bzn < N_8
-    offsets_zn = pid_n * (BLOCK_SIZE_N // 8) + tl.arange(0, BLOCK_SIZE_N // 8)
-    masks_zn = offsets_zn < N // 8
    offsets_sn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    masks_sn = offsets_sn < N
    offsets_k = pid_z * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
    offsets_a = K * offsets_am[:, None] + offsets_k[None, :]
-    offsets_b = (N // 8) * offsets_k[:, None] + offsets_bn[None, :]
+    offsets_b = (N_8) * offsets_k[:, None] + offsets_bzn[None, :]
    a_ptrs = a_ptr + offsets_a
    b_ptrs = b_ptr + offsets_b
@@ -167,33 +179,40 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K)):
        masks_k = offsets_k < K
        masks_a = masks_am[:, None] & masks_k[None, :]
-        a = tl.load(a_ptrs, mask=masks_a, other=0.0)
+        a = tl.load(a_ptrs, mask=masks_a)
-        masks_b = masks_k[:, None] & masks_bn[None, :]
+        masks_b = masks_k[:, None] & masks_bzn[None, :]
-        b = tl.load(b_ptrs, mask=masks_b, other=0.0)
+        b = tl.load(b_ptrs, mask=masks_b)
-        b = tl.interleave(b, b)
+        # b = tl.interleave(b, b)
-        b = tl.interleave(b, b)
+        # b = tl.interleave(b, b)
-        b = tl.interleave(b, b)
+        # b = tl.interleave(b, b)
+        b =tl.join(b, b).reshape(b.shape[:-1] + [2 * b.shape[-1]])
+        b =tl.join(b, b).reshape(b.shape[:-1] + [2 * b.shape[-1]])
+        b =tl.join(b, b).reshape(b.shape[:-1] + [2 * b.shape[-1]])
        # Dequantize b.
-        offsets_szk = (
+        offsets_szk = (BLOCK_SIZE_K * SPLIT_K * k + pid_z * BLOCK_SIZE_K) // group_size 
-            (BLOCK_SIZE_K * SPLIT_K * k + pid_z * BLOCK_SIZE_K) // group_size +
+        offsets_szk = offsets_szk + (tl.arange(0,BLOCK_SIZE_K) // group_size) 
-            tl.arange(0, 1))
+        offsets_z = (N_8) * offsets_szk[:, None] + offsets_bzn[None, :]
-        offsets_z = (N // 8) * offsets_szk[:, None] + offsets_zn[None, :]
        masks_zk = offsets_szk < K // group_size
-        masks_z = masks_zk[:, None] & masks_zn[None, :]
+        masks_z = masks_zk[:, None] & masks_bzn[None, :]
        zeros_ptrs = zeros_ptr + offsets_z
-        zeros = tl.load(zeros_ptrs, mask=masks_z, other=0.0)
+        zeros = tl.load(zeros_ptrs, mask=masks_z)
-        zeros = tl.interleave(zeros, zeros)
+        # zeros = tl.interleave(zeros, zeros)
-        zeros = tl.interleave(zeros, zeros)
+        # zeros = tl.interleave(zeros, zeros)
-        zeros = tl.interleave(zeros, zeros)
+        # zeros = tl.interleave(zeros, zeros)
+        zeros =tl.join(zeros, zeros).reshape(zeros.shape[:-1] + [2 * zeros.shape[-1]])
+        zeros =tl.join(zeros, zeros).reshape(zeros.shape[:-1] + [2 * zeros.shape[-1]])
+        zeros =tl.join(zeros, zeros).reshape(zeros.shape[:-1] + [2 * zeros.shape[-1]])
        zeros = tl.broadcast_to(zeros, (BLOCK_SIZE_K, BLOCK_SIZE_N))
        offsets_s = N * offsets_szk[:, None] + offsets_sn[None, :]
        masks_sk = offsets_szk < K // group_size
        masks_s = masks_sk[:, None] & masks_sn[None, :]
        scales_ptrs = scales_ptr + offsets_s
-        scales = tl.load(scales_ptrs, mask=masks_s, other=0.0)
+        scales = tl.load(scales_ptrs, mask=masks_s)
        scales = tl.broadcast_to(scales, (BLOCK_SIZE_K, BLOCK_SIZE_N))
        b = (b >> shifts) & 0xF
@@ -202,18 +221,20 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K,
        b = b.to(c_ptr.type.element_ty)
        # Accumulate results.
-        accumulator = tl.dot(a, b, accumulator, out_dtype=accumulator_dtype)
+        accumulator = tl.dot(a, b, accumulator, out_dtype=tl.float32)
        offsets_k += BLOCK_SIZE_K * SPLIT_K
        a_ptrs += BLOCK_SIZE_K * SPLIT_K
-        b_ptrs += BLOCK_SIZE_K * SPLIT_K * (N // 8)
+        b_ptrs += BLOCK_SIZE_K * SPLIT_K * (N_8)
    c = accumulator.to(c_ptr.type.element_ty)
-    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    c_ptrs = c_ptr + N * offsets_am[:, None] + offsets_sn[None, :]
-    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    c_mask = masks_am[:, None] & masks_sn[None, :]
-    c_ptrs = c_ptr + pid_z * N * M + N * offs_cm[:, None] + offs_cn[None, :]
+    if SPLIT_K == 1:
-    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
        tl.store(c_ptrs, c, mask=c_mask)
+        # tl.store(c_ptrs, c)
+    else:
+        tl.atomic_add(c_ptrs, c, mask=c_mask)
 # qweights - [K     , M // 8], int32
@@ -272,9 +293,7 @@ def awq_gemm_triton(input: torch.Tensor,
                    scales: torch.Tensor,
                    qzeros: torch.Tensor,
                    split_k_iters: int,
-                    block_size_m: int = 32,
+                    config) -> torch.Tensor:
-                    block_size_n: int = 32,
-                    block_size_k: int = 32) -> torch.Tensor:
    M, K = input.shape
    N = qweight.shape[1] * 8
    group_size = qweight.shape[0] // qzeros.shape[0]
@@ -289,14 +308,16 @@ def awq_gemm_triton(input: torch.Tensor,
    assert group_size in AWQ_TRITON_SUPPORTED_GROUP_SIZES or group_size == K
    grid = lambda META: (
-        triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
+        triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']),
-            N, META['BLOCK_SIZE_N']),
+        META['SPLIT_K'],
-        split_k_iters,
    )
+    if config is None:
+        config= {'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8,'SPLIT_K': 8}
+        if M >256:
+        #print("INFO:this size not found in json.")
+            config= {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8,'SPLIT_K': 1}
-    result = torch.zeros((split_k_iters, M, N),
+    result = torch.zeros((M, N), dtype=scales.dtype, device=input.device)
-                         dtype=scales.dtype,
-                         device=input.device)
    # A = input, B = qweight, C = result
    # A = M x K, B = K x N, C = M x N
@@ -309,11 +330,5 @@ def awq_gemm_triton(input: torch.Tensor,
                        N,
                        K,
                        group_size,
-                          BLOCK_SIZE_M=block_size_m,
+                        **config)
-                          BLOCK_SIZE_N=block_size_n,
-                          BLOCK_SIZE_K=block_size_k,
-                          SPLIT_K=split_k_iters)
-    result = result.sum(0)
    return result
--- a/vllm/model_executor/layers/quantization/configs/awq/AWQ_1536_7168_BW200.json
+++ b/vllm/model_executor/layers/quantization/configs/awq/AWQ_1536_7168_BW200.json
+{
+    "1536_7168": {
+        "1": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "2": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "3": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "4": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "5": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "6": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "7": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "8": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "9": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "10": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "11": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "12": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "13": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "14": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "15": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "16": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "32": {
+            "BLOCK_SIZE_M": 32,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "64": {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "128": {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "256": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "512": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "1024": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 2,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "2048": {
+            "BLOCK_SIZE_M": 256,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 2,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "4096": {
+            "BLOCK_SIZE_M": 256,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        }
+    }
+}
\ No newline at end of file
--- a/vllm/model_executor/layers/quantization/configs/awq/AWQ_1536_7168_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/awq/AWQ_1536_7168_K100_AI.json
+{
+    "1536_7168": {
+        "1": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "2": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "3": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "4": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "5": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "6": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "7": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "8": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "9": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "10": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "11": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "12": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "13": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "14": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "15": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "16": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "32": {
+            "BLOCK_SIZE_M": 32,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "64": {
+            "BLOCK_SIZE_M": 32,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 0
+        },
+        "128": {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "256": {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "512": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "1024": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 2,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "2048": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "4096": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        }
+    }
+}
\ No newline at end of file
--- a/vllm/model_executor/layers/quantization/configs/awq/AWQ_3072_1536_BW200.json
+++ b/vllm/model_executor/layers/quantization/configs/awq/AWQ_3072_1536_BW200.json
+{
+    "3072_1536": {
+        "1": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "2": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "3": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "4": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "5": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "6": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "7": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "8": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "9": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "10": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "11": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "12": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "13": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "14": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "15": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "16": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "32": {
+            "BLOCK_SIZE_M": 32,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "64": {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 4,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "128": {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 2,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "256": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "512": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "1024": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "2048": {
+            "BLOCK_SIZE_M": 256,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "4096": {
+            "BLOCK_SIZE_M": 256,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        }
+    }
+}
\ No newline at end of file
--- a/vllm/model_executor/layers/quantization/configs/awq/AWQ_3072_1536_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/awq/AWQ_3072_1536_K100_AI.json
+{
+    "3072_1536": {
+        "1": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "2": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "3": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "4": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "5": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "6": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "7": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "8": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "9": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "10": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "11": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "12": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "13": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "14": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "15": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "16": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "32": {
+            "BLOCK_SIZE_M": 32,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "64": {
+            "BLOCK_SIZE_M": 32,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 0
+        },
+        "128": {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "256": {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "512": {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 1,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "1024": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "2048": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "4096": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        }
+    }
+}
\ No newline at end of file
--- a/vllm/model_executor/layers/quantization/configs/awq/AWQ_4096_512_BW200.json
+++ b/vllm/model_executor/layers/quantization/configs/awq/AWQ_4096_512_BW200.json
+{
+    "4096_512": {
+        "1": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "2": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "3": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "4": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "5": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "6": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "7": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "8": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "9": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "10": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "11": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "12": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "13": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "14": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "15": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "16": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "32": {
+            "BLOCK_SIZE_M": 32,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "64": {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "128": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "256": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "512": {
+            "BLOCK_SIZE_M": 256,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "1024": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "2048": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "4096": {
+            "BLOCK_SIZE_M": 256,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        }
+    }
+}
\ No newline at end of file
--- a/vllm/model_executor/layers/quantization/configs/awq/AWQ_4096_512_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/awq/AWQ_4096_512_K100_AI.json
+{
+    "4096_512": {
+        "1": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "2": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "3": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "4": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "5": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "6": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "7": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "8": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "9": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "10": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "11": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "12": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "13": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "14": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "15": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "16": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "32": {
+            "BLOCK_SIZE_M": 32,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 2,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "64": {
+            "BLOCK_SIZE_M": 32,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "128": {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "256": {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 1,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "512": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "1024": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "2048": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "4096": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        }
+    }
+}
\ No newline at end of file
--- a/vllm/model_executor/layers/quantization/configs/awq/AWQ_4608_7168_BW200.json
+++ b/vllm/model_executor/layers/quantization/configs/awq/AWQ_4608_7168_BW200.json
+{
+    "4608_7168": {
+        "1": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 2,
+            "num_ldmatrixes": 1
+        },
+        "2": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 2,
+            "num_ldmatrixes": 1
+        },
+        "3": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 2,
+            "num_ldmatrixes": 1
+        },
+        "4": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 2,
+            "num_ldmatrixes": 1
+        },
+        "5": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 2,
+            "num_ldmatrixes": 1
+        },
+        "6": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 2,
+            "num_ldmatrixes": 1
+        },
+        "7": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 2,
+            "num_ldmatrixes": 1
+        },
+        "8": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 2,
+            "num_ldmatrixes": 1
+        },
+        "9": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 2,
+            "num_ldmatrixes": 1
+        },
+        "10": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 2,
+            "num_ldmatrixes": 1
+        },
+        "11": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 2,
+            "num_ldmatrixes": 1
+        },
+        "12": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 2,
+            "num_ldmatrixes": 1
+        },
+        "13": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 2,
+            "num_ldmatrixes": 1
+        },
+        "14": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 2,
+            "num_ldmatrixes": 1
+        },
+        "15": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 2,
+            "num_ldmatrixes": 1
+        },
+        "16": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 2,
+            "num_ldmatrixes": 1
+        },
+        "32": {
+            "BLOCK_SIZE_M": 32,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "64": {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "128": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "256": {
+            "BLOCK_SIZE_M": 256,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 2,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "512": {
+            "BLOCK_SIZE_M": 256,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "1024": {
+            "BLOCK_SIZE_M": 256,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "2048": {
+            "BLOCK_SIZE_M": 256,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "4096": {
+            "BLOCK_SIZE_M": 256,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        }
+    }
+}
\ No newline at end of file
--- a/vllm/model_executor/layers/quantization/configs/awq/AWQ_4608_7168_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/awq/AWQ_4608_7168_K100_AI.json
+{
+    "4608_7168": {
+        "1": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "2": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "3": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "4": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "5": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "6": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "7": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "8": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "9": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "10": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "11": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "12": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "13": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "14": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "15": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "16": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "32": {
+            "BLOCK_SIZE_M": 32,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "64": {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 4,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "128": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "256": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "512": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 2,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "1024": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "2048": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "4096": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        }
+    }
+}
\ No newline at end of file
--- a/vllm/model_executor/layers/quantization/configs/awq/AWQ_512_7168_BW200.json
+++ b/vllm/model_executor/layers/quantization/configs/awq/AWQ_512_7168_BW200.json
+{
+    "512_7168": {
+        "1": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "2": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "3": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "4": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "5": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "6": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "7": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "8": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "9": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "10": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "11": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "12": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "13": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "14": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "15": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "16": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "32": {
+            "BLOCK_SIZE_M": 32,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "64": {
+            "BLOCK_SIZE_M": 32,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "128": {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "256": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "512": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 4,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "1024": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "2048": {
+            "BLOCK_SIZE_M": 256,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 2,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "4096": {
+            "BLOCK_SIZE_M": 256,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1   
+        }     
+    }
+}
\ No newline at end of file
--- a/vllm/model_executor/layers/quantization/configs/awq/AWQ_512_7168_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/awq/AWQ_512_7168_K100_AI.json
+{
+    "512_7168": {
+        "1": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "2": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "3": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "4": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "5": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "6": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "7": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "8": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "9": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "10": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "11": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "12": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "13": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "14": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "15": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "16": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "32": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "64": {
+            "BLOCK_SIZE_M": 32,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "128": {
+            "BLOCK_SIZE_M": 32,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "256": {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "512": {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "1024": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "2048": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 2,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "4096": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 2,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        }
+    }
+}
\ No newline at end of file
--- a/vllm/model_executor/layers/quantization/configs/awq/AWQ_576_7168_BW200.json
+++ b/vllm/model_executor/layers/quantization/configs/awq/AWQ_576_7168_BW200.json
+{
+    "576_7168": {
+        "1": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "2": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "3": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "4": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "5": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "6": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "7": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "8": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "9": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "10": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "11": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "12": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "13": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "14": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "15": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "16": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "32": {
+            "BLOCK_SIZE_M": 32,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "64": {
+            "BLOCK_SIZE_M": 32,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "128": {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "256": {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "512": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 4,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "1024": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "2048": {
+            "BLOCK_SIZE_M": 256,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 2,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "4096": {
+            "BLOCK_SIZE_M": 256,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        }        
+    }
+}
\ No newline at end of file
--- a/vllm/model_executor/layers/quantization/configs/awq/AWQ_576_7168_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/awq/AWQ_576_7168_K100_AI.json
+{
+    "576_7168": {
+        "1": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "2": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "3": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "4": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "5": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "6": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "7": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "8": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "9": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "10": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "11": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "12": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "13": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "14": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "15": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "16": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "32": {
+            "BLOCK_SIZE_M": 32,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "64": {
+            "BLOCK_SIZE_M": 32,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "128": {
+            "BLOCK_SIZE_M": 32,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "256": {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "512": {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "1024": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "2048": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "4096": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 2,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        }
+    }
+}
\ No newline at end of file
--- a/vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_2048_BW200.json
+++ b/vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_2048_BW200.json
+{
+    "7168_2048": {
+        "1": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "2": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "3": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "4": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "5": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "6": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "7": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "8": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "9": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "10": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "11": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "12": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "13": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "14": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "15": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "16": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "32": {
+            "BLOCK_SIZE_M": 32,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "64": {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 2,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "128": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 0,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "256": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "512": {
+            "BLOCK_SIZE_M": 256,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "1024": {
+            "BLOCK_SIZE_M": 256,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "2048": {
+            "BLOCK_SIZE_M": 256,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "4096": {
+            "BLOCK_SIZE_M": 256,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        }
+    }
+}
\ No newline at end of file