增加线性int8 gemm配置

268d8a77 · gaoqiong · 92545504 · 268d8a77
Commit 268d8a77 authored Mar 14, 2025 by gaoqiong
Hide whitespace changes
Inline Side-by-side

Showing with 49 additions and 13 deletions

vllm/model_executor/layers/quantization/utils/int8_utils.py vllm/model_executor/layers/quantization/utils/int8_utils.py +49 -13

No files found.
--- a/vllm/model_executor/layers/quantization/utils/int8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/int8_utils.py
 # SPDX-License-Identifier: Apache-2.0

-# Adapted from https://github.com/sgl-project/sglang/pull/3730
 import functools
 import json
 import logging
@@ -336,23 +335,60 @@ def w8a8_block_int8_matmul(
    C_shape = A.shape[:-1] + (N,)
    C = A.new_empty(C_shape, dtype=output_dtype)

-    configs = get_w8a8_block_int8_configs(N, K, block_size[0], block_size[1])
-    if configs:
-        # If an optimal configuration map has been found, look up the
-        # optimal config
-        config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
-    else:
+    #configs = get_w8a8_block_int8_configs(N, K, block_size[0], block_size[1])
+    #if configs:
+    #    # If an optimal configuration map has been found, look up the
+    #    # optimal config
+    #    config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
+    #else:
        # Default config
        # Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1]
        #print("block_size[0]:{},block_size[1]:{}".format(block_size[0],block_size[1]))
+    #    config = {
+    #        "BLOCK_SIZE_M": 32, #64
+    #        "BLOCK_SIZE_N": block_size[0],
+    #        "BLOCK_SIZE_K": block_size[1],
+    #        "GROUP_SIZE_M": 32,
+    #        "num_warps": 4,
+    #        "num_stages": 3,
+    #    }
+
+    if M<=64:
        config = {
-            "BLOCK_SIZE_M": 32, #64
-            "BLOCK_SIZE_N": block_size[0],
-            "BLOCK_SIZE_K": block_size[1],
-            "GROUP_SIZE_M": 32,
+            "BLOCK_SIZE_M": 16, #64
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 128,
+            "GROUP_SIZE_M": 2,
            "num_warps": 4,
-            "num_stages": 3,
+            "num_stages": 0,
        }
+    elif M<128:
+        config = {
+            "BLOCK_SIZE_M": 32, #64
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 128,
+            "GROUP_SIZE_M": 2,
+            "num_warps": 4,
+            "num_stages": 0,
+        }   
+    elif M<=256:
+        config = {
+            "BLOCK_SIZE_M": 64, #64
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 128,
+            "GROUP_SIZE_M": 2,
+            "num_warps": 4,
+            "num_stages": 0,
+        }                     
+    else :
+        config = {
+            "BLOCK_SIZE_M": 64, #64
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 128,
+            "GROUP_SIZE_M": 8,
+            "num_warps": 8,
+            "num_stages": 0,
+        }     

    def grid(META):
        return (
@@ -514,4 +550,4 @@ def block_dequant(
                i * block_k : min((i + 1) * block_k, k),
            ] *= x_s[j][i]

-    return x_dq_block
\ No newline at end of file
+    return x_dq_block