增加w8a8相关修改

083b80ea · zhuwenwen · 09428eec · 083b80ea · 083b80ea · 083b80ea
Commit 083b80ea authored Jan 16, 2025 by zhuwenwen
20 changed files
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_4096_4096_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_4096_4096_K100_AI.json
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_4608_3584_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_4608_3584_K100_AI.json
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_5120_13824_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_5120_13824_K100_AI.json
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_5120_2560_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_5120_2560_K100_AI.json
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_5120_5120_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_5120_5120_K100_AI.json
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_5120_6912_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_5120_6912_K100_AI.json
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_5120_8192_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_5120_8192_K100_AI.json
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_6144_4096_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_6144_4096_K100_AI.json
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_7168_8192_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_7168_8192_K100_AI.json
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_7680_5120_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_7680_5120_K100_AI.json
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_8192_1024_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_8192_1024_K100_AI.json
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_8192_14336_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_8192_14336_K100_AI.json
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_8192_2048_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_8192_2048_K100_AI.json
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_8192_3584_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_8192_3584_K100_AI.json
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_8192_4096_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_8192_4096_K100_AI.json
--- a/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_8192_7168_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/w8a8/W8A8_8192_7168_K100_AI.json
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -238,27 +238,40 @@ def apply_int8_linear(
        m=x_q.shape[0]
        k=x_q.shape[1]
        n=weight.shape[1]
-        if f"{m}_{n}_{k}" in  W8A8_TRITONJSON.triton_json_dict[0]:
+        #print("m:{},n:{},k:{}".format(m,n,k))
-            best_config=W8A8_TRITONJSON.triton_json_dict[0][f"{m}_{n}_{k}"]
-            #print("json files:",best_config)
+        if f"1_{n}_{k}" in  W8A8_TRITONJSON.triton_json_dict[0]:
-        elif f"1_{n}_{k}" in  W8A8_TRITONJSON.triton_json_dict[0]:
+            if m<=16:
-            if m<64:
+                m_=m
-                m_= 32
+                #best_config=W8A8_TRITONJSON.triton_json_dict[0][f"{m}_{n}_{k}"]
-            elif m<128:
+            elif m<=64:
-                m_=64
+                m_= (m + 3) & -4 # round m up to the next multiple of 4
-            elif m<256:
+            elif m<=160:
-                m_=128
+                m_=(m + 7) & -8
-            elif m<512:
+            elif m<200: #256
+                m_=160
+            elif m<480: #512
                m_=256
-            elif m<1024:
+            elif m<960: #1024
                m_=512
-            else:
+            elif m<2048:
                m_=1024
+            elif m<4096:
+                m_=2048
+            elif m<6000:
+                m_=4096
+            else:
+                m_=8192  
            best_config=W8A8_TRITONJSON.triton_json_dict[0][f"{m_}_{n}_{k}"]
        else: 
            best_config=None
-            print("config not found!")
+         # if best_config==None:
+        #     print("m:{},n:{},k:{}".format(m,n,k))
+        #     print("config not found!")
        return ops.triton_scaled_mm(x_q,
                                weight,

--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -462,7 +462,7 @@ class FalconForCausalLM(nn.Module, SupportsPP):
        self.use_gemm_pad = os.environ.get('GEMM_PAD') == '1'
        self.use_fa_pad = os.environ.get('FA_PAD') == '1'
        self.use_awq_pad = os.environ.get('AWQ_PAD') == '1'
-        self.w8a8_strategy=int(os.getenv('W8A8_SUPPORT_METHODS', '0'))
+        self.w8a8_strategy=int(os.getenv('W8A8_SUPPORT_METHODS', '1'))
    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.transformer.get_input_embeddings(input_ids)

--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -355,7 +355,7 @@ class LlamaModel(nn.Module):
        self.use_gemm_pad = os.environ.get('GEMM_PAD') == '1'
        self.use_fa_pad = os.environ.get('FA_PAD') == '1'
        self.use_awq_pad = os.environ.get('AWQ_PAD') == '1'
-        self.w8a8_strategy=int(os.getenv('W8A8_SUPPORT_METHODS', '0'))
+        self.w8a8_strategy=int(os.getenv('W8A8_SUPPORT_METHODS', '1'))
    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.embed_tokens(input_ids)
@@ -553,6 +553,10 @@ class LlamaModel(nn.Module):
                if matches and "scale" not in layername:
                    weight_data =params_dict[layername]
                    n=weight_data.shape[0]
+                    # k=weight_data.shape[1]
+                    # #Check whether the current size is within the optimized range: if it is, use triton; otherwise use rocblas
+                    # json_file=self.tritonsingleton.get_w8a8json_name(n,k)
                    #rocblas and cutlass currently both require the weight to be processed, but triton does not
                    if self.w8a8_strategy!=1:

--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -913,12 +913,13 @@ class QWenBaseModel(nn.Module, SupportsPP, SupportsLoRA):
        if quant_config is not None:
            self.quant_method=quant_config.get_name()
            self.quant_config=quant_config
+        self.tritonsingleton= W8a8GetCacheJSON()
        self.use_llama_nn = os.environ.get('LLAMA_NN') == '1'
        self.use_gemm_pad = os.environ.get('GEMM_PAD') == '1'
        self.use_fa_pad = os.environ.get('FA_PAD') == '1'
        self.use_awq_pad = os.environ.get('AWQ_PAD') == '1'
-        self.w8a8_strategy=int(os.getenv('W8A8_SUPPORT_METHODS', '0'))
+        self.w8a8_strategy=int(os.getenv('W8A8_SUPPORT_METHODS', '1'))
    def _get_image_input_type(
            self,
@@ -1147,7 +1148,7 @@ class QWenBaseModel(nn.Module, SupportsPP, SupportsLoRA):
                    m=int(key.split('_')[0])
                    n=int(key.split('_')[1])
                    k=int(key.split('_')[2])
-                    ops._int8_gemm_helper(m=m,n=n,k=k,per_token_act_quant=True,per_out_channel_weight_quant=True,use_bias=False,best_config=value)
+                    ops.triton_int8_gemm_helper(m=m,n=n,k=k,per_token_act_quant=True,per_out_channel_weight_quant=True,use_bias=False,best_config=value)
        return loaded_params