修复channelwise w8a8 预热没有区分卡造成的triton kernel崩溃问题，另外增加block-int8的moe 拿到config步骤

c0c0eb69 · gaoqiong · 9d5187eb · c0c0eb69 · c0c0eb69 · c0c0eb69
Commit c0c0eb69 authored Jun 30, 2025 by gaoqiong
4 changed files
--- a/vllm/model_executor/layers/quantization/blockwise_int8.py
+++ b/vllm/model_executor/layers/quantization/blockwise_int8.py
@@ -231,8 +231,8 @@ class BlockInt8LinearMethod(LinearMethodBase):
        n=layer.weight.shape[0]
        k=layer.weight.shape[1]
-        if {n,k} not in self.tritonsingleton.weight_shapes:
+        if [n,k] not in self.tritonsingleton.weight_shapes:
-            self.tritonsingleton.weight_shapes.append({n,k})
+            self.tritonsingleton.weight_shapes.append([n,k])
            json_file=self.tritonsingleton.get_blockint8json_name(n,k,self.block_size[0],self.block_size[1])
            configs_dict=self.tritonsingleton.get_blockint8_triton_cache(json_file,n,k,self.block_size[0],self.block_size[1])
@@ -260,7 +260,6 @@ class BlockInt8LinearMethod(LinearMethodBase):
        K=x.shape[1]
        N=layer.weight.shape[0]
-        #print("self.tritonsingleton.triton_json_dict:",self.tritonsingleton.triton_json_dict)
        #Get the best config options
        if len(self.tritonsingleton.triton_json_dict)==0:
            config=None
@@ -293,8 +292,6 @@ class BlockInt8LinearMethod(LinearMethodBase):
        else: 
            config=None   
-        #print("m:{},n:{},k:{},config:{}".format(M,N,K,config))
        return apply_w8a8_block_int8_linear(
            input=x,
            weight=layer.weight,
@@ -431,6 +428,26 @@ class BlockInt8MoEMethod:
    def process_weights_after_loading(self, layer: Module) -> None:
        # Block quant doesn't need to process weights after loading
        # warmup and get moe block-int8 config
+        # Shape key for this MoE layer, read from the w13 / w2 expert weights.
+        E=layer.w13_weight.shape[0]
+        N1=layer.w13_weight.shape[1]
+        N2=layer.w2_weight.shape[1]
+        K=layer.w2_weight.shape[2]
+        # Record each distinct MoE shape once on the shared singleton.
+        if [E,N1,N2,K] not in self.tritonsingleton.moe_weight_shapes:
+            self.tritonsingleton.moe_weight_shapes.append([E,N1,N2,K])
+        TOPK= self.tritonsingleton.topk
+        block_size=self.quant_config.weight_block_size
+        json_file=self.tritonsingleton.get_moeblockint8json_name(block_size,E,N1,N2,K,TOPK)
+        configs_dict=self.tritonsingleton.get_moeblockint8_triton_cache(json_file,block_size,E,N1,N2,K,TOPK)
+        # The loader returns None when no tuned JSON exists for this shape;
+        # only merge configs that were actually found.
+        if configs_dict:
+            self.tritonsingleton.triton_moejson_dict.update(configs_dict)
+        # Generate the model config file. gen_model_json's signature is
+        # (E, block_size=None), so both arguments must be passed; passing
+        # block_size alone binds it to E and drops the block size.
+        self.tritonsingleton.gen_model_json(E,block_size)
        return
    def apply(

--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -597,8 +597,8 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
        k=layer.weight.shape[1]
        if self.w8a8_strategy==1:
-            if {n,k} not in self.tritonsingleton.weight_shapes:
+            if [n,k] not in self.tritonsingleton.weight_shapes:
-                self.tritonsingleton.weight_shapes.append({n,k})
+                self.tritonsingleton.weight_shapes.append([n,k])
                json_file=self.tritonsingleton.get_w8a8json_name(n,k)
                configs_dict=self.tritonsingleton.get_triton_cache(json_file,n,k)
@@ -607,7 +607,7 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
                    for key, value in configs_dict.items():
                        m=int(key.split('_')[0])
-                        ops.triton_int8_gemm_helper(m=m,n=n,k=k,per_token_act_quant=True,per_out_channel_weight_quant=True,use_bias=False,best_config=value)
+                        ops.triton_int8_gemm_helper(m=m,n=n,k=k,per_token_act_quant=True,per_out_channel_weight_quant=True,use_bias=False,device=layer.weight.device,best_config=value)
        else: 
            weight_data=layer.weight.data
            _weight=weight_data.T.contiguous().reshape(n,-1)

--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -60,6 +60,7 @@ from .utils import (PPMissingLayer, is_pp_missing_parameter,
                    make_empty_intermediate_tensors_factory, make_layers,
                    maybe_prefix)
 from vllm import _custom_ops as ops
+from vllm.utils import W8a8GetCacheJSON
 class DeepseekV2MLP(nn.Module):
@@ -727,6 +728,9 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
            self.model.make_empty_intermediate_tensors)
        self.use_llama_nn = os.environ.get('LLAMA_NN') == '1'
        self.use_awq_pad = os.environ.get('AWQ_PAD') == '1'
+        self.tritonsingleton= W8a8GetCacheJSON() 
+        self.tritonsingleton.topk = config.num_experts_per_tok
+        self.tritonsingleton.quant_method=self.quant_method 
    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.model.get_input_embeddings(input_ids)

--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1872,7 +1872,6 @@ class AtomicCounter:
    def value(self):
        return self._value
 class W8a8GetCacheJSON:
    _instance = None
@@ -1883,13 +1882,68 @@ class W8a8GetCacheJSON:
        return cls._instance
    def _initialize(self):
+        from vllm.platforms import current_platform
        current_folder_path = os.path.dirname(os.path.abspath(__file__))
        json_folder_path=current_folder_path+'/../lmslim/configs/w8a8'
        self.triton_json_dir=(os.getenv('TRITON_JSON_DIR', json_folder_path))
        self.triton_json_dict={}
+        self.triton_moejson_dict={}
        self.triton_json_list=[]
        self.weight_shapes=[]
+        self.moe_weight_shapes=[]
+        device_name = current_platform.get_device_name().replace(" ", "_")
+        if 'K100_AI' in device_name and torch.cuda.get_device_properties(torch.cuda.current_device()).multi_processor_count == 120:
+            device_name='K100_AI_120'
+        self.device_name=device_name
+        self.topk=1
+        self.quant_method=None
+    # Writes the shapes collected so far to model.json (this is a regular
+    # method that callers invoke explicitly, not a destructor).
+    def gen_model_json(self,E:int,block_size:Optional[list]=None):
+        """Dump the recorded linear / MoE weight shapes to ``model.json``.
+
+        The file is written into the directory named by the
+        ``LMSLIM_TUNING_JSON`` environment variable. Nothing is written when
+        the variable is unset or the path does not exist.
+
+        Args:
+            E: Not used when building the file. A caller in this change set
+                passes the block size in this position, so a list/tuple
+                given here is treated as ``block_size`` when ``block_size``
+                itself is omitted.
+            block_size: Value stored under
+                ``quantization_config.weight_block_size``; the string
+                ``"None"`` is stored when no block size is available.
+        """
+        if block_size is None and isinstance(E, (list, tuple)):
+            block_size = list(E)
+        json_dir = os.getenv('LMSLIM_TUNING_JSON', "None")
+        # Compare by value: an identity test against a str literal depends on
+        # interning and raises a SyntaxWarning on Python 3.8+.
+        if json_dir != "None" and os.path.exists(json_dir):
+            # Generate the model config file.
+            logger.info("model_tuning.json is at LMSLIM_TUNING_JSON:%s", json_dir)
+            # Each MoE shape is stored as [E, N1, N2, K]; entries of any
+            # other length are skipped.
+            moe_shapes = [
+                {
+                    "E": shape[0],
+                    "N1": shape[1],
+                    "N2": shape[2],
+                    "K": shape[3],
+                }
+                for shape in self.moe_weight_shapes
+                if len(shape) == 4
+            ]
+            config = {
+                "layers": {
+                    "linear": {
+                        "shapes": list(self.weight_shapes),
+                        "m_range": "None",
+                    },
+                    "moe": {
+                        "shapes": moe_shapes,
+                        "m_range": "None",
+                        "topk": self.topk,
+                    },
+                },
+                "quantization_config": {
+                    "quant_method": self.quant_method,
+                    "weight_block_size": "None" if block_size is None else block_size,
+                },
+            }
+            with open(os.path.join(json_dir, "model.json"), 'w') as f:
+                json.dump(config, f, indent=4)
+        else:
+            logger.info("LMSLIM_TUNING_JSON is not set")
    def getspec_config(self,configs_dict,M,N,K):
        if f"{M}_{N}_{K}" in configs_dict:
@@ -1913,24 +1967,11 @@ class W8a8GetCacheJSON:
        for key, value in cachedata.items():
            for sub_key, sub_value in value.items():
                configs_key= f"{sub_key}_{key}"
-                configs_value={
+                configs_dict[configs_key]=sub_value
-                    'SPLIT_K': int(sub_value["SPLIT_K"]),
-                    'BLOCK_SIZE_M': int(sub_value["BLOCK_SIZE_M"]),
-                    'BLOCK_SIZE_N': int(sub_value["BLOCK_SIZE_N"]),
-                    'BLOCK_SIZE_K': int(sub_value["BLOCK_SIZE_K"]),
-                    'GROUP_SIZE_M': int(sub_value["GROUP_SIZE_M"]),
-                    'num_stages':int(sub_value['num_stages']),
-                    'num_warps':int(sub_value['num_warps'])
-                }
-                configs_dict[configs_key]=configs_value
        return configs_dict
    def get_w8a8json_name(self,n,k):
-        from vllm.platforms import current_platform
+        return self.triton_json_dir+f"/W8A8_{n}_{k}_{self.device_name}.json"
-        device_name = current_platform.get_device_name().replace(" ", "_")
-        if 'K100_AI' in device_name and torch.cuda.get_device_properties(torch.cuda.current_device()).multi_processor_count == 120:
-            device_name='K100_AI_120'
-        return self.triton_json_dir+f"/W8A8_{n}_{k}_{device_name}.json"
    def get_blockint8_triton_cache(self,file_path,n,k,block_n,block_k):
        cache_json_file=file_path
@@ -1947,26 +1988,33 @@ class W8a8GetCacheJSON:
        for key, value in cachedata.items():
            for sub_key, sub_value in value.items():
                configs_key= f"{sub_key}_{key}"
-                configs_value={
+                configs_dict[configs_key]=sub_value
-                    'BLOCK_SIZE_M': int(sub_value["BLOCK_SIZE_M"]),
-                    'BLOCK_SIZE_N': int(sub_value["BLOCK_SIZE_N"]),
-                    'BLOCK_SIZE_K': int(sub_value["BLOCK_SIZE_K"]),
-                    'GROUP_SIZE_M': int(sub_value["GROUP_SIZE_M"]),
-                    'kpack': int(sub_value["kpack"]),
-                    'num_stages':int(sub_value['num_stages']),
-                    'num_warps':int(sub_value['num_warps']),
-                    'enable_mmacfuse':int(sub_value['enable_mmacfuse']),
-                }
-                configs_dict[configs_key]=configs_value
        return configs_dict
    def get_blockint8json_name(self,n,k,block_n,block_k):
-        from vllm.platforms import current_platform
+        return self.triton_json_dir+f"/linear_{n}_{k}_block[{block_n},{block_k}]_{self.device_name}.json"
-        device_name = current_platform.get_device_name().replace(" ", "_")
-        if 'K100_AI' in device_name and torch.cuda.get_device_properties(torch.cuda.current_device()).multi_processor_count == 120:
+    def get_moeblockint8json_name(self,block_size,E,N1,N2,K,TOPK):
-            device_name='K100_AI_120'
+        return self.triton_json_dir+f"/MOE_BLOCKINT8[{block_size[0]},{block_size[1]}]_E={E}_N1={N1}_N2={N2}_K={K}_TOPK{TOPK}_{self.device_name}.json"
-        return self.triton_json_dir+f"/linear_{n}_{k}_block[{block_n},{block_k}]_{device_name}.json"
+    def get_moeblockint8_triton_cache(self,file_path,block_size,E,N1,N2,K,TOPK):
+        """Load a tuned MoE block-int8 config file and flatten it.
+
+        The JSON is expected to be a two-level mapping. Every inner entry is
+        re-keyed as ``"<inner key>_<outer key>"`` and mapped to its config
+        value unchanged. ``block_size``, ``E``, ``N1``, ``N2``, ``K`` and
+        ``TOPK`` are accepted but not read here.
+
+        Returns:
+            The flattened dict, or ``None`` when ``file_path`` does not exist.
+        """
+        if not os.path.exists(file_path):
+            return None
+        with open(file_path, 'r') as cache_fh:
+            raw_cache = json.load(cache_fh)
+        return {
+            f"{inner_key}_{outer_key}": inner_cfg
+            for outer_key, inner_map in raw_cache.items()
+            for inner_key, inner_cfg in inner_map.items()
+        }
 # Adapted from: https://stackoverflow.com/a/47212782/5082708
 class LazyDict(Mapping[str, T], Generic[T]):