修改美团 DeepSeek channel-wise 模型的 MoE config 获取方式

去除 AutoTuning 的 info 级别提示信息

修改美团 DeepSeek channel-wise 模型的 MoE config 获取方式
去除 AutoTuning 的 info 级别提示信息
bf3bf955 · zhuwenwen · acf1b6c6 · bf3bf955 · bf3bf955
Commit bf3bf955 authored Jul 03, 2025 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing with 21 additions and 4 deletions

vllm/model_executor/layers/quantization/w8a8_int8.py vllm/model_executor/layers/quantization/w8a8_int8.py +17 -0

vllm/utils.py vllm/utils.py +4 -4

No files found.
--- a/vllm/model_executor/layers/quantization/w8a8_int8.py
+++ b/vllm/model_executor/layers/quantization/w8a8_int8.py
@@ -248,6 +248,7 @@ class W8A8Int8MoEMethod:

    def __init__(self, quant_config):
        self.quant_config = quant_config
+        self.tritonsingleton= W8a8GetCacheJSON()

    def create_weights(
        self,
@@ -302,6 +303,22 @@ class W8A8Int8MoEMethod:
        layer.register_parameter("w2_input_scale", w2_input_scale)

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        E=layer.w13_weight.shape[0]
+        N1=layer.w13_weight.shape[1]
+        N2=layer.w2_weight.shape[1]
+        K=layer.w2_weight.shape[2]
+        if [E,N1,N2,K] not in self.tritonsingleton.moe_weight_shapes:
+            self.tritonsingleton.moe_weight_shapes.append([E,N1,N2,K])
+            
+        TOPK= self.tritonsingleton.topk
+
+        json_file=self.tritonsingleton.get_moeint8json_name(E,N1,N2,K,TOPK)
+        configs_dict=self.tritonsingleton.get_moeint8_triton_cache(json_file,E,N1,N2,K,TOPK)
+        
+        #warmup
+        if configs_dict:
+            self.tritonsingleton.triton_moejson_dict.update(configs_dict)
+
        layer.w13_weight = Parameter(layer.w13_weight, requires_grad=False)
        layer.w2_weight = Parameter(layer.w2_weight, requires_grad=False)
        layer.w13_weight_scale = Parameter(

--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1904,7 +1904,7 @@ class W8a8GetCacheJSON:
        json_dir = os.getenv('LMSLIM_TUNING_JSON', "None")
        if json_dir is not "None" and os.path.exists(json_dir):
            #生成模型配置文件
-            logger.info("model_tuning.json is at LMSLIM_TUNING_JSON:%s", json_dir)
+            # logger.info("model_tuning.json is at LMSLIM_TUNING_JSON:%s", json_dir)
            config = {
                "layers": {
                    "linear": {
@@ -1938,12 +1938,12 @@ class W8a8GetCacheJSON:
                config["layers"]["linear"]["shapes"].append(shape)
            
            if block_size is not None:
-            config["quantization_config"]["weight_block_size"]=block_size
+                config["quantization_config"]["weight_block_size"]=block_size
                                    
            with open(json_dir+"/model.json", 'w') as f:
                json.dump(config, f, indent=4)
-        else:
-            logger.info("LMSLIM_TUNING_JSON is not set")
+        # else:
+        #     logger.info("LMSLIM_TUNING_JSON is not set")
                   
    def getspec_config(self,configs_dict,M,N,K):
        if f"{M}_{N}_{K}" in configs_dict: