Merge branch 'v0.9.2-dev-yql-1.15' into 'v0.9.2-dev'

V0.9.2 dev yql 1.15 See merge request dcutoolkit/deeplearing/vllm!367

Merge branch 'v0.9.2-dev-yql-1.15' into 'v0.9.2-dev'
V0.9.2 dev yql 1.15 See merge request dcutoolkit/deeplearing/vllm!367
c47f7e61 · zhuwenwen · efd51772 · ab66909d · c47f7e61 · c47f7e61
Commit c47f7e61 authored Jan 15, 2026 by zhuwenwen
3 changed files
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -11,7 +11,10 @@ from vllm.config import CompilationLevel, get_current_vllm_config
 from vllm.platforms import current_platform
 from vllm.utils import W8a8GetCacheJSON
 from lmslim.layers.gemm.int8_utils import per_token_quant_int8
-from lmslim.layers.gemm.fp8_utils import triton_scaled_mm_fp8
+try:
+    from lmslim.layers.gemm.fp8_utils import triton_scaled_mm_fp8
+except Exception:
+    print("INFO: Please updata lmslim if you want to use fp8_utils.\n") 
 # Input scaling factors are no longer optional in _scaled_mm starting
 # from pytorch 2.5. Allocating a dummy tensor to pass as input_scale
 TORCH_DEVICE_IDENTITY = None

--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -232,6 +232,11 @@ def get_model_architecture(
                                'ChatGLMModel', 'Glm4ForCausalLM', 'ChatGLMForConditionalGeneration', 'BaichuanForCausalLM', 'BloomForCausalLM', 'TeleChat2ForCausalLM', 'MixtralForCausalLM', 'FalconForCausalLM',
                                'MedusaModel', 'MLPSpeculatorPreTrainedModel', 'DeepseekV2ForCausalLM', 'DeepseekV3ForCausalLM', 'DeepSeekMTPModel']  
    if any(arch in architectures for arch in support_nn_architectures): 
+        # For quantization methods used with dtype fp16, turn off "VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD" by default.
+        if model_config.quantization in {"awq", "awq_marlin", "moe_wna16"}:
+            if not envs.is_set("VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD"):
+                os.environ['VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD'] = '0'      
        if not envs.VLLM_USE_NN:
            if os.getenv('LLAMA_NN') != '0': 
                if (architectures == ['QWenLMHeadModel'] or architectures == ['ChatGLMModel'] ) and visions != []:
@@ -287,7 +292,7 @@ def get_model_architecture(
            if os.getenv('FA_PAD') != '1': 
                os.environ['FA_PAD'] = '0'
        else:
-            if architectures in [['DeepseekV3ForCausalLM'], ['DeepSeekMTPModel']]:
+            if architectures in [['DeepseekV3ForCausalLM'], ['DeepSeekMTPModel']]:          
                if not envs.is_set("VLLM_USE_LIGHTOP"):
                    os.environ['VLLM_USE_LIGHTOP'] = '1'
                if not envs.is_set("VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD"):

--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -385,9 +385,12 @@ class DeepseekV2MoE(nn.Module):
                            # Fix FP16 overflow
                            # See DeepseekV2DecoderLayer for more details.
                            # fp16 mode not fused quant
+                            if i_q is not None:
+                                i_q=iqis[0]
+                                i_s=iqis[1]
                            final_hidden_states = self.experts(hidden_states=hidden_states,
                                                            router_logits=router_logits,
-                                                            i_q=iqis[0], i_s=iqis[1])
+                                                            i_q=i_q, i_s=i_s)
                        if shared_output is not None:
                            if hidden_states.dtype != torch.float16:
@@ -429,9 +432,12 @@ class DeepseekV2MoE(nn.Module):
                            assert shared_output is not None
                            final_hidden_states += (shared_output * (1. / self.routed_scaling_factor))
                    else:
+                        if i_q is not None:
+                            i_q=iqis[0]
+                            i_s=iqis[1]
                        final_hidden_states = self.experts(hidden_states=hidden_states,
                                                           router_logits=router_logits,
-                                                           i_q=iqis[0], i_s=iqis[1])
+                                                           i_q=i_q, i_s=i_s)
                        if shared_output is not None:
                            if hidden_states.dtype != torch.float16: