Enable VLLM_USE_LIGHTOP and VLLM_USE_OPT_CAT by default (when not already set) for DeepseekV3ForCausalLM and DeepSeekMTPModel

Add a shared_output parameter to CompressedTensorsW8A8Int8MoEMethod and forward shared_output and routed_scaling_factor to the underlying MoE call

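Enable VLLM_USE_LIGHTOP and VLLM_USE_OPT_CAT by default (when not already set) for DeepseekV3ForCausalLM and DeepSeekMTPModel
Add a shared_output parameter to CompressedTensorsW8A8Int8MoEMethod and forward shared_output and routed_scaling_factor to the underlying MoE call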
c1ece8c6 · zhuwenwen · c7d0b817 · c1ece8c6 · c1ece8c6
Commit c1ece8c6 authored Sep 26, 2025 by zhuwenwen
2 changed files
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -1097,7 +1097,7 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
        expert_load_view: Optional[torch.Tensor] = None,
        logical_to_physical_map: Optional[torch.Tensor] = None,
        logical_replica_count: Optional[torch.Tensor] = None,
+        shared_output: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        if enable_eplb:
            raise NotImplementedError(
@@ -1137,7 +1137,9 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
            w2_scale=layer.w2_weight_scale,
            a1_scale=layer.w13_input_scale,
            a2_scale=layer.w2_input_scale,
-            use_nn_moe=False)
+            use_nn_moe=False,
+            shared_output=shared_output,
+            routed_scaling_factor=routed_scaling_factor)
 class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):

--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -244,7 +244,7 @@ def get_model_architecture(
            else:
                os.environ['LM_NN'] = '1'
-            if (architectures == ['DeepseekV3ForCausalLM'] or architectures == ['DeepSeekMTPModel']):
+            if architectures in [['DeepseekV3ForCausalLM'], ['DeepSeekMTPModel']]:
                if not envs.is_set("VLLM_USE_LIGHTOP"):
                    os.environ['VLLM_USE_LIGHTOP'] = '1'
                if not envs.is_set("VLLM_USE_OPT_CAT"):
@@ -254,6 +254,12 @@ def get_model_architecture(
                os.environ['GEMM_PAD'] = '0'
            if os.getenv('FA_PAD') != '1': 
                os.environ['FA_PAD'] = '0'
+        else:
+            if architectures in [['DeepseekV3ForCausalLM'], ['DeepSeekMTPModel']]:
+                if not envs.is_set("VLLM_USE_LIGHTOP"):
+                    os.environ['VLLM_USE_LIGHTOP'] = '1'
+                if not envs.is_set("VLLM_USE_OPT_CAT"):
+                    os.environ['VLLM_USE_OPT_CAT'] = '1'
        # awq相关配置
        try:
            if os.getenv('AWQ_MOE_SZ') == None: