增加fused-moe int4/int8的支持，以及deepseek精度问题的修复

bb94d2e5 · yangql · 087254b9 · bb94d2e5 · bb94d2e5
Commit bb94d2e5 authored Apr 07, 2025 by yangql
Showing with 536 additions and 163 deletions

vllm/model_executor/layers/fused_moe/fused_moe.py +506 -140

vllm/model_executor/models/deepseek_v2.py +30 -23

No files found.
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -164,21 +164,28 @@ class DeepseekV2MoE(nn.Module):
            shared_output = self.shared_experts(hidden_states)
        # router_logits: (num_tokens, n_experts)
        router_logits, _ = self.gate(hidden_states)
-        if hidden_states.dtype != torch.float16:
+        # if hidden_states.dtype != torch.float16:
-            final_hidden_states = self.experts(
+        #     final_hidden_states = self.experts(
-                hidden_states=hidden_states,
+        #         hidden_states=hidden_states,
-                router_logits=router_logits) * self.routed_scaling_factor
+        #         router_logits=router_logits) * self.routed_scaling_factor
-        else:
+        # else:
-            # This is a special case to avoid FP16 overflow
+        #     # This is a special case to avoid FP16 overflow
-            final_hidden_states = self.experts(hidden_states=hidden_states,
+        #     final_hidden_states = self.experts(hidden_states=hidden_states,
-                                               router_logits=router_logits)
+        #                                        router_logits=router_logits)
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states,
+            router_logits=router_logits) * self.routed_scaling_factor
        if shared_output is not None:
-            if hidden_states.dtype != torch.float16:
+            final_hidden_states = final_hidden_states + shared_output
-                final_hidden_states = final_hidden_states + shared_output
-            else:
+        # if shared_output is not None:
-                # This is a special case to avoid FP16 overflow
+        #     if hidden_states.dtype != torch.float16:
-                final_hidden_states = final_hidden_states + shared_output \
+        #         final_hidden_states = final_hidden_states + shared_output
-                    * (1. / self.routed_scaling_factor)
+        #     else:
+        #         # This is a special case to avoid FP16 overflow
+        #         final_hidden_states = final_hidden_states + shared_output \
+        #             * (1. / self.routed_scaling_factor)
        if self.tp_size > 1:
            final_hidden_states = tensor_model_parallel_all_reduce(
                final_hidden_states)
@@ -571,18 +578,18 @@ class DeepseekV2DecoderLayer(nn.Module):
        )
        # Fully Connected
-        if isinstance(self.mlp, DeepseekV2MoE) and \
+        # if isinstance(self.mlp, DeepseekV2MoE) and \
-            hidden_states.dtype == torch.float16:
+        #     hidden_states.dtype == torch.float16:
-            # This is a special case to avoid FP16 overflow
+        #     # This is a special case to avoid FP16 overflow
-            hidden_states *= 1. / self.routed_scaling_factor
+        #     hidden_states *= 1. / self.routed_scaling_factor
        hidden_states, residual = self.post_attention_layernorm(
            hidden_states, residual)
        hidden_states = self.mlp(hidden_states)
-        if isinstance(self.mlp, DeepseekV2MLP) and \
+        # if isinstance(self.mlp, DeepseekV2MLP) and \
-            hidden_states.dtype == torch.float16:
+        #     hidden_states.dtype == torch.float16:
-            # This is a special case to avoid FP16 overflow
+        #     # This is a special case to avoid FP16 overflow
-            hidden_states *= 1. / self.routed_scaling_factor
+        #     hidden_states *= 1. / self.routed_scaling_factor
-            residual *= 1. / self.routed_scaling_factor
+        #     residual *= 1. / self.routed_scaling_factor
        return hidden_states, residual