"vllm/vscode:/vscode.git/clone" did not exist on "b46e4a06f1aa8f243d3ed8d3b30d14ae56863d3d"
Commit 94c4ca4d authored by zhuwenwen
Browse files

Merge branch 'v0.9.2-dev-ds-wm-1210' into 'v0.9.2-dev-ds'

[fix]修复deepep 高吞吐模式vmfault问题

See merge request dcutoolkit/deeplearing/vllm!291
parents 8ae59a9c 916b5876
......@@ -285,9 +285,10 @@ class CompressedTensorsW8A8Int8MarlinMoEMethod(CompressedTensorsMarlinMoEMethod)
use_nn_moe: Optional[bool] = False,
routed_scaling_factor: Optional[float] = None,
shared_output: Optional[torch.Tensor] = None,
q_x: Optional[torch.Tensor] = None,
**_ ):
return fused_experts_impl_int8_marlin(
hidden_states=x,
hidden_states=x if q_x is None else q_x,
w1=w1,
w2=w2,
topk_weights=topk_weights,
......
......@@ -263,7 +263,7 @@ class SlimQuantW4A8Int8MarlinMoEMethod:
**_ ):
workspace, global_reduce_buffer = MarlinMoeWorkspace(x.device).get_buffers()
return fused_experts_impl_w4a8_marlin(
x,
x if q_x is None else q_x,
w1,
w2,
topk_ids=topk_ids,
......@@ -510,6 +510,8 @@ class SlimQuantW4A8Int8MarlinMoEMethod:
False)
return TritonOrGroupGemmExperts(
# use_int4_w4a8=True,
# per_act_token_quant=True,
fused_experts=self.w4a8_fused_moe_marlin_forward
)
......@@ -717,9 +717,8 @@ class DeepseekV2DecoderLayer(nn.Module):
self.dp_size = get_dp_group().world_size
vllm_config = get_current_vllm_config()
parallel_config = vllm_config.parallel_config
self.use_deepep = self.dp_size > 1 and parallel_config.enable_expert_parallel and \
(envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput" or \
envs.VLLM_ALL2ALL_BACKEND == "deepep_low_latency")
self.use_deepep_ll = self.dp_size > 1 and parallel_config.enable_expert_parallel and \
envs.VLLM_ALL2ALL_BACKEND == "deepep_low_latency"
self.tp_size = get_tensor_model_parallel_world_size()
if (config.n_routed_experts is not None
......@@ -848,7 +847,7 @@ class DeepseekV2DecoderLayer(nn.Module):
hidden_states, residual)
if isinstance(self.mlp,
DeepseekV2MoE) and self.use_deepep and self.tp_size > 1:
DeepseekV2MoE) and self.use_deepep_ll and self.tp_size > 1:
self.tp_rank = get_tensor_model_parallel_rank()
ori_bs = hidden_states.shape[0]
......@@ -861,7 +860,7 @@ class DeepseekV2DecoderLayer(nn.Module):
hidden_states = self.mlp(hidden_states)
if isinstance(self.mlp,
DeepseekV2MoE) and self.use_deepep and self.tp_size > 1:
DeepseekV2MoE) and self.use_deepep_ll and self.tp_size > 1:
hidden_states = tensor_model_parallel_all_gather(hidden_states, dim=0).contiguous()
hidden_states = hidden_states[:ori_bs, :].contiguous()
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment