"docs/vscode:/vscode.git/clone" did not exist on "935c46dd9bad76b11c4f7392ed8140109093e7ca"
Commit c8de4a43 authored by zhuwenwen
Browse files

Merge branch 'v0.9.2-dev-ds-yql' into 'v0.9.2-dev-ds'

删除DPSK_FP16_QUICK,以及增加awq和blockwiseint8的shared_output接口 (Remove DPSK_FP16_QUICK, and add the shared_output interface for AWQ and blockwise INT8)

See merge request dcutoolkit/deeplearing/vllm!227
parents c3b8a0ae 7f459b46
...@@ -238,8 +238,6 @@ class EPMoE(FusedMoE): ...@@ -238,8 +238,6 @@ class EPMoE(FusedMoE):
self.shared_expert_overlap = moe_shared_expert_overlap self.shared_expert_overlap = moe_shared_expert_overlap
self.shared_experts = None self.shared_experts = None
self.dpsk_fp16_quick = os.environ.get('DPSK_FP16_QUICK') == '1'
self.scales = None self.scales = None
self.use_int8_dispatch = True self.use_int8_dispatch = True
...@@ -435,7 +433,7 @@ class EPMoE(FusedMoE): ...@@ -435,7 +433,7 @@ class EPMoE(FusedMoE):
# self.maybe_all_reduce_tensor_model_parallel( # self.maybe_all_reduce_tensor_model_parallel(
# shared_output)) # shared_output))
if hidden_states.dtype != torch.float16 or self.dpsk_fp16_quick: if hidden_states.dtype != torch.float16:
final_hidden_states = final_hidden_states + shared_output final_hidden_states = final_hidden_states + shared_output
else: else:
# Fix FP16 overflow # Fix FP16 overflow
......
...@@ -181,7 +181,6 @@ class MoEAlltoAllTokenDispatcher(MoETokenDispatcher): ...@@ -181,7 +181,6 @@ class MoEAlltoAllTokenDispatcher(MoETokenDispatcher):
self.use_all_gather = current_platform.use_all_gather() self.use_all_gather = current_platform.use_all_gather()
self.probs = None self.probs = None
self.dpsk_fp16_quick = os.environ.get('DPSK_FP16_QUICK') == '1'
# For smuggling this layer into the fused moe custom op # For smuggling this layer into the fused moe custom op
vllm_config = get_current_vllm_config() vllm_config = get_current_vllm_config()
...@@ -446,7 +445,7 @@ class MoEAlltoAllTokenDispatcher(MoETokenDispatcher): ...@@ -446,7 +445,7 @@ class MoEAlltoAllTokenDispatcher(MoETokenDispatcher):
if self.config.moe_shared_expert_overlap and self.shared_experts is not None: if self.config.moe_shared_expert_overlap and self.shared_experts is not None:
shared_output = self.shared_experts.get_output() shared_output = self.shared_experts.get_output()
if hidden_states.dtype != torch.float16 or self.dpsk_fp16_quick: if hidden_states.dtype != torch.float16:
output = output + shared_output output = output + shared_output
else: else:
# Fix FP16 overflow # Fix FP16 overflow
......
...@@ -45,9 +45,6 @@ from lightop import op ...@@ -45,9 +45,6 @@ from lightop import op
# from .rocm_aiter_fused_moe import is_rocm_aiter_moe_enabled # from .rocm_aiter_fused_moe import is_rocm_aiter_moe_enabled
os.environ['DPSK_FP16_QUICK'] = os.environ.get('DPSK_FP16_QUICK', '0')
dpsk_fp16_quick = os.environ.get('DPSK_FP16_QUICK') == '1'
logger = init_logger(__name__) logger = init_logger(__name__)
if envs.VLLM_USE_GLOBAL_CACHE13: if envs.VLLM_USE_GLOBAL_CACHE13:
...@@ -1899,7 +1896,7 @@ def fused_experts_impl( ...@@ -1899,7 +1896,7 @@ def fused_experts_impl(
block_shape=block_shape, block_shape=block_shape,
use_nn_moe=use_nn_moe) use_nn_moe=use_nn_moe)
if envs.VLLM_USE_LIGHTOP and not dpsk_fp16_quick: if envs.VLLM_USE_LIGHTOP:
from lightop import op as op from lightop import op as op
op.moe_sum(input=intermediate_cache3.view(*intermediate_cache3.size()), op.moe_sum(input=intermediate_cache3.view(*intermediate_cache3.size()),
output=out_hidden_states[begin_chunk_idx:end_chunk_idx], bias=shared_output[begin_chunk_idx:end_chunk_idx], output=out_hidden_states[begin_chunk_idx:end_chunk_idx], bias=shared_output[begin_chunk_idx:end_chunk_idx],
......
...@@ -514,6 +514,7 @@ class AWQMoEMethod(FusedMoEMethodBase): ...@@ -514,6 +514,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
expert_load_view: Optional[torch.Tensor] = None, expert_load_view: Optional[torch.Tensor] = None,
logical_to_physical_map: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None,
logical_replica_count: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None,
shared_output: Optional[torch.Tensor] = None,
) -> torch.Tensor: ) -> torch.Tensor:
if enable_eplb: if enable_eplb:
raise NotImplementedError( raise NotImplementedError(
......
...@@ -473,6 +473,7 @@ class BlockInt8MoEMethod: ...@@ -473,6 +473,7 @@ class BlockInt8MoEMethod:
use_nn_moe: Optional[bool] = False, use_nn_moe: Optional[bool] = False,
routed_scaling_factor: Optional[float] = None, routed_scaling_factor: Optional[float] = None,
use_fused_gate: Optional[bool] = False, use_fused_gate: Optional[bool] = False,
shared_output: Optional[torch.Tensor] = None,
**_ **_
) -> torch.Tensor: ) -> torch.Tensor:
from vllm.model_executor.layers.fused_moe import fused_experts from vllm.model_executor.layers.fused_moe import fused_experts
...@@ -514,5 +515,6 @@ class BlockInt8MoEMethod: ...@@ -514,5 +515,6 @@ class BlockInt8MoEMethod:
a1_scale=layer.w13_input_scale, a1_scale=layer.w13_input_scale,
a2_scale=layer.w2_input_scale, a2_scale=layer.w2_input_scale,
block_shape=self.quant_config.weight_block_size, block_shape=self.quant_config.weight_block_size,
use_nn_moe=use_nn_moe use_nn_moe=use_nn_moe,
shared_output=shared_output,
) )
\ No newline at end of file
...@@ -348,6 +348,7 @@ class MoeWNA16Method(FusedMoEMethodBase): ...@@ -348,6 +348,7 @@ class MoeWNA16Method(FusedMoEMethodBase):
expert_load_view: Optional[torch.Tensor] = None, expert_load_view: Optional[torch.Tensor] = None,
logical_to_physical_map: Optional[torch.Tensor] = None, logical_to_physical_map: Optional[torch.Tensor] = None,
logical_replica_count: Optional[torch.Tensor] = None, logical_replica_count: Optional[torch.Tensor] = None,
shared_output: Optional[torch.Tensor] = None,
use_nn_moe: Optional[bool] = False, use_nn_moe: Optional[bool] = False,
routed_scaling_factor: Optional[float] = None, routed_scaling_factor: Optional[float] = None,
use_fused_gate: Optional[bool] = False, use_fused_gate: Optional[bool] = False,
...@@ -430,7 +431,9 @@ class MoeWNA16Method(FusedMoEMethodBase): ...@@ -430,7 +431,9 @@ class MoeWNA16Method(FusedMoEMethodBase):
w1_zp=layer.w13_qzeros if has_zp else None, w1_zp=layer.w13_qzeros if has_zp else None,
w2_zp=layer.w2_qzeros if has_zp else None, w2_zp=layer.w2_qzeros if has_zp else None,
block_shape=[0, layer.group_size], block_shape=[0, layer.group_size],
use_nn_moe=False) use_nn_moe=False,
shared_output=shared_output,
)
@staticmethod @staticmethod
def get_weight_loader(layer, weight_loader): def get_weight_loader(layer, weight_loader):
......
...@@ -67,7 +67,6 @@ from .utils import (PPMissingLayer, is_pp_missing_parameter, ...@@ -67,7 +67,6 @@ from .utils import (PPMissingLayer, is_pp_missing_parameter,
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.utils import W8a8GetCacheJSON from vllm.utils import W8a8GetCacheJSON
os.environ['DPSK_FP16_QUICK'] = os.environ.get('DPSK_FP16_QUICK', '0')
class DeepseekV2MLP(nn.Module): class DeepseekV2MLP(nn.Module):
def __init__( def __init__(
...@@ -155,7 +154,6 @@ class DeepseekV2MoE(nn.Module): ...@@ -155,7 +154,6 @@ class DeepseekV2MoE(nn.Module):
vllm_config = get_current_vllm_config() vllm_config = get_current_vllm_config()
parallel_config = vllm_config.parallel_config parallel_config = vllm_config.parallel_config
self.enable_eplb = enable_eplb self.enable_eplb = enable_eplb
self.dpsk_fp16_quick = os.environ.get('DPSK_FP16_QUICK') == '1'
self.n_redundant_experts = parallel_config.num_redundant_experts self.n_redundant_experts = parallel_config.num_redundant_experts
self.n_logical_experts = self.n_routed_experts self.n_logical_experts = self.n_routed_experts
...@@ -227,13 +225,13 @@ class DeepseekV2MoE(nn.Module): ...@@ -227,13 +225,13 @@ class DeepseekV2MoE(nn.Module):
router_logits, _ = self.gate(hidden_states) router_logits, _ = self.gate(hidden_states)
if not self.use_mori_ep: if not self.use_mori_ep:
if envs.VLLM_USE_LIGHTOP and not self.dpsk_fp16_quick: if envs.VLLM_USE_LIGHTOP:
final_hidden_states = self.experts( final_hidden_states = self.experts(
hidden_states=hidden_states, hidden_states=hidden_states,
router_logits=router_logits, router_logits=router_logits,
shared_output=shared_output) shared_output=shared_output)
else: else:
if hidden_states.dtype != torch.float16 or self.dpsk_fp16_quick: if hidden_states.dtype != torch.float16:
final_hidden_states = self.experts( final_hidden_states = self.experts(
hidden_states=hidden_states, hidden_states=hidden_states,
router_logits=router_logits) * self.routed_scaling_factor router_logits=router_logits) * self.routed_scaling_factor
...@@ -243,7 +241,7 @@ class DeepseekV2MoE(nn.Module): ...@@ -243,7 +241,7 @@ class DeepseekV2MoE(nn.Module):
final_hidden_states = self.experts(hidden_states=hidden_states, final_hidden_states = self.experts(hidden_states=hidden_states,
router_logits=router_logits) router_logits=router_logits)
if shared_output is not None: if shared_output is not None:
if hidden_states.dtype != torch.float16 or self.dpsk_fp16_quick: if hidden_states.dtype != torch.float16:
final_hidden_states = final_hidden_states + shared_output final_hidden_states = final_hidden_states + shared_output
else: else:
# Fix FP16 overflow # Fix FP16 overflow
...@@ -671,7 +669,7 @@ class DeepseekV2DecoderLayer(nn.Module): ...@@ -671,7 +669,7 @@ class DeepseekV2DecoderLayer(nn.Module):
quant_config=quant_config, quant_config=quant_config,
prefix=f"{prefix}.self_attn", prefix=f"{prefix}.self_attn",
) )
self.dpsk_fp16_quick = os.environ.get('DPSK_FP16_QUICK') == '1'
if (config.n_routed_experts is not None if (config.n_routed_experts is not None
and layer_idx >= config.first_k_dense_replace and layer_idx >= config.first_k_dense_replace
and layer_idx % config.moe_layer_freq == 0): and layer_idx % config.moe_layer_freq == 0):
...@@ -724,7 +722,7 @@ class DeepseekV2DecoderLayer(nn.Module): ...@@ -724,7 +722,7 @@ class DeepseekV2DecoderLayer(nn.Module):
) )
residual = new_residual residual = new_residual
if hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick: if hidden_states.dtype == torch.float16:
# rmsnorm, and rmsnorm result would not affect by scale. # rmsnorm, and rmsnorm result would not affect by scale.
hidden_states *= 1. / self.routed_scaling_factor hidden_states *= 1. / self.routed_scaling_factor
if self.layer_idx == 0 or residual_fix_overflow: if self.layer_idx == 0 or residual_fix_overflow:
...@@ -735,7 +733,7 @@ class DeepseekV2DecoderLayer(nn.Module): ...@@ -735,7 +733,7 @@ class DeepseekV2DecoderLayer(nn.Module):
hidden_states, new_resi = self.mlp(hidden_states, self.post_attention_layernorm.weight.data, residual) hidden_states, new_resi = self.mlp(hidden_states, self.post_attention_layernorm.weight.data, residual)
if isinstance(self.mlp, if isinstance(self.mlp,
DeepseekV2MLP) and hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick: DeepseekV2MLP) and hidden_states.dtype == torch.float16:
# Fix FP16 overflow # Fix FP16 overflow
# Scaling the DeepseekV2MLP output, it is the input of # Scaling the DeepseekV2MLP output, it is the input of
# input_layernorm of next decoder layer. # input_layernorm of next decoder layer.
...@@ -760,7 +758,7 @@ class DeepseekV2DecoderLayer(nn.Module): ...@@ -760,7 +758,7 @@ class DeepseekV2DecoderLayer(nn.Module):
hidden_states=hidden_states, hidden_states=hidden_states,
) )
if hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick: if hidden_states.dtype == torch.float16:
# Fix FP16 overflow # Fix FP16 overflow
# We scale both hidden_states and residual before # We scale both hidden_states and residual before
# rmsnorm, and rmsnorm result would not affect by scale. # rmsnorm, and rmsnorm result would not affect by scale.
...@@ -776,7 +774,7 @@ class DeepseekV2DecoderLayer(nn.Module): ...@@ -776,7 +774,7 @@ class DeepseekV2DecoderLayer(nn.Module):
hidden_states = self.mlp(hidden_states) hidden_states = self.mlp(hidden_states)
if isinstance(self.mlp, if isinstance(self.mlp,
DeepseekV2MLP) and hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick: DeepseekV2MLP) and hidden_states.dtype == torch.float16:
# Fix FP16 overflow # Fix FP16 overflow
# Scaling the DeepseekV2MLP output, it is the input of # Scaling the DeepseekV2MLP output, it is the input of
# input_layernorm of next decoder layer. # input_layernorm of next decoder layer.
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment