Commit 5a5e4f3b authored by 王敏
Browse files

Merge remote-tracking branch 'origin/v0.9.2-dev-ds' into v0.9.2-dev-ds

# Conflicts:
#	vllm/model_executor/layers/fused_moe/ep_moe/layer.py
parents f505d366 a7992f79
...@@ -21,6 +21,7 @@ from vllm.utils import W8a8GetCacheJSON ...@@ -21,6 +21,7 @@ from vllm.utils import W8a8GetCacheJSON
import os import os
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm import envs from vllm import envs
try: try:
...@@ -441,3 +442,5 @@ class SlimQuantW4A8Int8MoEMethod: ...@@ -441,3 +442,5 @@ class SlimQuantW4A8Int8MoEMethod:
shared_output=shared_output, shared_output=shared_output,
routed_scaling_factor=routed_scaling_factor, routed_scaling_factor=routed_scaling_factor,
) )
...@@ -67,7 +67,6 @@ from .utils import (PPMissingLayer, is_pp_missing_parameter, ...@@ -67,7 +67,6 @@ from .utils import (PPMissingLayer, is_pp_missing_parameter,
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.utils import W8a8GetCacheJSON from vllm.utils import W8a8GetCacheJSON
os.environ['DPSK_FP16_QUICK'] = os.environ.get('DPSK_FP16_QUICK', '0')
class DeepseekV2MLP(nn.Module): class DeepseekV2MLP(nn.Module):
def __init__( def __init__(
...@@ -155,7 +154,6 @@ class DeepseekV2MoE(nn.Module): ...@@ -155,7 +154,6 @@ class DeepseekV2MoE(nn.Module):
vllm_config = get_current_vllm_config() vllm_config = get_current_vllm_config()
parallel_config = vllm_config.parallel_config parallel_config = vllm_config.parallel_config
self.enable_eplb = enable_eplb self.enable_eplb = enable_eplb
self.dpsk_fp16_quick = os.environ.get('DPSK_FP16_QUICK') == '1'
self.n_redundant_experts = parallel_config.num_redundant_experts self.n_redundant_experts = parallel_config.num_redundant_experts
self.n_logical_experts = self.n_routed_experts self.n_logical_experts = self.n_routed_experts
...@@ -227,13 +225,13 @@ class DeepseekV2MoE(nn.Module): ...@@ -227,13 +225,13 @@ class DeepseekV2MoE(nn.Module):
router_logits, _ = self.gate(hidden_states) router_logits, _ = self.gate(hidden_states)
if not self.use_mori_ep: if not self.use_mori_ep:
if envs.VLLM_USE_LIGHTOP and not self.dpsk_fp16_quick: if envs.VLLM_USE_LIGHTOP:
final_hidden_states = self.experts( final_hidden_states = self.experts(
hidden_states=hidden_states, hidden_states=hidden_states,
router_logits=router_logits, router_logits=router_logits,
shared_output=shared_output) shared_output=shared_output)
else: else:
if hidden_states.dtype != torch.float16 or self.dpsk_fp16_quick: if hidden_states.dtype != torch.float16:
final_hidden_states = self.experts( final_hidden_states = self.experts(
hidden_states=hidden_states, hidden_states=hidden_states,
router_logits=router_logits) * self.routed_scaling_factor router_logits=router_logits) * self.routed_scaling_factor
...@@ -243,7 +241,7 @@ class DeepseekV2MoE(nn.Module): ...@@ -243,7 +241,7 @@ class DeepseekV2MoE(nn.Module):
final_hidden_states = self.experts(hidden_states=hidden_states, final_hidden_states = self.experts(hidden_states=hidden_states,
router_logits=router_logits) router_logits=router_logits)
if shared_output is not None: if shared_output is not None:
if hidden_states.dtype != torch.float16 or self.dpsk_fp16_quick: if hidden_states.dtype != torch.float16:
final_hidden_states = final_hidden_states + shared_output final_hidden_states = final_hidden_states + shared_output
else: else:
# Fix FP16 overflow # Fix FP16 overflow
...@@ -671,7 +669,7 @@ class DeepseekV2DecoderLayer(nn.Module): ...@@ -671,7 +669,7 @@ class DeepseekV2DecoderLayer(nn.Module):
quant_config=quant_config, quant_config=quant_config,
prefix=f"{prefix}.self_attn", prefix=f"{prefix}.self_attn",
) )
self.dpsk_fp16_quick = os.environ.get('DPSK_FP16_QUICK') == '1'
if (config.n_routed_experts is not None if (config.n_routed_experts is not None
and layer_idx >= config.first_k_dense_replace and layer_idx >= config.first_k_dense_replace
and layer_idx % config.moe_layer_freq == 0): and layer_idx % config.moe_layer_freq == 0):
...@@ -724,7 +722,7 @@ class DeepseekV2DecoderLayer(nn.Module): ...@@ -724,7 +722,7 @@ class DeepseekV2DecoderLayer(nn.Module):
) )
residual = new_residual residual = new_residual
if hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick: if hidden_states.dtype == torch.float16:
# rmsnorm, and rmsnorm result would not affect by scale. # rmsnorm, and rmsnorm result would not affect by scale.
hidden_states *= 1. / self.routed_scaling_factor hidden_states *= 1. / self.routed_scaling_factor
if self.layer_idx == 0 or residual_fix_overflow: if self.layer_idx == 0 or residual_fix_overflow:
...@@ -735,7 +733,7 @@ class DeepseekV2DecoderLayer(nn.Module): ...@@ -735,7 +733,7 @@ class DeepseekV2DecoderLayer(nn.Module):
hidden_states, new_resi = self.mlp(hidden_states, self.post_attention_layernorm.weight.data, residual) hidden_states, new_resi = self.mlp(hidden_states, self.post_attention_layernorm.weight.data, residual)
if isinstance(self.mlp, if isinstance(self.mlp,
DeepseekV2MLP) and hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick: DeepseekV2MLP) and hidden_states.dtype == torch.float16:
# Fix FP16 overflow # Fix FP16 overflow
# Scaling the DeepseekV2MLP output, it is the input of # Scaling the DeepseekV2MLP output, it is the input of
# input_layernorm of next decoder layer. # input_layernorm of next decoder layer.
...@@ -760,7 +758,7 @@ class DeepseekV2DecoderLayer(nn.Module): ...@@ -760,7 +758,7 @@ class DeepseekV2DecoderLayer(nn.Module):
hidden_states=hidden_states, hidden_states=hidden_states,
) )
if hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick: if hidden_states.dtype == torch.float16:
# Fix FP16 overflow # Fix FP16 overflow
# We scale both hidden_states and residual before # We scale both hidden_states and residual before
# rmsnorm, and rmsnorm result would not affect by scale. # rmsnorm, and rmsnorm result would not affect by scale.
...@@ -776,7 +774,7 @@ class DeepseekV2DecoderLayer(nn.Module): ...@@ -776,7 +774,7 @@ class DeepseekV2DecoderLayer(nn.Module):
hidden_states = self.mlp(hidden_states) hidden_states = self.mlp(hidden_states)
if isinstance(self.mlp, if isinstance(self.mlp,
DeepseekV2MLP) and hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick: DeepseekV2MLP) and hidden_states.dtype == torch.float16:
# Fix FP16 overflow # Fix FP16 overflow
# Scaling the DeepseekV2MLP output, it is the input of # Scaling the DeepseekV2MLP output, it is the input of
# input_layernorm of next decoder layer. # input_layernorm of next decoder layer.
...@@ -1120,7 +1118,10 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts): ...@@ -1120,7 +1118,10 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP, MixtureOfExperts):
if is_pp_missing_parameter(name, self): if is_pp_missing_parameter(name, self):
continue continue
try:
param = params_dict[name] param = params_dict[name]
except Exception as e:
continue
weight_loader = getattr(param, "weight_loader", weight_loader = getattr(param, "weight_loader",
default_weight_loader) default_weight_loader)
weight_loader(param, loaded_weight) weight_loader(param, loaded_weight)
......
...@@ -159,8 +159,6 @@ def prepare_tbo_atten_metadata( ...@@ -159,8 +159,6 @@ def prepare_tbo_atten_metadata(
# The block_table for RIGHT starts from (req_offset-1). # The block_table for RIGHT starts from (req_offset-1).
# Align both offsets to that, and re-build the seq_lens for row-0. # Align both offsets to that, and re-build the seq_lens for row-0.
seq_len_offset = req_offset - 1 seq_len_offset = req_offset - 1
# query_start_offset = req_offset - 1
query_start_offset = req_offset query_start_offset = req_offset
# row-0 is the split request (global row index = req_offset-1): # row-0 is the split request (global row index = req_offset-1):
...@@ -182,7 +180,6 @@ def prepare_tbo_atten_metadata( ...@@ -182,7 +180,6 @@ def prepare_tbo_atten_metadata(
else: else:
# RIGHT without split-in-req: natural positions # RIGHT without split-in-req: natural positions
seq_len_offset = req_offset seq_len_offset = req_offset
# query_start_offset = req_offset
query_start_offset = req_offset + 1 query_start_offset = req_offset + 1
seq_lens_cpu_local = torch.as_tensor(default_seq_lens, device=runner.seq_lens_cpu.device) seq_lens_cpu_local = torch.as_tensor(default_seq_lens, device=runner.seq_lens_cpu.device)
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import copy import copy
import gc import gc
import time import time
...@@ -1301,7 +1302,6 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -1301,7 +1302,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
scheduler_output: "SchedulerOutput", scheduler_output: "SchedulerOutput",
intermediate_tensors: Optional[IntermediateTensors] = None, intermediate_tensors: Optional[IntermediateTensors] = None,
) -> Union[ModelRunnerOutput, IntermediateTensors]: ) -> Union[ModelRunnerOutput, IntermediateTensors]:
# profile.StartTracer()
self._update_states(scheduler_output) self._update_states(scheduler_output)
if not scheduler_output.total_num_scheduled_tokens: if not scheduler_output.total_num_scheduled_tokens:
if not has_kv_transfer_group(): if not has_kv_transfer_group():
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment