Commit 2eb579dd authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge branch 'v0.9.2-dev-ds' of http://10.16.6.30/dcutoolkit/deeplearing/vllm into v0.9.2-dev-ds

parents 7b8a9e18 0e92caa0
...@@ -81,7 +81,7 @@ class Attention(nn.Module): ...@@ -81,7 +81,7 @@ class Attention(nn.Module):
calculate_kv_scales = cache_config.calculate_kv_scales calculate_kv_scales = cache_config.calculate_kv_scales
else: else:
kv_cache_dtype = "auto" kv_cache_dtype = "auto"
block_size = 64 if envs.VLLM_USE_FLASH_ATTN_PA or envs.VLLM_USE_FLASH_MLA else 16 block_size = 64 if envs.VLLM_USE_FLASH_ATTN_PA and envs.VLLM_USE_FLASH_MLA else 16
is_attention_free = False is_attention_free = False
calculate_kv_scales = False calculate_kv_scales = False
if num_kv_heads is None: if num_kv_heads is None:
...@@ -312,7 +312,7 @@ class MultiHeadAttention(nn.Module): ...@@ -312,7 +312,7 @@ class MultiHeadAttention(nn.Module):
attn_backend = get_attn_backend(head_size, attn_backend = get_attn_backend(head_size,
dtype, dtype,
kv_cache_dtype=None, kv_cache_dtype=None,
block_size=64 if envs.VLLM_USE_FLASH_ATTN_PA or envs.VLLM_USE_FLASH_MLA else 16, block_size=64 if envs.VLLM_USE_FLASH_ATTN_PA and envs.VLLM_USE_FLASH_MLA else 16,
is_attention_free=False) is_attention_free=False)
backend = backend_name_to_enum(attn_backend.get_name()) backend = backend_name_to_enum(attn_backend.get_name())
if current_platform.is_rocm(): if current_platform.is_rocm():
......
...@@ -1499,7 +1499,7 @@ PrefixCachingHashAlgo = Literal["builtin", "sha256"] ...@@ -1499,7 +1499,7 @@ PrefixCachingHashAlgo = Literal["builtin", "sha256"]
class CacheConfig: class CacheConfig:
"""Configuration for the KV cache.""" """Configuration for the KV cache."""
block_size: BlockSize = 64 if envs.VLLM_USE_FLASH_ATTN_PA or envs.VLLM_USE_FLASH_MLA else 16 # type: ignore block_size: BlockSize = 64 if envs.VLLM_USE_FLASH_ATTN_PA and envs.VLLM_USE_FLASH_MLA else 16 # type: ignore
"""Size of a contiguous cache block in number of tokens. This is ignored on """Size of a contiguous cache block in number of tokens. This is ignored on
neuron devices and set to `--max-model-len`. On CUDA devices, only block neuron devices and set to `--max-model-len`. On CUDA devices, only block
sizes up to 32 are supported. On HPU devices, block size defaults to 128. sizes up to 32 are supported. On HPU devices, block size defaults to 128.
......
...@@ -949,6 +949,7 @@ def init_distributed_environment( ...@@ -949,6 +949,7 @@ def init_distributed_environment(
backend = "gloo" backend = "gloo"
# this backend is used for WORLD # this backend is used for WORLD
parallel_config = config.parallel_config
data_parallel_size = parallel_config.data_parallel_size data_parallel_size = parallel_config.data_parallel_size
use_mori_ep = envs.VLLM_USE_MORI_EP and data_parallel_size > 1 and parallel_config.enable_expert_parallel use_mori_ep = envs.VLLM_USE_MORI_EP and data_parallel_size > 1 and parallel_config.enable_expert_parallel
if use_mori_ep: if use_mori_ep:
......
...@@ -21,11 +21,14 @@ from vllm.model_executor.layers.fused_moe.layer import FusedMoEMethodBase, Unqua ...@@ -21,11 +21,14 @@ from vllm.model_executor.layers.fused_moe.layer import FusedMoEMethodBase, Unqua
from vllm.model_executor.layers.fused_moe.ep_moe.token_dispatcher import MoEAlltoAllTokenDispatcher from vllm.model_executor.layers.fused_moe.ep_moe.token_dispatcher import MoEAlltoAllTokenDispatcher
from vllm.model_executor.layers.fused_moe.ep_moe.ep_moe_utlis import EpMoeConfig from vllm.model_executor.layers.fused_moe.ep_moe.ep_moe_utlis import EpMoeConfig
from vllm.utils import direct_register_custom_op from vllm.utils import direct_register_custom_op
import mori
import torch.distributed as dist import torch.distributed as dist
from lmslim.layers.gemm.int8_utils import ( try:
import mori
from lmslim.layers.gemm.int8_utils import (
per_token_quant_int8) per_token_quant_int8)
except ImportError:
is_mori_available = False
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -239,6 +242,8 @@ class EPMoE(FusedMoE): ...@@ -239,6 +242,8 @@ class EPMoE(FusedMoE):
self.scales = None self.scales = None
self.use_int8_dispatch = True self.use_int8_dispatch = True
vllm_config = get_current_vllm_config()
self.max_num_inp_token_per_rank = vllm_config.scheduler_config.max_num_seqs
self.mori_op = self.get_mori_op() self.mori_op = self.get_mori_op()
self.first = True self.first = True
...@@ -270,7 +275,7 @@ class EPMoE(FusedMoE): ...@@ -270,7 +275,7 @@ class EPMoE(FusedMoE):
hidden_dim=self.hidden_size, hidden_dim=self.hidden_size,
scale_dim=1 if self.use_int8_dispatch else 0, scale_dim=1 if self.use_int8_dispatch else 0,
scale_type_size=mori_scale_type_size, scale_type_size=mori_scale_type_size,
max_num_inp_token_per_rank=512, max_num_inp_token_per_rank=self.max_num_inp_token_per_rank,
num_experts_per_rank=self.local_num_experts, num_experts_per_rank=self.local_num_experts,
num_experts_per_token=self.top_k, num_experts_per_token=self.top_k,
max_token_type_size=2, max_token_type_size=2,
...@@ -381,16 +386,33 @@ class EPMoE(FusedMoE): ...@@ -381,16 +386,33 @@ class EPMoE(FusedMoE):
) )
#self.sync() #self.sync()
expect_m = hidden_states.shape[0] * self.ep_size # expect_m = topk_ids.shape[0] * self.ep_size
dispatch_output_clip = dispatch_output[:expect_m] # dispatch_output_clip = dispatch_output[:expect_m]
dispatch_weights_clip = dispatch_weights[:expect_m] # dispatch_weights_clip = dispatch_weights[:expect_m]
dispatch_indices_clip = dispatch_indices[:expect_m] # dispatch_indices_clip = dispatch_indices[:expect_m]
dispatch_scales_clip = dispatch_scales[:expect_m] # dispatch_scales_clip = dispatch_scales[:expect_m]
# expert_output = self.quant_method.apply_ep(
# layer=self,
# x=dispatch_output_clip,
# topk_weights=dispatch_weights_clip,
# topk_ids=dispatch_indices_clip,
# global_num_experts=self.global_num_experts,
# expert_map=self.expert_map,
# activation=self.activation,
# apply_router_weight_on_input=self.apply_router_weight_on_input,
# use_nn_moe=self.use_nn_moe,
# num_local_tokens=dispatch_recv_num_token,
# config_select_bs=hidden_states.shape[0],
# scales=dispatch_scales_clip if self.use_int8_dispatch else None
# #routed_scaling_factor=self.routed_scaling_factor,
# )
expert_output = self.quant_method.apply_ep( expert_output = self.quant_method.apply_ep(
layer=self, layer=self,
x=dispatch_output_clip, x=dispatch_output,
topk_weights=dispatch_weights_clip, topk_weights=dispatch_weights,
topk_ids=dispatch_indices_clip, topk_ids=dispatch_indices,
global_num_experts=self.global_num_experts, global_num_experts=self.global_num_experts,
expert_map=self.expert_map, expert_map=self.expert_map,
activation=self.activation, activation=self.activation,
...@@ -398,25 +420,9 @@ class EPMoE(FusedMoE): ...@@ -398,25 +420,9 @@ class EPMoE(FusedMoE):
use_nn_moe=self.use_nn_moe, use_nn_moe=self.use_nn_moe,
num_local_tokens=dispatch_recv_num_token, num_local_tokens=dispatch_recv_num_token,
config_select_bs=hidden_states.shape[0], config_select_bs=hidden_states.shape[0],
scales=dispatch_scales_clip if self.use_int8_dispatch else None scales=dispatch_scales if self.use_int8_dispatch else None
#routed_scaling_factor=self.routed_scaling_factor, #routed_scaling_factor=self.routed_scaling_factor,
) )
# expert_output = self.quant_method.apply_ep(
# layer=self,
# x=dispatch_output,
# topk_weights=dispatch_weights,
# topk_ids=dispatch_indices,
# global_num_experts=self.global_num_experts,
# expert_map=self.expert_map,
# activation=self.activation,
# apply_router_weight_on_input=self.apply_router_weight_on_input,
# use_nn_moe=self.use_nn_moe,
# num_local_tokens=dispatch_recv_num_token,
# config_select_bs=hidden_states.shape[0]*2,
# scales=dispatch_scales if self.use_int8_dispatch else None
# #routed_scaling_factor=self.routed_scaling_factor,
# )
#self.sync() #self.sync()
combine_output, _ = self.mori_op.combine(expert_output, dispatch_weights, topk_ids) combine_output, _ = self.mori_op.combine(expert_output, dispatch_weights, topk_ids)
......
...@@ -242,11 +242,6 @@ class DeepseekV2MoE(nn.Module): ...@@ -242,11 +242,6 @@ class DeepseekV2MoE(nn.Module):
# See DeepseekV2DecoderLayer for more details. # See DeepseekV2DecoderLayer for more details.
final_hidden_states = self.experts(hidden_states=hidden_states, final_hidden_states = self.experts(hidden_states=hidden_states,
router_logits=router_logits) router_logits=router_logits)
else:
final_hidden_states = self.experts(hidden_states=hidden_states,
router_logits=router_logits)
if not self.use_mori_ep:
if shared_output is not None: if shared_output is not None:
if hidden_states.dtype != torch.float16 or self.dpsk_fp16_quick: if hidden_states.dtype != torch.float16 or self.dpsk_fp16_quick:
final_hidden_states = final_hidden_states + shared_output final_hidden_states = final_hidden_states + shared_output
...@@ -255,7 +250,11 @@ class DeepseekV2MoE(nn.Module): ...@@ -255,7 +250,11 @@ class DeepseekV2MoE(nn.Module):
# See DeepseekV2DecoderLayer for more details. # See DeepseekV2DecoderLayer for more details.
final_hidden_states = final_hidden_states + shared_output \ final_hidden_states = final_hidden_states + shared_output \
* (1. / self.routed_scaling_factor) * (1. / self.routed_scaling_factor)
else:
final_hidden_states = self.experts(hidden_states=hidden_states,
router_logits=router_logits)
if not self.use_mori_ep:
if self.tp_size > 1: if self.tp_size > 1:
if envs.VLLM_ENABLE_TBO: if envs.VLLM_ENABLE_TBO:
final_hidden_states = self.tbo_all_reduce(final_hidden_states) final_hidden_states = self.tbo_all_reduce(final_hidden_states)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment