Commit 99a0c39e authored by xingjinliang

Sync latest code

parent 50fe58fa
Pipeline #2152 passed with stage
......@@ -57,8 +57,11 @@ Megatron-Core offers rich parallelism mappings, combining Expert Parallelism wit
| --expert-tensor-parallel-size | Degree of tensor model parallelism for the expert layers. Defaults to the same value as --tensor-model-parallel-size. |
| --moe-layer-freq | Frequency between MoE layers and Dense layers. Accepts either: 1) An integer N for a 1:N ratio (one expert layer for every N-1 dense layers), 2) A string "N" for the same ratio, or 3) A string with a Python list expression for custom patterns, e.g. `([1]*3+[0]*1)*3` gives [1,1,1,0,1,1,1,0,1,1,1,0] where 1=expert layer and 0=dense layer. Examples: `([0]+[1]*23)` for 1 dense layer followed by 23 expert layers, `([1]*3+[0]*2)*2` for three expert layers followed by two dense layers, repeated twice. Default is 1. See the expansion sketch below this table. |
| --moe-grouped-gemm | When there are multiple experts per rank, launch multiple local GEMM kernels in multiple streams to improve the utilization and performance with GroupedLinear in TransformerEngine. |
| --moe-router-load-balancing-type | Determines the load balancing strategy for the router. "aux_loss" corresponds to the load balancing loss used in GShard and SwitchTransformer, "sinkhorn" corresponds to the balancing algorithm used in S-BASE, and "none" implies no load balancing. The default is "aux_loss". |
| --moe-router-load-balancing-type | Determines the load balancing strategy for the router. "aux_loss" corresponds to the load balancing loss used in GShard and SwitchTransformer; "seq_aux_loss" corresponds to the load balancing loss used in DeepSeekV2, which computes the loss for each individual sample; "sinkhorn" corresponds to the balancing algorithm used in S-BASE, and "none" implies no load balancing. The default is "aux_loss". |
| --moe-router-topk | Number of experts to route to for each token. The default is 2. |
| --moe-router-pre-softmax | Enable pre-softmax routing for MoE, which means softmax is before the top-k selection. By default, softmax is done after top-k. |
| --moe-router-topk-limited-devices | Number of expert parallel ranks to consider for each token during routing. Performs top-k routing on a subset of expert parallel ranks by first selecting N ranks for each token, then conducting top-k selection among the experts on these ranks. Default is None, which means no device limitation. |
| --moe-router-topk-scaling-factor | Scaling factor for the routing score in top-k selection; only takes effect when --moe-router-pre-softmax is enabled. Defaults to None, which means no scaling. |
| --moe-aux-loss-coeff | Scaling coefficient for the aux loss: a starting value of 1e-2 is recommended. Default is 0.0. |
| --moe-z-loss-coeff | Scaling coefficient for the z-loss: a starting value of 1e-3 is recommended. Default is None. |
| --moe-input-jitter-eps | Add noise to the input tensor by applying jitter with a specified epsilon value. Default is None. |
......
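For reference, a minimal standalone sketch (not Megatron code; `expand_moe_layer_freq` is a hypothetical helper) of how the integer and list-expression forms of `--moe-layer-freq` expand into a per-layer MoE pattern:

```python
# Standalone sketch, assuming the rules described in the table above.
def expand_moe_layer_freq(moe_layer_freq, num_layers):
    if isinstance(moe_layer_freq, int):
        # Integer N: an expert layer every N layers, starting from the first.
        return [1 if (i % moe_layer_freq == 0) else 0 for i in range(num_layers)]
    # String form: evaluate the Python list expression, e.g. "([1]*3+[0]*1)*3".
    pattern = eval(moe_layer_freq)
    assert len(pattern) == num_layers, "pattern length must match the layer count"
    return pattern

print(expand_moe_layer_freq("([1]*3+[0]*1)*3", 12))  # [1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0]
print(expand_moe_layer_freq(2, 6))                   # [1, 0, 1, 0, 1, 0]
```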
......@@ -9,15 +9,13 @@ import torch
from megatron.core import parallel_state, tensor_parallel
from megatron.core.transformer.mlp import MLPSubmodules
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.moe.experts import GroupedMLP, SequentialMLP, TEGroupedMLP
from megatron.core.transformer.moe.legacy_a2a_token_dispatcher import MoEAlltoAllSEQTokenDispatcher
from megatron.core.transformer.moe.router import TopKRouter
from megatron.core.transformer.moe.shared_experts import SharedExpertMLP
from megatron.core.transformer.moe.token_dispatcher import (
MoEAllGatherTokenDispatcher,
MoEAlltoAllTokenDispatcher,
)
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.spec_utils import ModuleSpec, build_module
from megatron.core.transformer.transformer_config import TransformerConfig
......@@ -89,20 +87,6 @@ class MoELayer(BaseMoELayer):
# Initialize router
self.router = TopKRouter(config=self.config)
# Initialize experts
if self.config.moe_grouped_gemm:
if isinstance(self.submodules.experts, MLPSubmodules):
self.experts = TEGroupedMLP(
self.num_local_experts, self.config, self.submodules.experts
)
else:
self.experts = GroupedMLP(self.num_local_experts, self.config)
else:
assert isinstance(self.submodules.experts, MLPSubmodules)
self.experts = SequentialMLP(
self.num_local_experts, self.config, self.submodules.experts
)
# Initialize token dispatcher
if config.moe_token_dispatcher_type == "allgather":
self.token_dispatcher = MoEAllGatherTokenDispatcher(
......@@ -121,9 +105,12 @@ class MoELayer(BaseMoELayer):
f"Unsupported token dispatcher type: {config.moe_token_dispatcher_type}"
)
# Initialize experts
self.experts = build_module(self.submodules.experts, self.num_local_experts, self.config)
# Initialize shared experts
if self.use_shared_expert:
self.shared_experts = SharedExpertMLP(self.config, self.submodules.shared_experts)
self.shared_experts = build_module(self.submodules.shared_experts, config=self.config)
if self.shared_expert_overlap:
self.token_dispatcher.set_shared_experts(self.shared_experts)
......
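The hunk above drops the explicit GroupedMLP/TEGroupedMLP/SequentialMLP branching and instead builds both the experts and the shared experts from their module specs via `build_module`. A simplified sketch of that spec-driven pattern, using a pared-down `ModuleSpec` (illustration only, not the real `spec_utils` API):

```python
# Minimal sketch of spec-driven module construction; names mirror
# megatron.core.transformer.spec_utils but the details are simplified.
from dataclasses import dataclass, field
from typing import Any, Union

@dataclass
class ModuleSpec:
    module: type                  # class to instantiate, e.g. an experts or shared-experts MLP
    params: dict = field(default_factory=dict)
    submodules: Any = None

def build_module(spec_or_module: Union[ModuleSpec, type], *args, **kwargs):
    # The spec already encodes which class to build, so MoELayer no longer
    # has to branch on config.moe_grouped_gemm itself.
    if isinstance(spec_or_module, type):
        return spec_or_module(*args, **kwargs)
    if spec_or_module.submodules is not None:
        kwargs["submodules"] = spec_or_module.submodules
    return spec_or_module.module(*args, **spec_or_module.params, **kwargs)

# e.g. experts_spec = ModuleSpec(module=SequentialMLP, submodules=mlp_submodules)
#      experts = build_module(experts_spec, num_local_experts, config)
```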
......@@ -56,6 +56,45 @@ def switch_load_balancing_loss_func(
return aux_loss
def sequence_load_balancing_loss_func(
probs: torch.Tensor,
routing_map: torch.Tensor,
tokens_per_expert: torch.Tensor,
batch_size: int,
seq_length: int,
topk: int,
moe_aux_loss_coeff: float,
sequence_partition_group=None,
):
"""
Calculate the sequence-level auxiliary loss by computing the loss for each individual sample.
Refer to the DeepSeek-V2 huggingface repo
(https://huggingface.co/deepseek-ai/DeepSeek-V2) for details.
"""
num_sub_sequence = 1
# If the sequence is partitioned by certain parallelism strategies like Sequence Parallelism
# or Context Parallelism, compute the gradient of the auxiliary loss with respect to the full
# sequence.
if sequence_partition_group is not None:
# We can keep `aggregated_probs_per_expert` local since we don't need the gradient for
# `tokens_per_expert`, saving one allreduce operation for `aggregated_probs_per_expert`.
num_sub_sequence = torch.distributed.get_world_size(sequence_partition_group)
torch.distributed.all_reduce(tokens_per_expert, group=sequence_partition_group)
assert num_sub_sequence == 1, "Sequence-level aux loss is not supported when the sequence is partitioned"
num_experts = probs.shape[1]
probs_for_aux_loss = probs.view(seq_length, batch_size, -1)
cost_coeff = routing_map.view(seq_length, batch_size, -1).sum(dim=0).float()
cost_coeff.div_(seq_length * topk / num_experts)
seq_aux_loss = (cost_coeff * probs_for_aux_loss.mean(dim=0)).sum(dim=1).mean()
seq_aux_loss *= moe_aux_loss_coeff
return seq_aux_loss
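A toy, standalone reproduction of the sequence-level loss computed above (random data, no sequence partitioning), useful for checking the scale of the result:

```python
import torch

S, B, E, topk, coeff = 4, 2, 4, 2, 1e-2
probs = torch.softmax(torch.randn(S * B, E), dim=-1)           # router probs, [num_tokens, E]
top_idx = torch.topk(probs, k=topk, dim=-1).indices
routing_map = torch.zeros(S * B, E).scatter_(1, top_idx, 1.0)  # 1.0 where an expert is selected

# Same arithmetic as sequence_load_balancing_loss_func with no sequence partitioning.
cost_coeff = routing_map.view(S, B, E).sum(dim=0) / (S * topk / E)
seq_aux_loss = (cost_coeff * probs.view(S, B, E).mean(dim=0)).sum(dim=1).mean() * coeff
print(seq_aux_loss)  # equals coeff exactly only under perfectly balanced routing
```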
def z_loss_func(logits, z_loss_coeff):
"""Encourages the router's logits to remain small to enhance stability.
Please refer to the ST-MoE paper (https://arxiv.org/pdf/2202.08906.pdf) for details.
......@@ -108,7 +147,7 @@ def get_capacity(num_tokens: int, num_experts: int, capacity_factor: float, min_
class MoEAuxLossAutoScaler(torch.autograd.Function):
"""An AutoScaler that compute and scales the grad for auxiliary loss."""
"""An AutoScaler that triggers the backward pass and scales the grad for auxiliary loss."""
main_loss_backward_scale: torch.Tensor = torch.tensor(1.0)
......@@ -228,6 +267,52 @@ def sort_chunks_by_idxs(input: torch.Tensor, split_sizes: torch.Tensor, sorted_i
return output
def device_limited_topk(
scores: torch.Tensor,
topk: int,
num_tokens: int,
num_experts: int,
moe_router_topk_limited_devices: int,
):
"""Perform top-k routing on a subset of expert parallel ranks.
Selects N ranks for each token, then conducts top-k selection among experts on these devices.
See DeepSeek-V2 technical report (https://arxiv.org/pdf/2405.04434) for details.
Args:
scores (torch.Tensor): Softmax scores from the router.
topk (int): The number of experts to select for each token.
num_tokens (int): The number of tokens.
num_experts (int): The number of experts.
moe_router_topk_limited_devices (int): Number of expert parallel ranks to consider for
each token during routing. None means no device limitation.
Returns:
Tuple[torch.Tensor, torch.Tensor]: Probs and indices tensor.
"""
# Organize the experts into groups
num_group = (
parallel_state.get_expert_model_parallel_world_size()
) # num_group equals the expert parallel size
group_scores = scores.view(num_tokens, num_group, -1).max(dim=-1).values
group_idx = torch.topk(group_scores, k=moe_router_topk_limited_devices, dim=-1, sorted=False)[1]
group_mask = torch.zeros_like(group_scores)
group_mask.scatter_(1, group_idx, 1)
# Mask the experts based on selection groups
score_mask = (
group_mask.unsqueeze(-1)
.expand(num_tokens, num_group, num_experts // num_group)
.reshape(num_tokens, -1)
)
masked_scores = scores.masked_fill(~score_mask.bool(), 0.0)
probs, top_indices = torch.topk(masked_scores, k=topk, dim=-1)
return probs, top_indices
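A standalone toy run of the same group-limited selection with hypothetical sizes (1 token, 8 experts spread over 4 expert-parallel ranks, top-2 routing limited to the 2 best ranks):

```python
import torch

scores = torch.tensor([[0.30, 0.05, 0.02, 0.01, 0.25, 0.20, 0.10, 0.07]])  # [1, 8]
num_group, limited, topk = 4, 2, 2

group_scores = scores.view(1, num_group, -1).max(dim=-1).values        # best expert per rank
group_idx = torch.topk(group_scores, k=limited, dim=-1, sorted=False)[1]
group_mask = torch.zeros_like(group_scores).scatter_(1, group_idx, 1)
score_mask = group_mask.unsqueeze(-1).expand(1, num_group, 2).reshape(1, -1)
masked = scores.masked_fill(~score_mask.bool(), 0.0)
print(torch.topk(masked, k=topk, dim=-1))
# Experts 0 and 4 win: only rank 0 (experts 0-1) and rank 2 (experts 4-5) are considered,
# so expert 6 (score 0.10) on the masked rank 3 can never be selected.
```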
def topk_softmax_with_capacity(
logits: torch.Tensor,
topk: int,
......@@ -235,6 +320,8 @@ def topk_softmax_with_capacity(
pad_to_capacity: bool = False,
drop_policy: str = "probs",
use_pre_softmax: bool = False,
moe_router_topk_limited_devices: int = None,
moe_router_topk_scaling_factor: float = None,
deterministic_mode: bool = False,
):
"""Apply capacity and padding to the top-k selection.
......@@ -247,6 +334,12 @@ def topk_softmax_with_capacity(
drop_policy (str): The policy to drop tokens. Can be either "probs" or "position".
If "prob", the tokens with the lowest probabilities will be dropped.
If "position", tokens at the end of each batch will be dropped.
use_pre_softmax (bool): Whether to apply softmax before top-k selection.
moe_router_topk_limited_devices (int): Number of expert parallel ranks to consider for
each token during routing. None means no device limitation.
moe_router_topk_scaling_factor (float): Scaling factor for the routing score in top-k
selection; only takes effect when use_pre_softmax is enabled.
deterministic_mode (bool): Deprecated.
Returns:
Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
- routing_probs (torch.Tensor): A tensor of shape [num_tokens, num_experts] containing
......@@ -255,7 +348,7 @@ def topk_softmax_with_capacity(
indicating which experts were selected for each token. True values represent
the selected experts.
- tokens_per_expert (torch.Tensor): A tensor of shape [num_experts] containing
the number of local tokens assigned to each expert.
the number of local tokens assigned to each expert before dropping and padding.
"""
assert logits.dim() == 2, f"Expected 2D logits [num_tokens, num_experts], got {logits.dim()}."
num_tokens = logits.shape[0]
......@@ -263,14 +356,32 @@ def topk_softmax_with_capacity(
if use_pre_softmax:
# Pre softmax
scores = torch.softmax(logits, dim=-1, dtype=torch.float32).type_as(logits)
probs, top_indices = torch.topk(scores, k=topk, dim=1)
if moe_router_topk_limited_devices:
probs, top_indices = device_limited_topk(
scores, topk, num_tokens, num_experts, moe_router_topk_limited_devices
)
else:
probs, top_indices = torch.topk(scores, k=topk, dim=1)
# Scale the probs by the routing-score scaling factor, if configured.
if moe_router_topk_scaling_factor:
probs = probs * moe_router_topk_scaling_factor
else:
# Post softmax
if topk == 1:
# Softmax must be applied before the top-k selection when k is 1,
# since softmax over a [num_tokens, 1] tensor yields a zero gradient.
raise ValueError("Please use --moe-router-pre-softmax when topk is 1.")
scores, top_indices = torch.topk(logits, k=topk, dim=1)
assert (
moe_router_topk_scaling_factor is None
), "moe_router_topk_scaling_factor is not supported with post-softmax"
if moe_router_topk_limited_devices:
scores, top_indices = device_limited_topk(
logits, topk, num_tokens, num_experts, moe_router_topk_limited_devices
)
else:
scores, top_indices = torch.topk(logits, k=topk, dim=1)
probs = torch.softmax(scores, dim=-1, dtype=torch.float32).type_as(logits)
# TODO Try using element-wise operations instead of scatter?
......
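A standalone sketch contrasting the two branches above on toy logits (capacity and token dropping omitted); note that the scaling factor is only applied on the pre-softmax path:

```python
import torch

logits = torch.tensor([[2.0, 1.0, 0.5, -1.0]])
topk, scaling = 2, 1.5  # scaling mirrors --moe-router-topk-scaling-factor (pre-softmax only)

# Pre-softmax: softmax over all experts first, then take top-k of the full distribution.
scores = torch.softmax(logits, dim=-1)
pre_probs, pre_idx = torch.topk(scores, k=topk, dim=-1)
pre_probs = pre_probs * scaling

# Post-softmax: take the top-k logits first, then softmax over only the selected k.
top_logits, post_idx = torch.topk(logits, k=topk, dim=-1)
post_probs = torch.softmax(top_logits, dim=-1)

print(pre_probs, pre_idx)    # scaled slice of the full softmax; rows need not sum to 1
print(post_probs, post_idx)  # renormalized over the k selected experts; rows sum to 1
```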
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
from abc import ABC, abstractmethod
from functools import partial
from typing import Callable
import torch
......@@ -10,6 +12,7 @@ from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.moe.moe_utils import (
MoEAuxLossAutoScaler,
save_to_aux_losses_tracker,
sequence_load_balancing_loss_func,
sinkhorn,
switch_load_balancing_loss_func,
topk_softmax_with_capacity,
......@@ -142,7 +145,7 @@ class TopKRouter(Router):
Returns:
probs (torch.Tensor): The probabilities of token to experts assignment.
indices (torch.Tensor): The mask of token to experts assignment.
routing_map (torch.Tensor): The mask of token to experts assignment.
"""
probs, routing_map, tokens_per_expert = topk_softmax_with_capacity(
logits,
......@@ -151,33 +154,61 @@ class TopKRouter(Router):
pad_to_capacity=self.config.moe_pad_expert_input_to_capacity,
drop_policy=self.config.moe_token_drop_policy,
use_pre_softmax=self.config.moe_router_pre_softmax,
moe_router_topk_limited_devices=self.config.moe_router_topk_limited_devices,
moe_router_topk_scaling_factor=self.config.moe_router_topk_scaling_factor,
deterministic_mode=self.config.deterministic_mode,
)
if self.training:
# Apply load balancing loss
scores = torch.softmax(logits, dim=-1, dtype=torch.float32)
probs = self.apply_load_balancing_loss(scores, tokens_per_expert, activation=probs)
aux_loss_func = partial(
switch_load_balancing_loss_func,
probs=scores,
tokens_per_expert=tokens_per_expert,
topk=self.topk,
)
probs = self.apply_load_balancing_loss(
activation=probs, load_balancing_loss_func=aux_loss_func
)
return probs, routing_map
def apply_load_balancing_loss(
self,
probs: torch.Tensor,
num_local_tokens_per_expert: torch.Tensor,
activation: torch.Tensor,
):
"""Applies auxiliary loss to the MoE layer.
Args:
probs (torch.Tensor): The probs output by the router for each token.
[num_tokens, num_experts]
num_local_tokens_per_expert (torch.Tensor): The number of tokens per expert.
[num_experts]
activation (torch.Tensor): The activation tensor to attach the gradient function to.
Returns:
torch.Tensor: The activation tensor with the attached gradient function.
"""
def seq_aux_loss_load_balancing(self, logits: torch.Tensor, bsz: int, seq_length: int):
"""Apply sequence-level auxiliary-loss-based load balancing to the logits tensor."""
probs, routing_map, tokens_per_expert = topk_softmax_with_capacity(
logits,
self.topk,
capacity_factor=self.config.moe_expert_capacity_factor,
pad_to_capacity=self.config.moe_pad_expert_input_to_capacity,
drop_policy=self.config.moe_token_drop_policy,
use_pre_softmax=self.config.moe_router_pre_softmax,
moe_router_topk_limited_devices=self.config.moe_router_topk_limited_devices,
moe_router_topk_scaling_factor=self.config.moe_router_topk_scaling_factor,
deterministic_mode=self.config.deterministic_mode,
)
if self.training:
scores = torch.softmax(logits, dim=-1, dtype=torch.float32)
aux_loss_func = partial(
sequence_load_balancing_loss_func,
probs=scores,
routing_map=routing_map,
tokens_per_expert=tokens_per_expert,
batch_size=bsz,
seq_length=seq_length,
topk=self.topk,
)
probs = self.apply_load_balancing_loss(
activation=probs, load_balancing_loss_func=aux_loss_func
)
return probs, routing_map
def apply_load_balancing_loss(
self, activation: torch.Tensor, load_balancing_loss_func: Callable
):
"""Calculate auxiliary loss, attach gradient function to activation and add to logging."""
moe_aux_loss_coeff = self.config.moe_aux_loss_coeff
sequence_partition_group = None
if self.config.moe_token_dispatcher_type == "alltoall_seq":
......@@ -186,12 +217,8 @@ class TopKRouter(Router):
else:
sequence_partition_group = parallel_state.get_tensor_and_context_parallel_group()
aux_loss = switch_load_balancing_loss_func(
probs,
num_local_tokens_per_expert,
self.topk,
moe_aux_loss_coeff,
sequence_partition_group=sequence_partition_group,
aux_loss = load_balancing_loss_func(
moe_aux_loss_coeff=moe_aux_loss_coeff, sequence_partition_group=sequence_partition_group
)
save_to_aux_losses_tracker(
"load_balancing_loss",
......@@ -257,6 +284,7 @@ class TopKRouter(Router):
routing_map (torch.Tensor): The mapping of token to experts assignment,
with shape [num_tokens, num_experts].
"""
seq_length, bsz = logits.shape[:2]
logits = logits.view(-1, self.config.num_moe_experts)
# Apply Z-Loss
......@@ -270,6 +298,8 @@ class TopKRouter(Router):
scores, routing_map = self.sinkhorn_load_balancing(logits)
elif self.routing_type == "aux_loss":
scores, routing_map = self.aux_loss_load_balancing(logits)
elif self.routing_type == "seq_aux_loss":
scores, routing_map = self.seq_aux_loss_load_balancing(logits, bsz, seq_length)
elif self.routing_type == "none":
# A naive top-k routing without load balancing
scores, routing_map, _ = topk_softmax_with_capacity(
......@@ -279,6 +309,7 @@ class TopKRouter(Router):
pad_to_capacity=self.config.moe_pad_expert_input_to_capacity,
drop_policy=self.config.moe_token_drop_policy,
use_pre_softmax=self.config.moe_router_pre_softmax,
moe_router_topk_scaling_factor=self.config.moe_router_topk_scaling_factor,
deterministic_mode=self.config.deterministic_mode,
)
else:
......@@ -293,12 +324,10 @@ class TopKRouter(Router):
Args:
input (torch.Tensor): Input tensor.
"""
self.hidden = input.shape[-1]
# Apply input jitter
input = self.apply_input_jitter(input)
logits = self.gating(input)
logits = logits.view(-1, self.config.num_moe_experts)
scores, routing_map = self.routing(logits)
......
......@@ -17,8 +17,7 @@ from megatron.core.tensor_parallel.mappings import (
reduce_from_tensor_model_parallel_region,
reduce_scatter_to_sequence_parallel_region,
)
from megatron.core.transformer.mlp import MLP
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.utils import is_torch_min_version, make_sharded_tensor_for_checkpoint
......@@ -32,15 +31,15 @@ class SharedExpertMLP(MLP):
# The shared experts are scheduled into this stream to be overlapped with the dispatcher.
stream = None
def __init__(self, config: TransformerConfig, spec: ModuleSpec):
def __init__(self, config: TransformerConfig, submodules: MLPSubmodules, gate: bool):
config = deepcopy(config)
assert config.add_bias_linear == False, (
"bias is not supported in the shared experts, please set '--disable-bias-linear' instead."
)
config.ffn_hidden_size = config.moe_shared_expert_intermediate_size
super().__init__(config=config, submodules=spec.submodules)
super().__init__(config=config, submodules=submodules)
self.use_shared_expert_gate = spec.params.get("gate", False)
self.use_shared_expert_gate = gate
if self.use_shared_expert_gate:
# TODO: Add support for GPU initialization, which requires updating the golden values.
self.gate_weight = torch.nn.Parameter(torch.empty((1, self.config.hidden_size)))
......
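For context, a hedged sketch of how a `[1, hidden_size]` gate weight of this shape is typically applied to the shared-expert output (illustration only; the actual behavior is defined in `SharedExpertMLP.forward`):

```python
# Hedged sketch: a scalar sigmoid gate per token modulating the shared-expert output.
import torch
import torch.nn.functional as F

hidden_size, num_tokens = 8, 4
gate_weight = torch.nn.Parameter(torch.empty(1, hidden_size))
torch.nn.init.normal_(gate_weight, std=0.02)

hidden_states = torch.randn(num_tokens, hidden_size)
shared_expert_out = torch.randn(num_tokens, hidden_size)  # stand-in for the shared MLP output

gate_score = torch.sigmoid(F.linear(hidden_states, gate_weight))  # [num_tokens, 1]
gated_out = shared_expert_out * gate_score                        # broadcast scalar gate
print(gated_out.shape)  # torch.Size([4, 8])
```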
......@@ -576,7 +576,10 @@ class TransformerBlock(MegatronModule):
non_homogeneous_layers = metadata is not None and metadata.get(
'non_homogeneous_layers', False
)
if self.config.num_moe_experts is not None:
if isinstance(self.config.moe_layer_freq, int):
if self.config.moe_layer_freq > 1:
non_homogeneous_layers = True
elif isinstance(self.config.moe_layer_freq, list):
non_homogeneous_layers = True
sharded_state_dict = {}
......
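A standalone restatement of just the MoE part of the check above (the metadata-driven `non_homogeneous_layers` case is ignored here); `has_non_homogeneous_layers` is a hypothetical helper:

```python
# The layer layout is non-homogeneous whenever MoE layers do not appear at every
# layer: an integer frequency greater than 1 or an explicit per-layer pattern.
def has_non_homogeneous_layers(num_moe_experts, moe_layer_freq) -> bool:
    if num_moe_experts is None:
        return False
    if isinstance(moe_layer_freq, int):
        return moe_layer_freq > 1
    return isinstance(moe_layer_freq, list)

assert has_non_homogeneous_layers(8, 2) is True
assert has_non_homogeneous_layers(8, 1) is False
assert has_non_homogeneous_layers(8, [1, 0, 1, 0]) is True
assert has_non_homogeneous_layers(None, 2) is False
```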