Commit 99a0c39e authored by xingjinliang

Sync latest code

parent 50fe58fa
Pipeline #2152 passed with stage
......@@ -57,8 +57,11 @@ Megatron-Core offers rich parallelism mappings, combining Expert Parallelism wit
| --expert-tensor-parallel-size | Degree of tensor model parallelism for the expert layers. Defaults to the same value as --tensor-model-parallel-size. |
| --moe-layer-freq | Frequency between MoE layers and Dense layers. Accepts either: 1) An integer N for a 1:N ratio (one expert layer for every N-1 dense layers), 2) A string "N" for the same ratio, or 3) A string with a Python list expression for custom patterns, e.g. `([1]*3+[0]*1)*3` gives [1,1,1,0,1,1,1,0,1,1,1,0] where 1=expert layer and 0=dense layer. Examples: `([0]+[1]*23)` for 1 dense layer followed by 23 expert layers, `([1]*3+[0]*2)*2` for three expert layers followed by two dense layers, repeated twice. Default is 1. See the expansion sketch below this table. |
| --moe-grouped-gemm | When there are multiple experts per rank, launch multiple local GEMM kernels in multiple streams to improve the utilization and performance with GroupedLinear in TransformerEngine. |
| --moe-router-load-balancing-type | Determines the load balancing strategy for the router. "aux_loss" corresponds to the load balancing loss used in GShard and SwitchTransformer, "sinkhorn" corresponds to the balancing algorithm used in S-BASE, and "none" implies no load balancing. The default is "aux_loss". |
| --moe-router-load-balancing-type | Determines the load balancing strategy for the router. "aux_loss" corresponds to the load balancing loss used in GShard and SwitchTransformer; "seq_aux_loss" corresponds to the load balancing loss used in DeepSeekV2, which computes the loss for each individual sample; "sinkhorn" corresponds to the balancing algorithm used in S-BASE, and "none" implies no load balancing. The default is "aux_loss". |
| --moe-router-topk | Number of experts to route to for each token. The default is 2. |
| --moe-router-pre-softmax | Enable pre-softmax routing for MoE, which means softmax is before the top-k selection. By default, softmax is done after top-k. |
| --moe-router-topk-limited-devices | Number of expert parallel ranks to consider for each token during routing. Performs top-k routing on a subset of expert parallel ranks by first selecting N ranks for each token, then conducting top-k selection among the experts on these ranks. Default is None, which means no device limitation. |
| --moe-router-topk-scaling-factor | Scaling factor for the routing score in top-k selection; only takes effect when --moe-router-pre-softmax is enabled. Defaults to None, which means no scaling. |
| --moe-aux-loss-coeff | Scaling coefficient for the aux loss: a starting value of 1e-2 is recommended. Default is 0.0. |
| --moe-z-loss-coeff | Scaling coefficient for the z-loss: a starting value of 1e-3 is recommended. Default is None. |
| --moe-input-jitter-eps | Add noise to the input tensor by applying jitter with a specified epsilon value. Default is None. |
......
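For reference, a minimal standalone sketch (not Megatron code; `expand_moe_layer_freq` is a hypothetical helper) of how the integer and list-expression forms of `--moe-layer-freq` expand into a per-layer MoE pattern:

```python
# Standalone sketch, assuming the rules described in the table above.
def expand_moe_layer_freq(moe_layer_freq, num_layers):
    if isinstance(moe_layer_freq, int):
        # Integer N: an expert layer every N layers, starting from the first.
        return [1 if (i % moe_layer_freq == 0) else 0 for i in range(num_layers)]
    # String form: evaluate the Python list expression, e.g. "([1]*3+[0]*1)*3".
    pattern = eval(moe_layer_freq)
    assert len(pattern) == num_layers, "pattern length must match the layer count"
    return pattern

print(expand_moe_layer_freq("([1]*3+[0]*1)*3", 12))  # [1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0]
print(expand_moe_layer_freq(2, 6))                   # [1, 0, 1, 0, 1, 0]
```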
......@@ -9,15 +9,13 @@ import torch
from megatron.core import parallel_state, tensor_parallel
from megatron.core.transformer.mlp import MLPSubmodules
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.moe.experts import GroupedMLP, SequentialMLP, TEGroupedMLP
from megatron.core.transformer.moe.legacy_a2a_token_dispatcher import MoEAlltoAllSEQTokenDispatcher
from megatron.core.transformer.moe.router import TopKRouter
from megatron.core.transformer.moe.shared_experts import SharedExpertMLP
from megatron.core.transformer.moe.token_dispatcher import (
MoEAllGatherTokenDispatcher,
MoEAlltoAllTokenDispatcher,
)
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.spec_utils import ModuleSpec, build_module
from megatron.core.transformer.transformer_config import TransformerConfig
......@@ -89,20 +87,6 @@ class MoELayer(BaseMoELayer):
# Initialize router
self.router = TopKRouter(config=self.config)
# Initialize experts
if self.config.moe_grouped_gemm:
if isinstance(self.submodules.experts, MLPSubmodules):
self.experts = TEGroupedMLP(
self.num_local_experts, self.config, self.submodules.experts
)
else:
self.experts = GroupedMLP(self.num_local_experts, self.config)
else:
assert isinstance(self.submodules.experts, MLPSubmodules)
self.experts = SequentialMLP(
self.num_local_experts, self.config, self.submodules.experts
)
# Initialize token dispatcher
if config.moe_token_dispatcher_type == "allgather":
self.token_dispatcher = MoEAllGatherTokenDispatcher(
......@@ -121,9 +105,12 @@ class MoELayer(BaseMoELayer):
f"Unsupported token dispatcher type: {config.moe_token_dispatcher_type}"
)
# Initialize experts
self.experts = build_module(self.submodules.experts, self.num_local_experts, self.config)
# Initialize shared experts
if self.use_shared_expert:
self.shared_experts = SharedExpertMLP(self.config, self.submodules.shared_experts)
self.shared_experts = build_module(self.submodules.shared_experts, config=self.config)
if self.shared_expert_overlap:
self.token_dispatcher.set_shared_experts(self.shared_experts)
......
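The hunk above drops the explicit GroupedMLP/TEGroupedMLP/SequentialMLP branching and instead builds both the experts and the shared experts from their module specs via `build_module`. A simplified sketch of that spec-driven pattern, using a pared-down `ModuleSpec` (illustration only, not the real `spec_utils` API):

```python
# Minimal sketch of spec-driven module construction; names mirror
# megatron.core.transformer.spec_utils but the details are simplified.
from dataclasses import dataclass, field
from typing import Any, Union

@dataclass
class ModuleSpec:
    module: type                  # class to instantiate, e.g. an experts or shared-experts MLP
    params: dict = field(default_factory=dict)
    submodules: Any = None

def build_module(spec_or_module: Union[ModuleSpec, type], *args, **kwargs):
    # The spec already encodes which class to build, so MoELayer no longer
    # has to branch on config.moe_grouped_gemm itself.
    if isinstance(spec_or_module, type):
        return spec_or_module(*args, **kwargs)
    if spec_or_module.submodules is not None:
        kwargs["submodules"] = spec_or_module.submodules
    return spec_or_module.module(*args, **spec_or_module.params, **kwargs)

# e.g. experts_spec = ModuleSpec(module=SequentialMLP, submodules=mlp_submodules)
#      experts = build_module(experts_spec, num_local_experts, config)
```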
......@@ -56,6 +56,45 @@ def switch_load_balancing_loss_func(
return aux_loss
def sequence_load_balancing_loss_func(
probs: torch.Tensor,
routing_map: torch.Tensor,
tokens_per_expert: torch.Tensor,
batch_size: int,
seq_length: int,
topk: int,
moe_aux_loss_coeff: float,
sequence_partition_group=None,
):
"""
Calculate the sequence-level auxiliary loss by computing the loss for each individual sample.
Refer to the DeepSeek-V2 huggingface repo
(https://huggingface.co/deepseek-ai/DeepSeek-V2) for details.
"""
num_sub_sequence = 1
# If the sequence is partitioned by certain parallelism strategies like Sequence Parallelism
# or Context Parallelism, compute the gradient of the auxiliary loss with respect to the full
# sequence.
if sequence_partition_group is not None:
# We can keep `aggregated_probs_per_expert` local since we don't need the gradient for
# `tokens_per_expert`, saving one allreduce operation for `aggregated_probs_per_expert`.
num_sub_sequence = torch.distributed.get_world_size(sequence_partition_group)
torch.distributed.all_reduce(tokens_per_expert, group=sequence_partition_group)
assert num_sub_sequence == 1, "Sequence-level aux loss is not supported when the sequence is partitioned"
num_experts = probs.shape[1]
probs_for_aux_loss = probs.view(seq_length, batch_size, -1)
cost_coeff = routing_map.view(seq_length, batch_size, -1).sum(dim=0).float()
cost_coeff.div_(seq_length * topk / num_experts)
seq_aux_loss = (cost_coeff * probs_for_aux_loss.mean(dim=0)).sum(dim=1).mean()
seq_aux_loss *= moe_aux_loss_coeff
return seq_aux_loss
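A toy, standalone reproduction of the sequence-level loss computed above (random data, no sequence partitioning), useful for checking the scale of the result:

```python
import torch

S, B, E, topk, coeff = 4, 2, 4, 2, 1e-2
probs = torch.softmax(torch.randn(S * B, E), dim=-1)           # router probs, [num_tokens, E]
top_idx = torch.topk(probs, k=topk, dim=-1).indices
routing_map = torch.zeros(S * B, E).scatter_(1, top_idx, 1.0)  # 1.0 where an expert is selected

# Same arithmetic as sequence_load_balancing_loss_func with no sequence partitioning.
cost_coeff = routing_map.view(S, B, E).sum(dim=0) / (S * topk / E)
seq_aux_loss = (cost_coeff * probs.view(S, B, E).mean(dim=0)).sum(dim=1).mean() * coeff
print(seq_aux_loss)  # equals coeff exactly only under perfectly balanced routing
```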
def z_loss_func(logits, z_loss_coeff):
"""Encourages the router's logits to remain small to enhance stability.
Please refer to the ST-MoE paper (https://arxiv.org/pdf/2202.08906.pdf) for details.
......@@ -108,7 +147,7 @@ def get_capacity(num_tokens: int, num_experts: int, capacity_factor: float, min_
class MoEAuxLossAutoScaler(torch.autograd.Function):
"""An AutoScaler that compute and scales the grad for auxiliary loss."""
"""An AutoScaler that triggers the backward pass and scales the grad for auxiliary loss."""
main_loss_backward_scale: torch.Tensor = torch.tensor(1.0)
......@@ -228,6 +267,52 @@ def sort_chunks_by_idxs(input: torch.Tensor, split_sizes: torch.Tensor, sorted_i
return output
def device_limited_topk(
scores: torch.Tensor,
topk: int,
num_tokens: int,
num_experts: int,
moe_router_topk_limited_devices: int,
):
"""Perform top-k routing on a subset of expert parallel ranks.
Selects N ranks for each token, then conducts top-k selection among experts on these devices.
See DeepSeek-V2 technical report (https://arxiv.org/pdf/2405.04434) for details.
Args:
scores (torch.Tensor): Softmax scores from the router.
topk (int): The number of experts to select for each token.
num_tokens (int): The number of tokens.
num_experts (int): The number of experts.
moe_router_topk_limited_devices (int): Number of expert parallel ranks to consider for
each token during routing. None means no device limitation.
Returns:
Tuple[torch.Tensor, torch.Tensor]: Probs and indices tensor.
"""
# Organize the experts into groups
num_group = (
parallel_state.get_expert_model_parallel_world_size()
) # num_group equals the expert parallel size
group_scores = scores.view(num_tokens, num_group, -1).max(dim=-1).values
group_idx = torch.topk(group_scores, k=moe_router_topk_limited_devices, dim=-1, sorted=False)[1]
group_mask = torch.zeros_like(group_scores)
group_mask.scatter_(1, group_idx, 1)
# Mask the experts based on selection groups
score_mask = (
group_mask.unsqueeze(-1)
.expand(num_tokens, num_group, num_experts // num_group)
.reshape(num_tokens, -1)
)
masked_scores = scores.masked_fill(~score_mask.bool(), 0.0)
probs, top_indices = torch.topk(masked_scores, k=topk, dim=-1)
return probs, top_indices
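A standalone toy run of the same group-limited selection with hypothetical sizes (1 token, 8 experts spread over 4 expert-parallel ranks, top-2 routing limited to the 2 best ranks):

```python
import torch

scores = torch.tensor([[0.30, 0.05, 0.02, 0.01, 0.25, 0.20, 0.10, 0.07]])  # [1, 8]
num_group, limited, topk = 4, 2, 2

group_scores = scores.view(1, num_group, -1).max(dim=-1).values        # best expert per rank
group_idx = torch.topk(group_scores, k=limited, dim=-1, sorted=False)[1]
group_mask = torch.zeros_like(group_scores).scatter_(1, group_idx, 1)
score_mask = group_mask.unsqueeze(-1).expand(1, num_group, 2).reshape(1, -1)
masked = scores.masked_fill(~score_mask.bool(), 0.0)
print(torch.topk(masked, k=topk, dim=-1))
# Experts 0 and 4 win: only rank 0 (experts 0-1) and rank 2 (experts 4-5) are considered,
# so expert 6 (score 0.10) on the masked rank 3 can never be selected.
```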
def topk_softmax_with_capacity(
logits: torch.Tensor,
topk: int,
......@@ -235,6 +320,8 @@ def topk_softmax_with_capacity(
pad_to_capacity: bool = False,
drop_policy: str = "probs",
use_pre_softmax: bool = False,
moe_router_topk_limited_devices: int = None,
moe_router_topk_scaling_factor: float = None,
deterministic_mode: bool = False,
):
"""Apply capacity and padding to the top-k selection.
......@@ -247,6 +334,12 @@ def topk_softmax_with_capacity(
drop_policy (str): The policy to drop tokens. Can be either "probs" or "position".
If "prob", the tokens with the lowest probabilities will be dropped.
If "position", tokens at the end of each batch will be dropped.
use_pre_softmax (bool): Whether to apply softmax before top-k selection.
moe_router_topk_limited_devices (int): Number of expert parallel ranks to consider for
each token during routing. None means no device limitation.
moe_router_topk_scaling_factor (float): Scaling factor for the routing score in top-k
selection; only takes effect when use_pre_softmax is enabled.
deterministic_mode (bool): Deprecated.
Returns:
Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
- routing_probs (torch.Tensor): A tensor of shape [num_tokens, num_experts] containing
......@@ -255,7 +348,7 @@ def topk_softmax_with_capacity(
indicating which experts were selected for each token. True values represent
the selected experts.
- tokens_per_expert (torch.Tensor): A tensor of shape [num_experts] containing
the number of local tokens assigned to each expert.
the number of local tokens assigned to each expert before dropping and padding.
"""
assert logits.dim() == 2, f"Expected 2D logits [num_tokens, num_experts], got {logits.dim()}."
num_tokens = logits.shape[0]
......@@ -263,14 +356,32 @@ def topk_softmax_with_capacity(
if use_pre_softmax:
# Pre softmax
scores = torch.softmax(logits, dim=-1, dtype=torch.float32).type_as(logits)
probs, top_indices = torch.topk(scores, k=topk, dim=1)
if moe_router_topk_limited_devices:
probs, top_indices = device_limited_topk(
scores, topk, num_tokens, num_experts, moe_router_topk_limited_devices
)
else:
probs, top_indices = torch.topk(scores, k=topk, dim=1)
# Scale the probs by the routing-score scaling factor, if configured.
if moe_router_topk_scaling_factor:
probs = probs * moe_router_topk_scaling_factor
else:
# Post softmax
if topk == 1:
# Softmax must be applied before the top-k selection when k is 1,
# since softmax over a [num_tokens, 1] tensor yields a zero gradient.
raise ValueError("Please use --moe-router-pre-softmax when topk is 1.")
scores, top_indices = torch.topk(logits, k=topk, dim=1)
assert (
moe_router_topk_scaling_factor is None
), "moe_router_topk_scaling_factor is not supported with post-softmax"
if moe_router_topk_limited_devices:
scores, top_indices = device_limited_topk(
logits, topk, num_tokens, num_experts, moe_router_topk_limited_devices
)
else:
scores, top_indices = torch.topk(logits, k=topk, dim=1)
probs = torch.softmax(scores, dim=-1, dtype=torch.float32).type_as(logits)
# TODO Try using element-wise operations instead of scatter?
......
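A standalone sketch contrasting the two branches above on toy logits (capacity and token dropping omitted); note that the scaling factor is only applied on the pre-softmax path:

```python
import torch

logits = torch.tensor([[2.0, 1.0, 0.5, -1.0]])
topk, scaling = 2, 1.5  # scaling mirrors --moe-router-topk-scaling-factor (pre-softmax only)

# Pre-softmax: softmax over all experts first, then take top-k of the full distribution.
scores = torch.softmax(logits, dim=-1)
pre_probs, pre_idx = torch.topk(scores, k=topk, dim=-1)
pre_probs = pre_probs * scaling

# Post-softmax: take the top-k logits first, then softmax over only the selected k.
top_logits, post_idx = torch.topk(logits, k=topk, dim=-1)
post_probs = torch.softmax(top_logits, dim=-1)

print(pre_probs, pre_idx)    # scaled slice of the full softmax; rows need not sum to 1
print(post_probs, post_idx)  # renormalized over the k selected experts; rows sum to 1
```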
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
from abc import ABC, abstractmethod
from functools import partial
from typing import Callable
import torch
......@@ -10,6 +12,7 @@ from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.moe.moe_utils import (
MoEAuxLossAutoScaler,
save_to_aux_losses_tracker,
sequence_load_balancing_loss_func,
sinkhorn,
switch_load_balancing_loss_func,
topk_softmax_with_capacity,
......@@ -142,7 +145,7 @@ class TopKRouter(Router):
Returns:
probs (torch.Tensor): The probabilities of token to experts assignment.
indices (torch.Tensor): The mask of token to experts assignment.
routing_map (torch.Tensor): The mask of token to experts assignment.
"""
probs, routing_map, tokens_per_expert = topk_softmax_with_capacity(
logits,
......@@ -151,33 +154,61 @@ class TopKRouter(Router):
pad_to_capacity=self.config.moe_pad_expert_input_to_capacity,
drop_policy=self.config.moe_token_drop_policy,
use_pre_softmax=self.config.moe_router_pre_softmax,
moe_router_topk_limited_devices=self.config.moe_router_topk_limited_devices,
moe_router_topk_scaling_factor=self.config.moe_router_topk_scaling_factor,
deterministic_mode=self.config.deterministic_mode,
)
if self.training:
# Apply load balancing loss
scores = torch.softmax(logits, dim=-1, dtype=torch.float32)
probs = self.apply_load_balancing_loss(scores, tokens_per_expert, activation=probs)
aux_loss_func = partial(
switch_load_balancing_loss_func,
probs=scores,
tokens_per_expert=tokens_per_expert,
topk=self.topk,
)
probs = self.apply_load_balancing_loss(
activation=probs, load_balancing_loss_func=aux_loss_func
)
return probs, routing_map
def apply_load_balancing_loss(
self,
probs: torch.Tensor,
num_local_tokens_per_expert: torch.Tensor,
activation: torch.Tensor,
):
"""Applies auxiliary loss to the MoE layer.
Args:
probs (torch.Tensor): The probs output by the router for each token.
[num_tokens, num_experts]
num_local_tokens_per_expert (torch.Tensor): The number of tokens per expert.
[num_experts]
activation (torch.Tensor): The activation tensor to attach the gradient function to.
Returns:
torch.Tensor: The activation tensor with the attached gradient function.
"""
def seq_aux_loss_load_balancing(self, logits: torch.Tensor, bsz: int, seq_length: int):
"""Apply sequence-level auxiliary-loss-based load balancing to the logits tensor."""
probs, routing_map, tokens_per_expert = topk_softmax_with_capacity(
logits,
self.topk,
capacity_factor=self.config.moe_expert_capacity_factor,
pad_to_capacity=self.config.moe_pad_expert_input_to_capacity,
drop_policy=self.config.moe_token_drop_policy,
use_pre_softmax=self.config.moe_router_pre_softmax,
moe_router_topk_limited_devices=self.config.moe_router_topk_limited_devices,
moe_router_topk_scaling_factor=self.config.moe_router_topk_scaling_factor,
deterministic_mode=self.config.deterministic_mode,
)
if self.training:
scores = torch.softmax(logits, dim=-1, dtype=torch.float32)
aux_loss_func = partial(
sequence_load_balancing_loss_func,
probs=scores,
routing_map=routing_map,
tokens_per_expert=tokens_per_expert,
batch_size=bsz,
seq_length=seq_length,
topk=self.topk,
)
probs = self.apply_load_balancing_loss(
activation=probs, load_balancing_loss_func=aux_loss_func
)
return probs, routing_map
def apply_load_balancing_loss(
self, activation: torch.Tensor, load_balancing_loss_func: Callable
):
"""Calculate auxiliary loss, attach gradient function to activation and add to logging."""
moe_aux_loss_coeff = self.config.moe_aux_loss_coeff
sequence_partition_group = None
if self.config.moe_token_dispatcher_type == "alltoall_seq":
......@@ -186,12 +217,8 @@ class TopKRouter(Router):
else:
sequence_partition_group = parallel_state.get_tensor_and_context_parallel_group()
aux_loss = switch_load_balancing_loss_func(
probs,
num_local_tokens_per_expert,
self.topk,
moe_aux_loss_coeff,
sequence_partition_group=sequence_partition_group,
aux_loss = load_balancing_loss_func(
moe_aux_loss_coeff=moe_aux_loss_coeff, sequence_partition_group=sequence_partition_group
)
save_to_aux_losses_tracker(
"load_balancing_loss",
......@@ -257,6 +284,7 @@ class TopKRouter(Router):
routing_map (torch.Tensor): The mapping of token to experts assignment,
with shape [num_tokens, num_experts].
"""
seq_length, bsz = logits.shape[:2]
logits = logits.view(-1, self.config.num_moe_experts)
# Apply Z-Loss
......@@ -270,6 +298,8 @@ class TopKRouter(Router):
scores, routing_map = self.sinkhorn_load_balancing(logits)
elif self.routing_type == "aux_loss":
scores, routing_map = self.aux_loss_load_balancing(logits)
elif self.routing_type == "seq_aux_loss":
scores, routing_map = self.seq_aux_loss_load_balancing(logits, bsz, seq_length)
elif self.routing_type == "none":
# A naive top-k routing without load balancing
scores, routing_map, _ = topk_softmax_with_capacity(
......@@ -279,6 +309,7 @@ class TopKRouter(Router):
pad_to_capacity=self.config.moe_pad_expert_input_to_capacity,
drop_policy=self.config.moe_token_drop_policy,
use_pre_softmax=self.config.moe_router_pre_softmax,
moe_router_topk_scaling_factor=self.config.moe_router_topk_scaling_factor,
deterministic_mode=self.config.deterministic_mode,
)
else:
......@@ -293,12 +324,10 @@ class TopKRouter(Router):
Args:
input (torch.Tensor): Input tensor.
"""
self.hidden = input.shape[-1]
# Apply input jitter
input = self.apply_input_jitter(input)
logits = self.gating(input)
logits = logits.view(-1, self.config.num_moe_experts)
scores, routing_map = self.routing(logits)
......
......@@ -17,8 +17,7 @@ from megatron.core.tensor_parallel.mappings import (
reduce_from_tensor_model_parallel_region,
reduce_scatter_to_sequence_parallel_region,
)
from megatron.core.transformer.mlp import MLP
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.utils import is_torch_min_version, make_sharded_tensor_for_checkpoint
......@@ -32,15 +31,15 @@ class SharedExpertMLP(MLP):
# The shared experts are scheduled into this stream to be overlapped with the dispatcher.
stream = None
def __init__(self, config: TransformerConfig, spec: ModuleSpec):
def __init__(self, config: TransformerConfig, submodules: MLPSubmodules, gate: bool):
config = deepcopy(config)
assert config.add_bias_linear == False, (
"bias is not supported in the shared experts, please set '--disable-bias-linear' instead."
)
config.ffn_hidden_size = config.moe_shared_expert_intermediate_size
super().__init__(config=config, submodules=spec.submodules)
super().__init__(config=config, submodules=submodules)
self.use_shared_expert_gate = spec.params.get("gate", False)
self.use_shared_expert_gate = gate
if self.use_shared_expert_gate:
# TODO: Add support for GPU initialization, which requires updating the golden values.
self.gate_weight = torch.nn.Parameter(torch.empty((1, self.config.hidden_size)))
......
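For context, a hedged sketch of how a `[1, hidden_size]` gate weight of this shape is typically applied to the shared-expert output (illustration only; the actual behavior is defined in `SharedExpertMLP.forward`):

```python
# Hedged sketch: a scalar sigmoid gate per token modulating the shared-expert output.
import torch
import torch.nn.functional as F

hidden_size, num_tokens = 8, 4
gate_weight = torch.nn.Parameter(torch.empty(1, hidden_size))
torch.nn.init.normal_(gate_weight, std=0.02)

hidden_states = torch.randn(num_tokens, hidden_size)
shared_expert_out = torch.randn(num_tokens, hidden_size)  # stand-in for the shared MLP output

gate_score = torch.sigmoid(F.linear(hidden_states, gate_weight))  # [num_tokens, 1]
gated_out = shared_expert_out * gate_score                        # broadcast scalar gate
print(gated_out.shape)  # torch.Size([4, 8])
```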
......@@ -576,7 +576,10 @@ class TransformerBlock(MegatronModule):
non_homogeneous_layers = metadata is not None and metadata.get(
'non_homogeneous_layers', False
)
if self.config.num_moe_experts is not None:
if isinstance(self.config.moe_layer_freq, int):
if self.config.moe_layer_freq > 1:
non_homogeneous_layers = True
elif isinstance(self.config.moe_layer_freq, list):
non_homogeneous_layers = True
sharded_state_dict = {}
......
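A standalone restatement of just the MoE part of the check above (the metadata-driven `non_homogeneous_layers` case is ignored here); `has_non_homogeneous_layers` is a hypothetical helper:

```python
# The layer layout is non-homogeneous whenever MoE layers do not appear at every
# layer: an integer frequency greater than 1 or an explicit per-layer pattern.
def has_non_homogeneous_layers(num_moe_experts, moe_layer_freq) -> bool:
    if num_moe_experts is None:
        return False
    if isinstance(moe_layer_freq, int):
        return moe_layer_freq > 1
    return isinstance(moe_layer_freq, list)

assert has_non_homogeneous_layers(8, 2) is True
assert has_non_homogeneous_layers(8, 1) is False
assert has_non_homogeneous_layers(8, [1, 0, 1, 0]) is True
assert has_non_homogeneous_layers(None, 2) is False
```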