Commit 99a0c39e authored by xingjinliang

Sync latest code

parent 50fe58fa
Pipeline #2152 passed
File mode changed from 100755 to 100644 (2 files)
@@ -270,13 +270,12 @@ class _ParamAndGradBucketGroup:
         if self.ddp_config.average_in_collective:
             reduce_op = torch.distributed.ReduceOp.AVG
-        # Stream synchronization logic of the CUDA streams that is
-        # implemented below for the gradient reduction within and across
-        # distributed optimizer instances.
-        # Compute Stream - -------------Gradient Compute-------------------
-        # Comm. Stream   - ------(wait for nccl)-----(wait for nccl)-------
-        # NCCL Stream    - -------RS------            -------AR------
+        # We use the following stream synchronization for the gradient reduction
+        # within and across DistOpt instances.
+        # Compute Stream: -------------Gradient compute-------------------
+        # Comm. Stream:   ------(wait for NCCL)-----(wait for NCCL)-------
+        # NCCL Stream:    -------RS------            -------AR------
         # Use async communications only when overlap_grad_reduce is True.
         async_op = (
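To make the stream diagram in this hunk concrete, here is a minimal sketch (not the Megatron-LM source) of the same pattern: the communication stream first waits for the gradient compute already queued on the default stream, then launches the NCCL collective asynchronously. `grad_buffer`, `comm_stream`, and `group` are illustrative names, and an initialized NCCL process group is assumed.

```python
import torch
import torch.distributed as dist

def reduce_on_comm_stream(grad_buffer: torch.Tensor,
                          comm_stream: torch.cuda.Stream,
                          group: dist.ProcessGroup):
    # Comm. stream blocks until the gradient compute already queued on the
    # default stream has finished (a device-side dependency, not a host sync).
    comm_stream.wait_stream(torch.cuda.default_stream())
    with torch.cuda.stream(comm_stream):
        # The collective kernel is enqueued on comm_stream; async_op=True
        # returns a work handle instead of blocking the host thread.
        handle = dist.all_reduce(grad_buffer, op=dist.ReduceOp.AVG,
                                 group=group, async_op=True)
    return handle
```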
@@ -287,13 +286,13 @@ class _ParamAndGradBucketGroup:
             self.ddp_config.num_distributed_optimizer_instances > 1
             and self.ddp_config.overlap_grad_reduce
         ):
-            # Assign a communication stream if we use partial DP DistOpt and we
-            # need to overlap communication
+            # Assign a communication stream if we have multiple DistOpt instances and we
+            # need to overlap communication.
             stream_context = torch.cuda.stream(self.communication_stream)
             # The RS/AR communication stream needs to wait for the default stream
             # to complete its gradient computation before launching the next
-            # gradient reduction collective
+            # gradient reduction collective.
             self.communication_stream.wait_stream(torch.cuda.default_stream())
         else:
             stream_context = nullcontext()
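The `nullcontext()` fallback above lets the same `with stream_context:` block run either on the dedicated communication stream or on the current default stream. A small sketch of that pattern, with the branch condition reduced to a single hypothetical flag:

```python
from contextlib import nullcontext
import torch

overlap_on_comm_stream = True  # stands in for (num DistOpt instances > 1 and overlap_grad_reduce)
comm_stream = torch.cuda.Stream()

if overlap_on_comm_stream:
    stream_context = torch.cuda.stream(comm_stream)
    # Order comm_stream behind any work already queued on the default stream.
    comm_stream.wait_stream(torch.cuda.default_stream())
else:
    stream_context = nullcontext()

with stream_context:
    ...  # collectives enqueued here target comm_stream (or the default stream)
```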
@@ -314,24 +313,21 @@ class _ParamAndGradBucketGroup:
                         local_data_view,
                         bucket.grad_data,
                         op=reduce_op,
-                        group=self.intra_distributed_optimizer_instance_group,
+                        group=communication_group,
                         async_op=async_op,
                     )
                 else:
                     torch.distributed.all_reduce(
-                        bucket.grad_data,
-                        op=reduce_op,
-                        group=self.data_parallel_group,
-                        async_op=async_op,
+                        bucket.grad_data, op=reduce_op, group=communication_group, async_op=async_op
                     )
-        # When enabling partial DP domain DistOpt, we need to All-Reduce across all partial domains
+        # With multiple DistOpt instances, we need to all-reduce across instances.
         if (
             self.ddp_config.use_distributed_optimizer
             and self.ddp_config.num_distributed_optimizer_instances > 1
         ):
-            # Create a new coalescing facility for the inter partial DP-AllReduce here
+            # Create a new coalescing manager for the inter-instance all-reduce.
             with stream_context, _coalescing_manager(
                 self.inter_distributed_optimizer_instance_group, async_ops=async_op
             ) as cm:
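This hunk is one half of a two-level reduction: a reduce-scatter inside each DistOpt instance, then an all-reduce of each rank's shard across instances. The sketch below illustrates that shape under stated assumptions: `_coalescing_manager` is the private PyTorch helper imported from `torch.distributed.distributed_c10d` that the diff itself uses, while `two_level_grad_reduce`, the group handles, and the shard arithmetic are illustrative, not the Megatron-LM code.

```python
import torch
import torch.distributed as dist
from torch.distributed.distributed_c10d import _coalescing_manager

def two_level_grad_reduce(grad_data: torch.Tensor, intra_group, inter_group,
                          async_op: bool = False):
    # Level 1: reduce-scatter within one DistOpt instance, leaving each rank
    # with its own contiguous shard of the reduced gradient.
    shard_size = grad_data.numel() // dist.get_world_size(group=intra_group)
    rank = dist.get_rank(group=intra_group)
    local_view = grad_data[rank * shard_size : (rank + 1) * shard_size]
    dist.reduce_scatter_tensor(local_view, grad_data, op=dist.ReduceOp.AVG,
                               group=intra_group, async_op=async_op)
    # Level 2: all-reduce the local shard across instances, coalesced so that
    # several buckets could share one NCCL launch.
    with _coalescing_manager(inter_group, async_ops=async_op) as cm:
        dist.all_reduce(local_view, op=dist.ReduceOp.AVG,
                        group=inter_group, async_op=async_op)
    if async_op:
        cm.wait()
```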
@@ -366,13 +362,13 @@ class _ParamAndGradBucketGroup:
         communication call to complete. When ddp_config.overlap_grad_reduce is set to False,
         makes synchronous call.
         """
-        # If overlap_grad_reduce is False, start (and finish) synchronous communication call here.
         self.param_gather_dispatched = False
+        # If overlap_grad_reduce is False, start (and finish) synchronous communication call here.
         if not self.ddp_config.overlap_grad_reduce:
             self.start_grad_sync()
             return
-        # When using partial DP DistOpt, we don't need to sync as we launch comms on a separate
-        # communication stream
+        # When using multiple DistOpt instances, we don't need to sync here as we launch
+        # communications on a separate communication stream.
         if self.ddp_config.num_distributed_optimizer_instances > 1:
             torch.cuda.default_stream().wait_stream(self.communication_stream)
             return
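For the multi-instance path above, this is the only synchronization `finish_grad_sync` needs: the default stream is told to wait on everything enqueued on the communication stream, so a later optimizer step cannot read gradients before the reductions finish. A minimal sketch, with `comm_stream` as an illustrative name:

```python
import torch

def finish_grad_sync_sketch(comm_stream: torch.cuda.Stream):
    # Device-side dependency only: kernels submitted to the default stream
    # after this call wait for work already enqueued on comm_stream, while
    # the host thread continues immediately.
    torch.cuda.default_stream().wait_stream(comm_stream)
```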
File mode changed from 100755 to 100644 (14 files)