Unverified Commit c622bb36 authored by Frank Lee, committed by GitHub

Merge pull request #3915 from FrankLeeeee/update/develop

[sync] update develop with main
parents 34966378 9c88b6cb
#!/bin/bash
set -xe
BASE=$(realpath $(dirname $0))
export RAY_NAMESPACE=admin
export DATA=/data/scratch/chatgpt/prompts.csv
# install requirements
pip install -r ${BASE}/requirements.txt
python ${BASE}/mmmt_prompt.py --prompt_path $DATA --num_makers 2 --num_trainers 2 --trainer_strategy colossalai_gemini --model opt --critic_model opt --pretrain facebook/opt-350m --critic_pretrain facebook/opt-125m --experience_batch_size 4 --train_batch_size 2
......@@ -124,3 +124,6 @@ torchrun --standalone --nproc_per_node=2 ${BASE}/train_prompts.py --prompt_datas
rm -rf ${BASE}/rm_ckpt_gpt.pt
rm -rf ${BASE}/actor_checkpoint_prompts.pt
# 3080 doesn't support P2P, skip this test
# cd ${BASE}/ray && bash test_ci.sh && cd ${BASE}
from .base import MixedPrecisionMixin
from .bf16 import BF16MixedPrecisionMixin
from .fp16 import FP16MixedPrecisionMixin
__all__ = [
'MixedPrecisionMixin',
'FP16MixedPrecisionMixin',
'BF16MixedPrecisionMixin',
]
from abc import ABC, abstractmethod
import torch
from torch import Tensor
class MixedPrecisionMixin(ABC):
"""A helper class for mixed precision training. This mixin is used in mixed precision optimizers.
Attributes:
dtype (torch.dtype): The expected dtype of the gradients.
Examples:
```python
class MyMixedPrecisionOptimizer(OptimizerWrapper):
def __init__(self, optim: Optimizer):
super().__init__(optim)
self.mixed_precision = MixedPrecisionMixin()  # in practice, a concrete subclass such as FP16MixedPrecisionMixin
def backward(self, loss):
loss = self.mixed_precision.pre_backward(loss)
loss.backward()
def backward_by_grad(self, tensor, grad):
grad = self.mixed_precision.pre_backward_by_grad(tensor, grad)
tensor.backward(grad)
def step(self):
if self.mixed_precision.should_skip_step():
self.zero_grad()
return
div_scale = self.mixed_precision.get_grad_div_scale()
# maybe clip grad here
# maybe scale grad here
self.optim.step()
def zero_grad(self):
self.mixed_precision.pre_zero_grad()
return self.optim.zero_grad()
```
"""
dtype: torch.dtype
@abstractmethod
def pre_backward(self, loss: Tensor) -> Tensor:
"""Called before backward.
Args:
loss (Tensor): Loss value.
Returns:
Tensor: Loss value (possibly scaled).
"""
pass
@abstractmethod
def pre_backward_by_grad(self, tensor: Tensor, grad: Tensor) -> Tensor:
"""Called before backward by grad. This is helpful for pipeline parallelism.
Args:
tensor (Tensor): Tensor to backward.
grad (Tensor): Gradient of the tensor.
Returns:
Tensor: Gradient of the tensor (possibly scaled).
"""
pass
@abstractmethod
def should_skip_step(self) -> bool:
"""Called before step.
Returns:
bool: Whether to skip the step.
"""
pass
@abstractmethod
def pre_zero_grad(self) -> None:
"""Called before zero_grad.
"""
pass
@abstractmethod
def get_grad_div_scale(self) -> float:
"""Called before step or clip_grad. To keep computation efficiency, this method does not (maybe) unscale grads.
Returns:
float: A divisor for gradient clipping or step.
"""
pass
import torch
from torch import Tensor
from .base import MixedPrecisionMixin
class BF16MixedPrecisionMixin(MixedPrecisionMixin):
dtype = torch.bfloat16
def pre_backward(self, loss: Tensor) -> Tensor:
return loss
def pre_backward_by_grad(self, tensor: Tensor, grad: Tensor) -> Tensor:
return grad
def should_skip_step(self) -> bool:
return False
def pre_zero_grad(self) -> None:
pass
def get_grad_div_scale(self) -> float:
return 1.0
from abc import abstractmethod
from enum import Enum
import torch
import torch.distributed as dist
from torch import Tensor
from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler
from colossalai.utils import get_current_device
from .base import MixedPrecisionMixin
class OptimState(Enum):
SCALED = 0
UNSCALED = 1
class FP16MixedPrecisionMixin(MixedPrecisionMixin):
dtype = torch.float16
def __init__(self,
initial_scale: float = 2**16,
min_scale: float = 1,
growth_factor: float = 2,
backoff_factor: float = 0.5,
growth_interval: int = 1000,
hysteresis: int = 2,
max_scale: float = 2**32) -> None:
super().__init__()
self.grad_scaler = DynamicGradScaler(initial_scale=initial_scale,
min_scale=min_scale,
growth_factor=growth_factor,
backoff_factor=backoff_factor,
growth_interval=growth_interval,
hysteresis=hysteresis,
max_scale=max_scale)
self.optim_state = OptimState.UNSCALED
self.found_overflow = torch.zeros(1, dtype=torch.float, device=get_current_device())
@property
def loss_scale(self) -> float:
return self.grad_scaler.scale.item()
@abstractmethod
def check_local_overflow(self) -> bool:
"""Check whether there is overflow in the local process. This method should be implemented by subclasses.
Returns:
bool: Whether there is overflow in the local process.
"""
pass
def check_overflow(self) -> bool:
# clear previous overflow record
self.found_overflow.fill_(0.0)
if self.check_local_overflow():
self.found_overflow.fill_(1.0)
dist.all_reduce(self.found_overflow, op=dist.ReduceOp.MAX)
return self.found_overflow.item() > 0
def pre_backward(self, loss: Tensor) -> Tensor:
loss = self.loss_scale * loss
self.optim_state = OptimState.SCALED
return loss
def pre_backward_by_grad(self, tensor: Tensor, grad: Tensor) -> Tensor:
self.optim_state = OptimState.SCALED
return grad
def should_skip_step(self) -> bool:
found_inf = self.check_overflow()
self.grad_scaler.update(found_inf)
if found_inf:
self.optim_state = OptimState.UNSCALED
return found_inf
def pre_zero_grad(self) -> None:
pass
def get_grad_div_scale(self) -> float:
assert self.optim_state == OptimState.SCALED, 'grads should be scaled before clipping'
self.optim_state = OptimState.UNSCALED
return self.loss_scale
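`check_local_overflow` is deliberately left abstract above, since where gradients live depends on the optimizer (flat buffers, ZeRO shards, etc.). Below is a minimal sketch of a concrete subclass, assuming gradients sit directly on a list of working parameters; the class name and the `working_params` argument are illustrative and not part of this PR.

```python
import torch


class NaiveFP16MixedPrecisionMixin(FP16MixedPrecisionMixin):
    """Hypothetical concrete mixin: detects overflow by scanning parameter grads."""

    def __init__(self, working_params, **scaler_kwargs) -> None:
        super().__init__(**scaler_kwargs)
        self.working_params = list(working_params)

    def check_local_overflow(self) -> bool:
        # any inf/nan in a gradient means the scaled fp16 loss overflowed
        for p in self.working_params:
            if p.grad is not None and not torch.isfinite(p.grad).all():
                return True
        return False
```

An optimizer wrapper would then drive it exactly as the `MixedPrecisionMixin` docstring sketches: scale the loss in `backward`, check `should_skip_step()` before `optim.step()`, and divide gradients by `get_grad_div_scale()` when clipping.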
......@@ -206,7 +206,7 @@ class Broadcaster(BmmTransform):
# e.g. [1, 2, 4] x [4, 4, 8] -> [4, 2, 8]
# the dim 0 of [1, 2, 4] is broadcast from 1 to 4
tensor_shape[dim_idx] = 1
elif broadcast_type == BroadcastType.PADDDING:
elif broadcast_type == BroadcastType.PADDING:
# if the dim is padded
# we remove its sharding
tensor_shape[dim_idx] = None
......
......@@ -21,7 +21,7 @@ __all__ = [
class BroadcastType(Enum):
EQUAL = auto()
PADDDING = auto()
PADDING = auto()
MULTIPLE = auto()
......@@ -69,18 +69,18 @@ def get_broadcast_dim_info(logical_shape, physical_shape):
for i in range(logical_num_dims):
# get the trailing dim size
logical_dim_idx = logical_num_dims - i - 1
phyiscal_dim_idx = physical_num_dims - i - 1
physical_dim_idx = physical_num_dims - i - 1
logical_dim_size = logical_shape[logical_dim_idx]
if phyiscal_dim_idx >= 0:
physical_dim_size = physical_shape[phyiscal_dim_idx]
if physical_dim_idx >= 0:
physical_dim_size = physical_shape[physical_dim_idx]
if physical_dim_size == logical_dim_size:
logical_dim_broadcast_info[logical_dim_idx] = BroadcastType.EQUAL
elif physical_dim_size == 1 and physical_dim_size != logical_dim_size:
logical_dim_broadcast_info[logical_dim_idx] = BroadcastType.MULTIPLE
else:
logical_dim_broadcast_info[logical_dim_idx] = BroadcastType.PADDDING
logical_dim_broadcast_info[logical_dim_idx] = BroadcastType.PADDING
return logical_dim_broadcast_info
......@@ -117,7 +117,7 @@ def recover_sharding_spec_for_broadcast_shape(logical_sharding_spec: ShardingSpe
for shape_dim, mesh_dim in logical_dim_partition.items():
logical_broadcast_type = logical_dim_broadcast_info[shape_dim]
if logical_broadcast_type == BroadcastType.PADDDING or logical_broadcast_type == BroadcastType.MULTIPLE:
if logical_broadcast_type == BroadcastType.PADDING or logical_broadcast_type == BroadcastType.MULTIPLE:
removed_dims.extend(mesh_dim)
else:
# get the corresponding physical dim
......
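For orientation, the trailing-dimension comparison in `get_broadcast_dim_info` above can be reproduced standalone. The sketch below mirrors the hunk's logic with the corrected `PADDING` name; the standalone enum and function are illustrative copies, not the library's public API.

```python
from enum import Enum, auto


class BroadcastType(Enum):
    EQUAL = auto()     # physical dim matches the logical dim
    MULTIPLE = auto()  # physical dim is 1 and gets broadcast up
    PADDING = auto()   # logical dim has no physical counterpart


def classify_broadcast_dims(logical_shape, physical_shape):
    info = {}
    logical_ndim, physical_ndim = len(logical_shape), len(physical_shape)
    for i in range(logical_ndim):
        # walk dimensions from the trailing end, as broadcasting does
        logical_idx = logical_ndim - i - 1
        physical_idx = physical_ndim - i - 1
        if physical_idx >= 0:
            if physical_shape[physical_idx] == logical_shape[logical_idx]:
                info[logical_idx] = BroadcastType.EQUAL
            elif physical_shape[physical_idx] == 1:
                info[logical_idx] = BroadcastType.MULTIPLE
        else:
            info[logical_idx] = BroadcastType.PADDING
    return info


# physical [1, 2, 8] against logical [4, 2, 8]: dim 0 is MULTIPLE, dims 1-2 EQUAL
print(classify_broadcast_dims([4, 2, 8], [1, 2, 8]))
# physical [2, 8] against logical [4, 2, 8]: dim 0 has no counterpart -> PADDING
print(classify_broadcast_dims([4, 2, 8], [2, 8]))
```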
......@@ -25,11 +25,11 @@ class Booster:
Examples:
```python
colossalai.launch(...)
plugin = GeminiPlugin(stage=3, ...)
plugin = GeminiPlugin(...)
booster = Booster(precision='fp16', plugin=plugin)
model = GPT2()
optimizer = Adam(model.parameters())
optimizer = HybridAdam(model.parameters())
dataloader = Dataloader(Dataset)
lr_scheduler = LinearWarmupScheduler()
criterion = GPTLMLoss()
......
......@@ -23,6 +23,9 @@ from .dp_plugin_base import DPPluginBase
__all__ = ['GeminiPlugin']
SUPPORTED_PRECISION = ['fp16', 'bf16']
PRECISION_STR_TO_DTYPE = {'fp16': torch.half, 'bf16': torch.bfloat16}
class GeminiCheckpointIO(GeneralCheckpointIO):
......@@ -171,6 +174,7 @@ class GeminiPlugin(DPPluginBase):
Args:
device (torch.device): device to place the model.
placement_policy (str, optional): "cpu", "cuda", "auto". Defaults to "cpu".
precision (str, optional): Training precision. Supports 'fp16' and 'bf16'. Defaults to 'fp16'.
pin_memory (bool, optional): use pin memory on CPU. Defaults to False.
force_outputs_fp32 (bool, optional): force outputs are fp32. Defaults to False.
strict_ddp_mode (bool, optional): use strict ddp mode (only use dp without other parallelism). Defaults to False.
......@@ -203,6 +207,7 @@ class GeminiPlugin(DPPluginBase):
self,
device: Optional[torch.device] = None,
placement_policy: str = "cpu",
precision: str = "fp16",
pin_memory: bool = False,
force_outputs_fp32: bool = False,
strict_ddp_mode: bool = False,
......@@ -223,6 +228,7 @@ class GeminiPlugin(DPPluginBase):
verbose: bool = False,
) -> None:
super().__init__()
assert precision in SUPPORTED_PRECISION, f'precision {precision} is not supported'
self.gemini_config = dict(
device=(device or get_current_device()),
placement_policy=placement_policy,
......@@ -233,6 +239,7 @@ class GeminiPlugin(DPPluginBase):
hidden_dim=hidden_dim,
min_chunk_size_mb=min_chunk_size_mb,
memstats=memstats,
mixed_precision=PRECISION_STR_TO_DTYPE[precision],
)
self.zero_optim_config = dict(gpu_margin_mem_ratio=gpu_margin_mem_ratio,)
self.optim_kwargs = dict(initial_scale=initial_scale,
......@@ -253,7 +260,7 @@ class GeminiPlugin(DPPluginBase):
return True
def supported_precisions(self) -> List[str]:
return ['fp16']
return SUPPORTED_PRECISION
def control_device(self) -> bool:
return True
......
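With the new `precision` argument, selecting bf16 through the plugin might look like the sketch below. It assumes a standard Booster setup launched via torchrun; the toy model, optimizer settings, and exact import paths are assumptions based on the ColossalAI layout at this point, not taken from the diff.

```python
import torch
import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin
from colossalai.nn.optimizer import HybridAdam

colossalai.launch_from_torch(config={})

model = torch.nn.Linear(1024, 1024)                  # placeholder model
optimizer = HybridAdam(model.parameters(), lr=1e-3)

# 'bf16' is now accepted alongside 'fp16'; anything else trips the new assert
plugin = GeminiPlugin(placement_policy='cpu', precision='bf16')
booster = Booster(plugin=plugin)
model, optimizer, *_ = booster.boost(model, optimizer)
```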
import warnings
from functools import partial
from typing import Callable, Iterator, List, Optional, Tuple, Union
import torch
......@@ -20,12 +21,15 @@ from .torch_ddp_plugin import TorchDDPCheckpointIO
__all__ = ['LowLevelZeroPlugin']
def _convert_to_fp16(x):
def _convert_floating_point(x, dtype: torch.dtype = torch.float16):
if isinstance(x, torch.Tensor) and torch.is_floating_point(x):
return x.half()
return x.to(dtype)
return x
SUPPORTED_PRECISION = ['fp16', 'bf16', 'fp32']
class LowLevelZeroCheckpointIO(TorchDDPCheckpointIO):
def save_unsharded_optimizer(self, optimizer: Optimizer, checkpoint: str, gather_dtensor: bool):
......@@ -49,17 +53,24 @@ class LowLevelZeroModel(ModelWrapper):
def __init__(self, module: nn.Module, stage: int, precision: str) -> None:
super().__init__(module)
self.convert_inputs = (precision == 'fp16')
module = zero_model_wrapper(module, zero_stage=stage)
self.dtype = None
if precision == 'fp16':
module = module.half()
self.dtype = torch.float16
elif precision == 'bf16':
self.dtype = torch.bfloat16
module = zero_model_wrapper(module, zero_stage=stage)
if self.dtype is not None:
module = module.to(self.dtype)
module = module.to(get_current_device())
self.module = module
self.convert_fn = None
if self.dtype is not None:
self.convert_fn = partial(_convert_floating_point, dtype=self.dtype)
def forward(self, *args, **kwargs):
if self.convert_inputs:
args = tree_map(_convert_to_fp16, args)
kwargs = tree_map(_convert_to_fp16, kwargs)
if self.convert_fn is not None:
args = tree_map(self.convert_fn, args)
kwargs = tree_map(self.convert_fn, kwargs)
return super().forward(*args, **kwargs)
......@@ -110,7 +121,7 @@ class LowLevelZeroPlugin(DPPluginBase):
Args:
stage (int, optional): ZeRO stage. Defaults to 1.
precision (str, optional): precision. Support 'fp16' and 'fp32'. Defaults to 'fp16'.
precision (str, optional): precision. Support 'fp16', 'bf16' and 'fp32'. Defaults to 'fp16'.
initial_scale (float, optional): Initial scale used by DynamicGradScaler. Defaults to 2**32.
min_scale (float, optional): Min scale used by DynamicGradScaler. Defaults to 1.
growth_factor (float, optional): growth_factor used by DynamicGradScaler. Defaults to 2.
......@@ -149,7 +160,7 @@ class LowLevelZeroPlugin(DPPluginBase):
) -> None:
super().__init__()
assert stage in (1, 2), f'LowLevelZeroPlugin only supports stage 1/2 training'
assert precision in ('fp16', 'fp32'), f'LowLevelZeroPlugin only supports fp16/fp32 training'
assert precision in SUPPORTED_PRECISION, f'LowLevelZeroPlugin only supports {SUPPORTED_PRECISION} training'
self.stage = stage
self.precision = precision
......@@ -175,7 +186,7 @@ class LowLevelZeroPlugin(DPPluginBase):
return True
def supported_precisions(self) -> List[str]:
return ['fp16', 'fp32']
return SUPPORTED_PRECISION
def control_device(self) -> bool:
return True
......
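The input-casting change above swaps the fp16-only `_convert_to_fp16` for a dtype-parameterised helper applied via `tree_map` over arbitrarily nested args/kwargs. Here is a self-contained sketch of the same mechanism; the helper is re-declared locally for illustration, and `tree_map` is PyTorch's pytree utility.

```python
from functools import partial

import torch
from torch.utils._pytree import tree_map


def _convert_floating_point(x, dtype: torch.dtype = torch.float16):
    # only floating-point tensors are cast; ints, bools and non-tensors pass through
    if isinstance(x, torch.Tensor) and torch.is_floating_point(x):
        return x.to(dtype)
    return x


convert_fn = partial(_convert_floating_point, dtype=torch.bfloat16)

batch = {'input': torch.randn(2, 4), 'mask': torch.ones(2, 4, dtype=torch.bool)}
batch = tree_map(convert_fn, batch)
print(batch['input'].dtype, batch['mask'].dtype)  # torch.bfloat16 torch.bool
```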
......@@ -3,10 +3,10 @@ from typing import Callable, Iterable, Iterator, List, Optional, Tuple, Union
import torch
import torch.nn as nn
import warnings
from packaging import version
from torch.distributed import ProcessGroup
if version.parse(torch.__version__) >= version.parse('1.12.0'):
from torch.distributed.fsdp import FullStateDictConfig
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
......@@ -202,6 +202,11 @@ class TorchFSDPPlugin(DPPluginBase):
# wrap the model with PyTorch FSDP
fsdp_model = TorchFSDPModel(model, device_id=torch.cuda.current_device(), **self.fsdp_kwargs)
if len(optimizer.param_groups) > 1:
warnings.warn(
'TorchFSDPPlugin does not support optimizers that use multiple param groups. The results may not be as expected if used.'
)
optimizer.__init__(fsdp_model.parameters(), **optimizer.defaults)
if not isinstance(optimizer, FSDPOptimizerWrapper):
......
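The warning and re-initialisation above collapse the optimizer into a single param group built from the FSDP-wrapped parameters, using only `optimizer.defaults`. A toy illustration of what that call does to per-group settings, with a plain torch optimizer (no FSDP needed to see the effect):

```python
import torch

model = torch.nn.Linear(8, 8)
# two param groups with different learning rates
optimizer = torch.optim.Adam([
    {'params': [model.weight], 'lr': 3e-4},
    {'params': [model.bias], 'lr': 1e-4},
])

# re-running __init__ with optimizer.defaults merges everything into one group
# and resets hyper-parameters to the constructor defaults, hence the warning
optimizer.__init__(model.parameters(), **optimizer.defaults)
print(len(optimizer.param_groups))      # 1
print(optimizer.param_groups[0]['lr'])  # 0.001 (Adam's default lr)
```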
......@@ -28,7 +28,7 @@ from .run import launch_multi_processes
type=str,
default=None,
help=
"Specify computing devices to NOT use during execution. Mutually exclusive with --include. Formatting is the same as --includ,"
"Specify computing devices to NOT use during execution. Mutually exclusive with --include. Formatting is the same as --include,"
" only effective when used with --hostfile.")
@click.option("--num_nodes",
type=int,
......
......@@ -38,7 +38,7 @@ class HostInfo:
# socket.getfqdn("127.0.0.1") does not return localhost
# on some users' machines
# thus, we directly return True if hostname is locahost, 127.0.0.1 or 0.0.0.0
# thus, we directly return True if hostname is localhost, 127.0.0.1 or 0.0.0.0
if hostname in ("localhost", "127.0.0.1", "0.0.0.0"):
return True
......
......@@ -114,7 +114,7 @@ class MultiNodeRunner:
Receive messages from all hosts
Returns:
msg_from_node (dict): a dictionry which contains messages from each node
msg_from_node (dict): a dictionary which contains messages from each node
"""
msg_from_node = dict()
......
......@@ -154,7 +154,7 @@ def get_launch_command(
extra_launch_args = dict()
torch_version = version.parse(torch.__version__)
assert torch_version.major == 1
assert torch_version.major >= 1
if torch_version.minor < 9:
cmd = [
......@@ -298,7 +298,7 @@ def launch_multi_processes(args: Config) -> None:
# receive the stop status
msg_from_node = runner.recv_from_all()
# printe node status
# print node status
click.echo("\n====== Stopping All Nodes =====")
for hostname, msg in msg_from_node.items():
click.echo(f"{hostname}: {msg}")
......
......@@ -197,7 +197,7 @@ class AlphaBetaProfiler:
dist.broadcast_object_list(broadcast_list, src=process_group[0])
alpha_beta_dict[process_group] = tuple(broadcast_list)
# add symmetry pair to the apha_beta_dict
# add symmetry pair to the alpha_beta_dict
symmetry_ab_dict = {}
for process_group, alpha_beta_pair in alpha_beta_dict.items():
symmetry_process_group = (process_group[1], process_group[0])
......
......@@ -51,7 +51,7 @@ class BiasAdditionModule(ABC):
For example:
The kwargs for conv2d module is {} because the attributes like 'padding' or 'groups' are
considered during module initilizing. However, we need to consider those attributes as kwargs
considered during module initializing. However, we need to consider those attributes as kwargs
in F.conv2d.
"""
pass
......
......@@ -295,7 +295,7 @@ class ColoTracer(Tracer):
@staticmethod
def forward(ctx, run_function, preserve_rng_state, *args):
# signal that the current tracing occurs within activaton checkpoint part
# signal that the current tracing occurs within activation checkpoint part
self.inside_torch_checkpoint_func = True
out = run_function(*args)
self.inside_torch_checkpoint_func = False
......
......@@ -92,7 +92,7 @@ class ColoTracer(Tracer):
return proxy
# if graph is traced for auto parallelism module, some extra node will be added during
# graph construction to deal with the compatability between bias addition and all reduce.
# graph construction to deal with the compatibility between bias addition and all reduce.
# if no extra manipulation is applied, we just pass the origin arguments to create_proxy function
# to create node on computation graph
......@@ -208,7 +208,7 @@ class ColoTracer(Tracer):
self.proxy_cls = ColoProxy
self.tracer_type = TracerType.META
else:
raise ValueError(f"Unrecognised tracer type {tracer_type}")
raise ValueError(f"Unrecognized tracer type {tracer_type}")
def _meta_data_computing(self, kind, target, args, kwargs):
......@@ -445,7 +445,7 @@ class ColoTracer(Tracer):
@staticmethod
def forward(ctx, run_function, preserve_rng_state, *args):
# signal that the current tracing occurs within activaton checkpoint part
# signal that the current tracing occurs within activation checkpoint part
self.inside_torch_checkpoint_func = True
out = run_function(*args)
self.inside_torch_checkpoint_func = False
......