Unverified Commit b5f9e37c authored by Hongxin Liu's avatar Hongxin Liu Committed by GitHub
Browse files

[legacy] clean up legacy code (#4743)

* [legacy] remove outdated codes of pipeline (#4692)

* [legacy] remove cli of benchmark and update optim (#4690)

* [legacy] remove cli of benchmark and update optim

* [doc] fix cli doc test

* [legacy] fix engine clip grad norm

* [legacy] remove outdated colo tensor (#4694)

* [legacy] remove outdated colo tensor

* [test] fix test import

* [legacy] move outdated zero to legacy (#4696)

* [legacy] clean up utils (#4700)

* [legacy] clean up utils

* [example] update examples

* [legacy] clean up amp

* [legacy] fix amp module

* [legacy] clean up gpc (#4742)

* [legacy] clean up context

* [legacy] clean core, constants and global vars

* [legacy] refactor initialize

* [example] fix examples ci

* [example] fix examples ci

* [legacy] fix tests

* [example] fix gpt example

* [example] fix examples ci

* [devops] fix ci installation

* [example] fix examples ci
parent 32e7f994
......@@ -4,13 +4,13 @@ import torch
import torch.distributed as dist
from colossalai.legacy.registry import OPHOOKS
from colossalai.legacy.zero.gemini.ophooks import BaseOpHook
from colossalai.legacy.zero.gemini.stateful_tensor import TensorState
from colossalai.legacy.zero.gemini.stateful_tensor_mgr import StatefulTensorMgr
from colossalai.legacy.zero.shard_utils import BaseShardStrategy
from colossalai.logging import get_dist_logger
from colossalai.utils import get_current_device
from colossalai.zero.gemini.memory_tracer import MemStatsCollector
from colossalai.zero.legacy.gemini.ophooks import BaseOpHook
from colossalai.zero.legacy.gemini.stateful_tensor import TensorState
from colossalai.zero.legacy.gemini.stateful_tensor_mgr import StatefulTensorMgr
from colossalai.zero.legacy.shard_utils import BaseShardStrategy
@OPHOOKS.register_module
......
......@@ -12,15 +12,15 @@ from torch.nn.parameter import Parameter
from torch.optim import Optimizer
from colossalai.amp.naive_amp.grad_scaler import DynamicGradScaler
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.interface import OptimizerWrapper
from colossalai.legacy.context.parallel_mode import ParallelMode
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.zero.gemini.stateful_tensor import StatefulTensor, TensorState
from colossalai.legacy.zero.gemini.tensor_placement_policy import AutoTensorPlacementPolicy
from colossalai.legacy.zero.gemini.tensor_utils import colo_model_data_tensor_move_inline, colo_tensor_mem_usage
from colossalai.legacy.zero.sharded_model import ShardedModelV2
from colossalai.legacy.zero.sharded_model._utils import cast_tensor_to_fp32
from colossalai.logging import get_dist_logger
from colossalai.nn.optimizer import ColossalaiOptimizer
from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor, TensorState
from colossalai.zero.legacy.gemini.tensor_placement_policy import AutoTensorPlacementPolicy
from colossalai.zero.legacy.gemini.tensor_utils import colo_model_data_tensor_move_inline, colo_tensor_mem_usage
from colossalai.zero.legacy.sharded_model import ShardedModelV2
from colossalai.zero.legacy.sharded_model._utils import cast_tensor_to_fp32
class OptimState(Enum):
......@@ -28,7 +28,7 @@ class OptimState(Enum):
UNSCALED = 2
class ShardedOptimizerV2(ColossalaiOptimizer):
class ShardedOptimizerV2(OptimizerWrapper):
"""A wrapper for optimizer. ``ShardedOptimizerV2`` and ``ShardedModelV2`` implement Zero Redundancy Optimizer (ZeRO).
By default the ZeRO optimizer stage 3 offload Optimizer States on CPU.
......
......@@ -2,8 +2,8 @@ from typing import List, Optional, Tuple
import torch
from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor, TensorState
from colossalai.zero.legacy.gemini.tensor_utils import colo_tensor_mem_usage
from colossalai.legacy.zero.gemini.stateful_tensor import StatefulTensor, TensorState
from colossalai.legacy.zero.gemini.tensor_utils import colo_tensor_mem_usage
from .sharded_tensor import ShardedTensor
......
import torch
from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor, TensorState
from colossalai.legacy.zero.gemini.stateful_tensor import StatefulTensor, TensorState
class ShardedTensor(StatefulTensor):
......
......@@ -134,8 +134,6 @@ class DistributedLogger:
Args:
message (str): The message to be logged.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`):
The parallel mode used for logging. Defaults to ParallelMode.GLOBAL.
ranks (List[int]): List of parallel ranks.
"""
message_prefix = "{}:{} {}".format(*self.__get_call_info())
......@@ -147,8 +145,6 @@ class DistributedLogger:
Args:
message (str): The message to be logged.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`):
The parallel mode used for logging. Defaults to ParallelMode.GLOBAL.
ranks (List[int]): List of parallel ranks.
"""
message_prefix = "{}:{} {}".format(*self.__get_call_info())
......@@ -160,8 +156,6 @@ class DistributedLogger:
Args:
message (str): The message to be logged.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`):
The parallel mode used for logging. Defaults to ParallelMode.GLOBAL.
ranks (List[int]): List of parallel ranks.
"""
message_prefix = "{}:{} {}".format(*self.__get_call_info())
......@@ -173,8 +167,6 @@ class DistributedLogger:
Args:
message (str): The message to be logged.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`):
The parallel mode used for logging. Defaults to ParallelMode.GLOBAL.
ranks (List[int]): List of parallel ranks.
"""
message_prefix = "{}:{} {}".format(*self.__get_call_info())
......
from .moe import *
# from .moe import *
from .utils import *
......@@ -6,10 +6,10 @@ import torch
import torch.distributed as dist
import torch.nn as nn
from colossalai.context import ParallelMode, seed
from colossalai.context.moe_context import MOE_CONTEXT
from colossalai.legacy.context import ParallelMode, seed
from colossalai.legacy.zero.init_ctx import no_shard_zero_decrator
from colossalai.utils import get_current_device
from colossalai.zero.legacy.init_ctx import no_shard_zero_decrator
class MoeExperts(nn.Module):
......
......@@ -6,6 +6,7 @@ import torch.nn as nn
import torch.nn.functional as F
from colossalai.context.moe_context import MOE_CONTEXT
from colossalai.legacy.zero.init_ctx import no_shard_zero_context, no_shard_zero_decrator
from colossalai.nn.layer.moe._operation import (
COL_MOE_KERNEL_FLAG,
AllGather,
......@@ -18,7 +19,6 @@ from colossalai.nn.layer.moe.experts import Experts, MoeExperts
from colossalai.nn.layer.moe.routers import MoeRouter, Top1Router, Top2Router
from colossalai.nn.layer.moe.utils import NormalNoiseGenerator, UniformNoiseGenerator
from colossalai.utils import get_current_device
from colossalai.zero.legacy.init_ctx import no_shard_zero_context, no_shard_zero_decrator
@no_shard_zero_decrator(is_replicated=True)
......
from .loss_moe import MoeCrossEntropyLoss, MoeLoss
# from .loss_moe import MoeCrossEntropyLoss, MoeLoss
from .colossalai_optimizer import ColossalaiOptimizer
from .cpu_adam import CPUAdam
from .fused_adam import FusedAdam
from .fused_lamb import FusedLAMB
from .fused_sgd import FusedSGD
from .hybrid_adam import HybridAdam
from .lamb import Lamb
from .lars import Lars
from .cpu_adam import CPUAdam
from .hybrid_adam import HybridAdam
__all__ = ['ColossalaiOptimizer', 'FusedLAMB', 'FusedAdam', 'FusedSGD', 'Lamb', 'Lars', 'CPUAdam', 'HybridAdam']
__all__ = ['FusedLAMB', 'FusedAdam', 'FusedSGD', 'Lamb', 'Lars', 'CPUAdam', 'HybridAdam']
import torch
import torch.nn as nn
from torch import Tensor
from torch.optim import Optimizer
from colossalai.utils import clip_grad_norm_fp32
class ColossalaiOptimizer(Optimizer):
def __init__(self, optim: Optimizer):
self.optim = optim
@property
def param_groups(self):
return self.optim.param_groups
@property
def defaults(self):
return self.optim.defaults
def add_param_group(self, *args, **kwargs):
return self.optim.add_param_group(*args, **kwargs)
def step(self, *args, **kwargs):
return self.optim.step(*args, **kwargs)
def zero_grad(self, *args, **kwargs):
self.optim.zero_grad(*args, **kwargs)
def load_state_dict(self, *args, **kwargs):
self.optim.load_state_dict(*args, **kwargs)
def state_dict(self):
return self.optim.state_dict()
def backward(self, loss: Tensor):
loss.backward()
def backward_by_grad(self, tensor: Tensor, grad: Tensor):
torch.autograd.backward(tensors=tensor, grad_tensors=grad)
def clip_grad_norm(self, model: nn.Module, max_norm: float):
if max_norm > 0.0:
clip_grad_norm_fp32(model.parameters(), max_norm)
from .pipelinable import PipelinableContext, PipelinableModel
from .layer_spec import LayerSpec
from .p2p import PipelineP2PCommunication
from .schedule import InterleavedSchedule, OneForwardOneBackwardSchedule, PipelineSchedule
from .stage_manager import PipelineStageManager
__all__ = ['PipelinableModel', 'PipelinableContext', 'LayerSpec']
\ No newline at end of file
__all__ = [
'PipelineSchedule',
'OneForwardOneBackwardSchedule',
'InterleavedSchedule',
'PipelineP2PCommunication',
'PipelineStageManager',
]
from .base import PipelineSchedule
from .interleaved_pp import InterleavedSchedule
from .one_f_one_b import OneForwardOneBackwardSchedule
__all__ = [
'PipelineSchedule',
'OneForwardOneBackwardSchedule',
'InterleavedSchedule',
]
from . import distspec
from .colo_parameter import ColoParameter
from .colo_tensor import ColoTensor
from .comm_spec import CollectiveCommPattern, CommSpec
from .compute_spec import ComputePattern, ComputeSpec
from .dist_spec_mgr import DistSpecManager
from .distspec import ReplicaSpec, ShardSpec
from .param_op_hook import ColoParamOpHook, ColoParamOpHookManager
from .process_group import ProcessGroup
from .tensor_spec import ColoTensorSpec
from .utils import convert_dim_partition_dict, convert_parameter, merge_same_dim_mesh_list, named_params_with_colotensor
__all__ = [
'ColoTensor', 'convert_parameter', 'ComputePattern', 'ComputeSpec', 'named_params_with_colotensor', 'ColoParameter',
'distspec', 'DistSpecManager', 'ColoParamOpHook', 'ColoParamOpHookManager', 'ProcessGroup', 'ColoTensorSpec',
'ShardSpec', 'ReplicaSpec', 'CommSpec', 'CollectiveCommPattern', 'convert_dim_partition_dict',
'ColoTensor', 'convert_parameter', 'named_params_with_colotensor', 'ColoParameter', 'ColoParamOpHook',
'ColoParamOpHookManager', 'CommSpec', 'CollectiveCommPattern', 'convert_dim_partition_dict',
'merge_same_dim_mesh_list'
]
from .activation_checkpoint import checkpoint
from .checkpointing import load_checkpoint, save_checkpoint
from .common import (
_cast_float,
clip_grad_norm_fp32,
conditional_context,
copy_tensor_parallel_attributes,
count_zeros_fp32,
disposable,
ensure_path_exists,
free_storage,
is_ddp_ignored,
is_dp_rank_0,
is_model_parallel_parameter,
is_no_pp_or_last_stage,
is_tp_rank_0,
is_using_ddp,
is_using_pp,
is_using_sequence,
multi_tensor_applier,
param_is_not_tensor_parallel_duplicate,
print_rank_0,
switch_virtual_pipeline_parallel_rank,
sync_model_param,
)
from .cuda import empty_cache, get_current_device, set_to_cuda, synchronize
from .data_sampler import DataParallelSampler, get_dataloader
from .memory import (
colo_device_memory_capacity,
colo_device_memory_used,
colo_get_cpu_memory_capacity,
colo_set_cpu_memory_capacity,
colo_set_process_memory_fraction,
report_memory_usage,
set_seed,
)
from .cuda import empty_cache, get_current_device, set_device, set_to_cuda, synchronize
from .multi_tensor_apply import multi_tensor_applier
from .tensor_detector import TensorDetector
from .timer import MultiTimer, Timer
__all__ = [
'checkpoint',
'print_rank_0',
'sync_model_param',
'is_ddp_ignored',
'is_dp_rank_0',
'is_tp_rank_0',
'is_no_pp_or_last_stage',
'is_using_ddp',
'is_using_pp',
'is_using_sequence',
'conditional_context',
'is_model_parallel_parameter',
'clip_grad_norm_fp32',
'count_zeros_fp32',
'copy_tensor_parallel_attributes',
'param_is_not_tensor_parallel_duplicate',
'get_current_device',
'synchronize',
'empty_cache',
'set_to_cuda',
'report_memory_usage',
'colo_device_memory_capacity',
'colo_device_memory_used',
'colo_set_process_memory_fraction',
'Timer',
'MultiTimer',
'multi_tensor_applier',
'DataParallelSampler',
'get_dataloader',
'switch_virtual_pipeline_parallel_rank',
'TensorDetector',
'load_checkpoint',
'save_checkpoint',
'ensure_path_exists',
'disposable',
'colo_set_cpu_memory_capacity',
'colo_get_cpu_memory_capacity',
'_cast_float',
'free_storage',
'set_seed',
'is_ddp_ignored',
'set_device',
]
......@@ -3,44 +3,12 @@
import functools
import os
import random
import socket
from collections import defaultdict
from contextlib import contextmanager
from pathlib import Path
from typing import Callable, Dict, List, Optional, Union
from typing import Callable
import numpy as np
import torch
import torch.distributed as dist
from torch import inf
from torch.nn.parameter import Parameter
from colossalai.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS, TENSOR_PARALLEL_ATTRIBUTES
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.tensor import ColoParameter, ProcessGroup
from .multi_tensor_apply import multi_tensor_applier
try:
from colossalai._C import fused_optim
except:
fused_optim = None
def print_rank_0(msg: str, logger=None):
"""Print messages and save logs(optional). This is executed only if you are the rank-0 gpu.
Args:
msg (str): A string message to output.
logger (:class:`colossalai.logging.DistributedLogger`, optional):
The logger to record the message, defaults to None.
"""
if gpc.get_global_rank() == 0:
if logger is None:
print(msg, flush=True)
else:
logger.info(msg)
def ensure_path_exists(filename: str):
......@@ -50,47 +18,6 @@ def ensure_path_exists(filename: str):
Path(dirpath).mkdir(parents=True, exist_ok=True)
def sync_model_param(model, parallel_mode):
r"""Make sure data parameters are consistent during Data Parallel Mode.
Args:
model (:class:`torch.nn.Module`): A pyTorch model on whose parameters you check the consistency.
parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel mode to be checked.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
"""
if gpc.is_initialized(parallel_mode) and gpc.get_world_size(parallel_mode) > 1:
for param in model.parameters():
ranks = gpc.get_ranks_in_group(parallel_mode)
dist.broadcast(param, src=ranks[0], group=gpc.get_group(parallel_mode))
def is_dp_rank_0():
return not gpc.is_initialized(ParallelMode.DATA) or gpc.is_first_rank(ParallelMode.DATA)
def is_tp_rank_0():
return not gpc.is_initialized(ParallelMode.TENSOR) or gpc.is_first_rank(ParallelMode.TENSOR)
def is_no_pp_or_last_stage():
return not gpc.is_initialized(ParallelMode.PIPELINE) or gpc.is_last_rank(ParallelMode.PIPELINE)
def is_using_ddp():
return gpc.is_initialized(ParallelMode.DATA) and gpc.get_world_size(ParallelMode.DATA) > 1
def is_using_pp():
return gpc.is_initialized(ParallelMode.PIPELINE) and gpc.get_world_size(ParallelMode.PIPELINE) > 1
def is_using_sequence():
return gpc.is_initialized(ParallelMode.SEQUENCE) and gpc.get_world_size(ParallelMode.SEQUENCE) > 1
@contextmanager
def conditional_context(context_manager, enable=True):
if enable:
......@@ -100,365 +27,10 @@ def conditional_context(context_manager, enable=True):
yield
class model_branch_context(object):
def __enter__(self):
self.env_status = env.save()
def __exit__(self, *exc_info):
env.load(**self.env_status)
def is_model_parallel_parameter(p):
return hasattr(p, IS_TENSOR_PARALLEL) and getattr(p, IS_TENSOR_PARALLEL)
def is_ddp_ignored(p):
return getattr(p, '_ddp_to_ignore', False)
def _calc_l2_norm(grads):
# we should not
global fused_optim
if fused_optim is None:
from colossalai.kernel.op_builder import FusedOptimBuilder
fused_optim = FusedOptimBuilder().load()
norm = 0.0
if len(grads) > 0:
dummy_overflow_buf = torch.cuda.IntTensor([0])
norm, _ = multi_tensor_applier(
fused_optim.multi_tensor_l2norm,
dummy_overflow_buf,
[grads],
False # no per-parameter norm
)
return norm
def _calc_lp(grads, norm_type):
norm = 0.0
for grad in grads:
grad_norm = torch.norm(grad, norm_type)
norm += grad_norm**norm_type
return norm
def _move_norm_to_cuda(norm: Union[float, torch.Tensor]) -> Union[float, torch.Tensor]:
if torch.is_tensor(norm) and norm.device.type != 'cuda':
norm = norm.to(torch.cuda.current_device())
return norm
def _get_tensor_norm(norm: Union[float, torch.Tensor], move_to_cuda) -> torch.Tensor:
if isinstance(norm, float):
norm = torch.Tensor([norm])
if move_to_cuda:
norm = norm.to(torch.cuda.current_device())
return norm
# ======== Gradient Clipping =========
def _compute_local_lp(params: List[ColoParameter], norm_type: float) -> float:
if len(params) == 0:
return 0.0
grads = [p.grad for p in params]
use_cuda_kernel = grads[0].device.type == 'cuda'
if norm_type == inf:
local_lp = max([g.abs().max() for g in grads])
elif norm_type == 2.0 and use_cuda_kernel:
local_lp = _calc_l2_norm(grads)**norm_type
else:
local_lp = _calc_lp(grads, norm_type)
if isinstance(local_lp, torch.Tensor):
return local_lp.item()
return local_lp
def _compute_buckets_lp(params: List[ColoParameter], norm_type: float) -> float:
if len(params) == 0:
return 0.0
buckets: Dict[Optional[ProcessGroup], List[ColoParameter]] = defaultdict(list)
for p in params:
if p.is_replicate():
buckets[None].append(p)
else:
buckets[p.get_process_group().tp_process_group()].append(p)
total_lp = 0.0
for group, bucket in buckets.items():
local_lp = _compute_local_lp(bucket, norm_type)
if group is not None:
local_lp_tensor = torch.tensor([local_lp], device=torch.cuda.current_device())
if norm_type == inf:
dist.all_reduce(local_lp_tensor, op=dist.ReduceOp.MAX, group=group)
else:
dist.all_reduce(local_lp_tensor, group=group)
local_lp = local_lp_tensor.item()
if norm_type == inf:
total_lp = max(total_lp, local_lp)
else:
total_lp += local_lp
return total_lp
def _compute_pp_grad_lp(total_lp: float, norm_type: float) -> float:
if gpc.is_initialized(ParallelMode.PIPELINE) and gpc.get_world_size(ParallelMode.PIPELINE) > 1:
total_lp_tensor = torch.tensor([total_lp], device=torch.cuda.current_device())
if norm_type == inf:
dist.all_reduce(total_lp_tensor, op=dist.ReduceOp.MAX, group=gpc.get_group(ParallelMode.PIPELINE))
else:
dist.all_reduce(total_lp_tensor, group=gpc.get_group(ParallelMode.PIPELINE))
total_lp = total_lp_tensor.item()
return total_lp
def _compute_grad_lp(parameters, norm_type: float = 2.0) -> float:
if isinstance(parameters, torch.Tensor):
parameters = [parameters]
grad_dtype = None
cpu_grad_params: List[ColoParameter] = []
cuda_grad_params: List[ColoParameter] = []
for p in parameters:
if p.grad is None:
continue
assert isinstance(p, ColoParameter)
if grad_dtype is None:
grad_dtype = p.grad.dtype
assert p.grad.dtype == grad_dtype, f'Expected all grads are {grad_dtype}, got {p.grad.dtype}'
if p.grad.device.type == 'cuda':
cuda_grad_params.append(p)
else:
cpu_grad_params.append(p)
norm_type = float(norm_type)
cpu_lp = _compute_buckets_lp(cpu_grad_params, norm_type)
cuda_lp = _compute_buckets_lp(cuda_grad_params, norm_type)
if norm_type == inf:
total_lp = max(cpu_lp, cuda_lp)
else:
total_lp = cpu_lp + cuda_lp
return _compute_pp_grad_lp(total_lp, norm_type)
def compute_grad_norm(parameters, norm_type: float = 2.0) -> float:
norm_type = float(norm_type)
total_norm = _compute_grad_lp(parameters, norm_type)
if norm_type != inf:
total_norm = total_norm**(1 / norm_type)
return total_norm
def _clip_grad_norm(parameters, max_norm: float, total_norm: float) -> None:
clip_coef = max_norm / (total_norm + 1e-6)
if clip_coef < 1.0:
cuda_grads: List[torch.Tensor] = []
cpu_grads: List[torch.Tensor] = []
if isinstance(parameters, torch.Tensor):
parameters = [parameters]
for p in parameters:
if p.grad is None:
continue
if p.grad.device.type == 'cuda':
cuda_grads.append(p.grad.detach())
else:
cpu_grads.append(p.grad.detach())
if len(cuda_grads) > 0:
dummy_overflow_buf = torch.cuda.IntTensor([0])
multi_tensor_applier(fused_optim.multi_tensor_scale, dummy_overflow_buf, [cuda_grads, cuda_grads],
clip_coef)
for g in cpu_grads:
g.mul_(clip_coef)
def clip_grad_norm(parameters, max_norm: float, norm_type: float = 2.0) -> float:
total_norm = compute_grad_norm(parameters, norm_type)
_clip_grad_norm(parameters, max_norm, total_norm)
return total_norm
def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
"""Clips gradient norm of an iterable of parameters whose gradients are in fp32.
This is adapted from :func:`torch.nn.utils.clip_grad.clip_grad_norm_` and
added functionality to handle model parallel parameters.
Note:
the gradients are modified in place.
Args:
parameters (Iterable[:class:`torch.tensor`] or :class:`torch.tensor`):
An iterable of Tensors or a single Tensor that will have gradients normalized.
max_norm (Union[float, int]): Max norm of the gradients.
norm_type (Union[float, int, 'inf']): Type of the used p-norm. Can be ``'inf'`` for infinity norm.
Returns:
float: Total norm of the parameters.
"""
if isinstance(parameters, torch.Tensor):
parameters = [parameters]
# Filter parameters based on:
# - grad should not be none
# - parameter should not be shared
# - should not be a replica due to tensor model parallelism
params: List[Parameter] = []
has_zero_shared_param: bool = False
for param in parameters:
if param.grad is not None:
# Make sure the grads are in fp32
assert param.grad.dtype == torch.float, \
f'expected gradient to be dtype torch.float, but got {param.grad.type()}'
if hasattr(param, 'colo_attr') and param.colo_attr.sharded_data_tensor.is_sharded:
has_zero_shared_param = True
params.append(param)
if len(params) == 0:
enable_cuda_kernels = False
else:
enable_cuda_kernels = params[0].grad.device.type == 'cuda'
# Norm parameters.
max_norm = float(max_norm)
norm_type = float(norm_type)
# Parameters can be on CPU or CUDA
# If parameters are on CPU, disable CUDA kernels
# Calculate norm.
if norm_type == inf:
total_norm = max(p.grad.data.abs().max() for p in params)
total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
# Take max across all model-parallel GPUs.
if gpc.is_initialized(ParallelMode.MODEL) and gpc.get_world_size(ParallelMode.MODEL) > 1:
dist.all_reduce(total_norm_cuda,
op=dist.ReduceOp.MAX,
group=gpc.get_group(ParallelMode.MODEL),
async_op=False)
if has_zero_shared_param:
dist.all_reduce(total_norm_cuda,
op=dist.ReduceOp.MAX,
group=gpc.get_group(ParallelMode.DATA),
async_op=False)
total_norm = total_norm_cuda[0].item()
else:
tensor_parallel_grads = []
no_tensor_parallel_grads = []
zero_sharded_grads = []
for p in params:
if is_model_parallel_parameter(p):
reductor = (gpc.get_world_size(ParallelMode.TENSOR) / getattr(p, NUM_PARTITIONS))**(1 / norm_type)
tensor_parallel_grads.append(p.grad.data / reductor)
elif hasattr(p, 'colo_attr') and p.colo_attr.sharded_data_tensor.is_sharded:
zero_sharded_grads.append(p.grad.data)
else:
no_tensor_parallel_grads.append(p.grad.data)
if norm_type == 2.0 and enable_cuda_kernels:
tensor_parallel_norm = _calc_l2_norm(tensor_parallel_grads)**norm_type
no_tensor_parallel_norm = _calc_l2_norm(no_tensor_parallel_grads)**norm_type
zero_sharded_norm = _calc_l2_norm(zero_sharded_grads)**norm_type
else:
tensor_parallel_norm = _calc_lp(tensor_parallel_grads, norm_type)
no_tensor_parallel_norm = _calc_lp(no_tensor_parallel_grads, norm_type)
zero_sharded_norm = _calc_lp(zero_sharded_grads, norm_type)
# If norm is type of float, then we convert them into torch.Tensor.
tensor_parallel_norm = _get_tensor_norm(tensor_parallel_norm, enable_cuda_kernels)
no_tensor_parallel_norm = _get_tensor_norm(no_tensor_parallel_norm, enable_cuda_kernels)
zero_sharded_norm = _get_tensor_norm(zero_sharded_norm, enable_cuda_kernels)
# If grads are on CPU, the norms is also on CPU. Cast them to CUDA tensors
if not enable_cuda_kernels:
tensor_parallel_norm = _move_norm_to_cuda(tensor_parallel_norm)
no_tensor_parallel_norm = _move_norm_to_cuda(no_tensor_parallel_norm)
zero_sharded_norm = _move_norm_to_cuda(zero_sharded_norm)
# Sum across all model-parallel GPUs.
if gpc.is_initialized(ParallelMode.TENSOR) and len(tensor_parallel_grads) > 0:
dist.all_reduce(tensor_parallel_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.TENSOR))
# Sum across all zero sharded GPUs
if len(zero_sharded_grads) > 0:
dist.all_reduce(zero_sharded_norm, group=gpc.get_group(ParallelMode.DATA))
no_tensor_parallel_norm += zero_sharded_norm
total_norm = tensor_parallel_norm + no_tensor_parallel_norm
if gpc.is_initialized(ParallelMode.PIPELINE) and gpc.get_world_size(ParallelMode.PIPELINE) > 1:
dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.PIPELINE))
total_norm = total_norm**(1.0 / norm_type)
if torch.is_tensor(total_norm):
total_norm = total_norm.item()
# Scale.
clip_coeff = max_norm / (total_norm + 1.0e-6)
if clip_coeff < 1.0:
if enable_cuda_kernels:
grads = [p.grad.detach() for p in params]
dummy_overflow_buf = torch.cuda.IntTensor([0])
multi_tensor_applier(fused_optim.multi_tensor_scale, dummy_overflow_buf, [grads, grads], clip_coeff)
else:
for p in params:
p.grad.detach().mul_(clip_coeff)
return total_norm
def count_zeros_fp32(parameters):
if isinstance(parameters, torch.Tensor):
parameters = [parameters]
# Filter parameters based on:
# - grad should not be none
# - parameter should not be shared
# - should not be a replica due to tensor model parallelism
total_num_zeros = 0.0
for param in parameters:
grad_not_none = param.grad is not None
is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param)
if grad_not_none and is_not_tp_duplicate:
grad = param.grad.detach()
num_zeros = grad.numel() - torch.count_nonzero(grad)
total_num_zeros = num_zeros + total_num_zeros
total_num_zeros = torch.IntTensor([int(total_num_zeros)]).cuda()
# Sum across all model-parallel GPUs.
ops = []
ops.append(
dist.all_reduce(total_num_zeros, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.TENSOR), async_op=True))
if gpc.is_initialized(ParallelMode.PIPELINE):
ops.append(
dist.all_reduce(total_num_zeros,
op=dist.ReduceOp.SUM,
group=gpc.get_group(ParallelMode.PIPELINE),
async_op=True))
for req in ops:
req.wait()
total_num_zeros = total_num_zeros.item()
return total_num_zeros
def copy_tensor_parallel_attributes(src_tensor, dst_tensor):
for attr in TENSOR_PARALLEL_ATTRIBUTES:
if hasattr(src_tensor, attr):
val = getattr(src_tensor, attr)
setattr(dst_tensor, attr, val)
def param_is_not_tensor_parallel_duplicate(param):
return (hasattr(param, IS_TENSOR_PARALLEL) and getattr(param, IS_TENSOR_PARALLEL)) or (gpc.get_local_rank(
ParallelMode.TENSOR) == 0)
@contextmanager
def switch_virtual_pipeline_parallel_rank(rank):
prev_rank = gpc.virtual_pipeline_parallel_rank
try:
gpc.set_virtual_pipeline_parallel_rank(rank)
yield
finally:
gpc.set_virtual_pipeline_parallel_rank(prev_rank)
def disposable(func: Callable) -> Callable:
executed = False
......@@ -489,3 +61,9 @@ def _cast_float(args, dtype: torch.dtype):
elif isinstance(args, dict):
args = {k: _cast_float(v, dtype) for k, v in args.items()}
return args
def set_seed(seed):
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from typing import Optional
import torch
import torch.distributed as dist
def set_to_cuda(models):
......@@ -45,3 +48,9 @@ def empty_cache():
"""
if torch.cuda.is_available():
torch.cuda.empty_cache()
def set_device(index: Optional[int] = None) -> None:
if index is None:
index = dist.get_rank() % torch.cuda.device_count()
torch.cuda.set_device(index)
import torch.nn as nn
from typing import Dict, List
import torch.distributed as dist
from colossalai.core import global_context as gpc
import torch.nn as nn
from colossalai.context.moe_context import MOE_CONTEXT
from colossalai.context import ParallelMode
from .common import is_using_ddp
from typing import Dict, List
from colossalai.legacy.context import ParallelMode
from colossalai.legacy.core import global_context as gpc
from colossalai.legacy.utils import is_using_ddp
def get_moe_epsize_param_dict(model: nn.Module) -> Dict[int, List[nn.Parameter]]:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment