Commit da3f0934 authored by zhuwenwen's avatar zhuwenwen
Browse files

delete unused files

parent c4dd1fd4
This diff is collapsed.
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import torch.nn as nn
import torch.cuda.amp as torch_amp
from torch import Tensor
from torch.nn.modules.loss import _Loss
from torch.optim import Optimizer
from ._grad_scaler import GradScaler
from colossalai.nn.optimizer import ColossalaiOptimizer
from colossalai.utils import clip_grad_norm_fp32
class TorchAMPOptimizer(ColossalaiOptimizer):
"""A wrapper class which integrate pytorch amp with an optimizer
:param optim: A normal optimizer like Adam or SGD
:param args: Args used to initialize gradient scaler
:param kwargs: Kwargs used to initialize gradient scaler
:type optim: torch.optim.Optimizer
"""
def __init__(self, optim: Optimizer, *args, **kwargs):
super().__init__(optim)
self.scaler = GradScaler(*args, **kwargs)
def backward(self, loss: Tensor):
"""Backward with torch amp gradient scaler
:param loss: Loss computed by a loss function
:type loss: torch.Tensor
"""
self.scaler.scale(loss).backward()
def step(self):
"""Update the parameters of the model
"""
self.scaler.step(self.optim)
self.scaler.update()
def clip_grad_norm(self, model: nn.Module, max_norm: float):
"""Apply gradient clipping to the model parameters
:param model: Your model object
:type model: torch.nn.Module
:param max_norm: Max norm value for gradient clipping
:type max_norm: float
"""
if max_norm > 0.0:
self.scaler.unscale_(self.optim)
clip_grad_norm_fp32(model.parameters(), max_norm)
class TorchAMPModel(nn.Module):
"""A wrapper class for a model object which executes forward with values automatically
cast to fp16
"""
def __init__(self, model: nn.Module) -> None:
super().__init__()
self.model = model
@torch_amp.autocast()
def forward(self, *args, **kwargs):
return self.model(*args, **kwargs)
class TorchAMPLoss(nn.Module):
"""A wrapper class for a criterion object which computes the loss in mixed-precision context
:param loss: A loss function object
:type loss: torch.nn.modules.loss._Loss
"""
def __init__(self, loss: _Loss):
super().__init__()
self.loss = loss
@torch_amp.autocast()
def forward(self, *args, **kwargs):
return self.loss(*args, **kwargs)
from .builder import (build_schedule, build_lr_scheduler, build_model,
build_optimizer, build_layer, build_loss, build_hooks,
build_dataset, build_transform, build_data_sampler,
build_gradient_handler, build_ophooks)
from .pipeline import build_pipeline_model, build_pipeline_model_from_cfg
__all__ = [
'build_schedule', 'build_lr_scheduler', 'build_model', 'build_optimizer',
'build_layer', 'build_loss', 'build_hooks', 'build_dataset',
'build_transform', 'build_data_sampler', 'build_gradient_handler',
'build_pipeline_model', 'build_pipeline_model_from_cfg', 'build_ophooks'
]
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import inspect
from collections.abc import Iterable
from colossalai.registry import *
def build_from_config(module, config: dict):
"""Returns an object of :class:`module` constructed from `config`.
:param module: A python or user-defined class
:type module: class
:param config: A python dict containing information used in the construction
of the return object
:type config: dict
:raises AssertionError: Raises an AssertionError if `module` is not a class
:return: An object of interest
:rtype: Object
"""
assert inspect.isclass(module), 'module must be a class'
return module(**config)
def build_from_registry(config, registry: Registry):
"""Returns an object constructed from `config`, the type of the object
is specified by `registry`.
:param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.colossalai.context.Config`
:param registry: A registry specifying the type of the return object
:type registry: :class:`Registry`
:raises AssertionError: Raises an AssertionError if `registry` is not an object
of :class:`Registry` or `mod_type` in `config` is not found in `registry`
:raises Exception: Raises an Exception if an error occurred when building
from registry
:return: An object specified by `registry`
:rtype: Python object specified by `registry`
"""
config_ = config.copy() # keep the original config untouched
assert isinstance(
registry, Registry), f'Expected type Registry but got {type(registry)}'
mod_type = config_.pop('type')
assert registry.has(
mod_type), f'{mod_type} is not found in registry {registry.name}'
try:
obj = registry.get_module(mod_type)(**config_)
except Exception as e:
print(
f'An error occurred when building {mod_type} from registry {registry.name}',
flush=True)
raise e
return obj
def build_layer(config):
"""Returns a layer object of :class:`nn.Module` constructed from `config`.
:param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.Config`
:return: An object of :class:`torch.nn.Module`
:rtype: :class:`torch.nn.Module`
"""
return build_from_registry(config, LAYERS)
def build_loss(config):
"""Returns a loss function object of :class:`torch.autograd.Function` constructed
from `config`.
:param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.Config`
:return: An object of :class:`torch.nn.modules.loss._Loss`
:rtype: :class:`torch.nn.modules.loss._Loss`
"""
return build_from_registry(config, LOSSES)
def build_model(config):
"""Returns a model object of :class:`nn.Module` constructed from `config`.
:param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.Config`
:return: An object of :class:`torch.nn.Module`
:rtype: :class:`torch.nn.Module`
"""
return build_from_registry(config, MODELS)
def build_dataset(config):
"""Returns a dataset object of :class:`torch.utils.data.Dataset` constructed
from `config`.
:param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.Config`
:return: An object of :class:`torch.utils.data.Dataset`
:rtype: :class:`torch.utils.data.Dataset`
"""
return build_from_registry(config, DATASETS)
def build_optimizer(config, model):
"""Returns an optimizer object of :class:`torch.optim.Optimizer` constructed from `config`,
'model' and 'params'.
:param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.Config`
:param model: A model containing parameters for the optimizer
:type model: :class:`nn.Module`
:return: An object of :class:`torch.optim.Optimizer`
:rtype: :class:`torch.optim.Optimizer`
"""
config_ = config.copy()
config_['params'] = model.parameters()
return build_from_registry(config_, OPTIMIZERS)
def build_gradient_handler(config, model, optimizer):
"""Returns a gradient handler object of :class:`BaseGradientHandler` constructed from `config`,
`model` and `optimizer`.
:param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.Config`
:param model: A model containing parameters for the gradient handler
:type model: :class:`nn.Module`
:param optimizer: An optimizer object containing parameters for the gradient handler
:type optimizer: :class:`torch.optim.Optimizer`
:return: An object of :class:`colossalai.engine.BaseGradientHandler`
:rtype: :class:`colossalai.engine.BaseGradientHandler`
"""
config_ = config.copy()
config_['model'] = model
config_['optimizer'] = optimizer
return build_from_registry(config_, GRADIENT_HANDLER)
def build_hooks(config, trainer):
"""Returns a hook object of :class:`BaseHook` constructed from `config` and `trainer`.
:param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.Config`
:param trainer: A :class:`Trainer` object containing parameters for the hook
:type trainer: :class:`Trainer`
:return: An object of :class:`colossalai.trainer.hooks.BaseHook`
:rtype: :class:`colossalai.trainer.hooks.BaseHook`
"""
config_ = config.copy()
config_['trainer'] = trainer
return build_from_registry(config_, HOOKS)
def build_ophooks(config):
"""Returns a hook object of :class:`BaseOpHook` constructed from `config`.
:param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.Config`
:return: An object of :class:`colossalai.trainer.hooks.BaseOpHook`
:rtype: :class:`colossalai.trainer.hooks.BaseOpHook`
"""
config_ = config.copy()
return build_from_registry(config_, OPHOOKS)
def build_transform(config):
"""Returns a transformation object of :class:`torchvision.transforms` constructed
from `config`.
:param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.Config`
:return: An object of :class:`torchvision.transforms`
:rtype: :class:`torchvision.transforms`
"""
return build_from_registry(config, TRANSFORMS)
def build_data_sampler(config, dataset):
"""Returns a data sampler object of :class:`colossalai.nn.data.sampler.BaseSampler`
constructed from `config`.
:param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.Config`
:param dataset: An object of :class:`torch.utils.data.Dataset` containing information
used in the construction of the return object
:type dataset: :class:`torch.utils.data.Dataset`
:return: An object of :class:`colossalai.utils.data_sampler.BaseSampler`
:rtype: :class:`colossalai.utils.data_sampler.BaseSampler`
"""
config_ = config.copy()
config_['dataset'] = dataset
return build_from_registry(config_, DATA_SAMPLERS)
def build_lr_scheduler(config, optimizer):
"""Returns a learning rate scheduler object of :class:`torch.optim.lr_scheduler`
constructed from `config`, `optimizer`, `total_steps` and `num_steps_per_epoch`.
:param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.Config`
:param optimizer: An optimizer object containing parameters for the learning rate
scheduler
:type optimizer: :class:`torch.optim.Optimizer`
:return: An object of :class:`torch.optim.lr_scheduler`
:rtype: :class:`torch.optim.lr_scheduler`
"""
config_ = config.copy()
config_['optimizer'] = optimizer
return build_from_registry(config_, LR_SCHEDULERS)
def build_schedule(config):
"""Returns a schedule of :class:`colossalai.engine.schedule.BaseSchedule`.
:param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.Config`
:return: An object of :class:`colossalai.engine.schedule.BaseSchedule`
:rtype: :class:`colossalai.engine.schedule.BaseSchedule`
"""
return build_from_registry(config, SCHEDULE)
import copy
import heapq
from colossalai.builder import build_model, build_layer
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger
import torch.nn as nn
def _binary_partition(weights, st, ed):
"""Returns the binary partition position of `weights`, given the start
position `st` and the end position `ed`.
:param weights: A python list to be binary partitioned
:type weights: list
:param st: the start position of the binary partition
:type st: int
:param ed: the end postition of the binary partition
:type ed: int
:return: the binary partition position of `weights`
:rtype: int
"""
w_sum = weights[ed - 1]
prefix = 0
if st > 0:
w_sum -= weights[st - 1]
prefix = weights[st - 1]
minimum = float("inf")
for idx in range(st + 1, ed):
front = weights[idx - 1] - prefix
diff = abs(w_sum - 2 * front)
if diff < minimum:
pos = idx
minimum = diff
return st, pos, ed
def _heap_addition(weights, intervals, add_cnt):
"""
"""
def _heap_push(heap, st, ed):
value = weights[ed - 1]
if st > 0:
value -= weights[st - 1]
heapq.heappush(heap, (-value, st, ed))
ret_intervals = []
heap = []
for st, ed in intervals:
_heap_push(heap, st, ed)
while add_cnt > 0:
_, st, ed = heapq.heappop(heap)
if ed - st == 1:
ret_intervals.append((st, ed))
else:
l, m, r = _binary_partition(weights, st, ed)
_heap_push(heap, l, m)
_heap_push(heap, m, r)
add_cnt -= 1
while heap:
_, st, ed = heapq.heappop(heap)
ret_intervals.append((st, ed))
ret_intervals.sort()
return ret_intervals
def _calc_partitions(weights, value):
prev = 0
prefix = 0
num_block = 0
intervals = []
for idx, w in enumerate(weights):
if weights[idx] - prefix > value:
intervals.append((prev, idx))
prev = idx
prefix = weights[idx - 1]
num_block += 1
intervals.append((prev, len(weights)))
return num_block + 1, intervals
def _binary_search(weights, num):
length = len(weights)
prefix = [1 if w == 0 else w for w in weights]
for i in range(1, length):
prefix[i] += prefix[i - 1]
lower_bound = max(weights)
upper_bound = prefix[length - 1]
while upper_bound > lower_bound:
mid = (upper_bound + lower_bound) // 2
number, _ = _calc_partitions(prefix, mid)
if number <= num:
upper_bound = mid
else:
lower_bound = mid + 1
num_block, intervals = _calc_partitions(prefix, upper_bound)
if num_block < num:
intervals = _heap_addition(prefix, intervals, num - num_block)
return intervals
def partition_uniform(num_items, pipeline_parallel_size, num_chunks):
assert num_items % num_chunks == 0, \
"Layer length should be divided by the number of chunks, otherwise parameter method is recomended"
logger = get_dist_logger()
parts = [[] for _ in range(pipeline_parallel_size)]
partition_items = num_items // num_chunks
for idx in range(num_chunks):
base_idx = idx * partition_items
chunk_size = partition_items // pipeline_parallel_size
left = pipeline_parallel_size - partition_items % pipeline_parallel_size
if chunk_size == 0:
logger.warning("Some nodes in Pipeline have no requests")
for p in range(pipeline_parallel_size):
st = base_idx
base_idx += chunk_size + (p >= left)
parts[p].append((st, base_idx))
return parts
def partition_balanced(weights, pipeline_parallel_size, num_chunks):
num_total = pipeline_parallel_size * num_chunks
num_items = len(weights)
if num_items <= num_total:
return partition_uniform(num_items, pipeline_parallel_size, num_chunks)
intervals = _binary_search(weights, num_total)
current = 0
parts = [[] for _ in range(pipeline_parallel_size)]
for inter in intervals:
parts[current].append(inter)
current = (current + 1) % pipeline_parallel_size
return parts
def count_layer_params(layers):
"""Count the number of parameters in each layer
"""
param_counts = [0] * len(layers)
for idx, cfg in enumerate(layers):
layer = build_layer(cfg)
params = filter(lambda p: p.requires_grad, layer.parameters())
param_counts[idx] = sum(p.numel() for p in params)
return param_counts
def build_pipeline_model_from_cfg(config, num_chunks: int = 1, partition_method: str = 'parameter', verbose: bool = False):
"""An intializer to split the model into different stages for pipeline parallelism.
An example for the model config is shown below. The class VisionTransformerFromConfig should
inherit colossalai.nn.model.ModelFromConfig to allow this initializer to build model from a sequence
of layer configurations.
model_config = dict(
type='VisionTransformerFromConfig',
embedding_cfg=dict(...),
...
)
:param config: Configuration of the model
:type config: dict
:param num_chunks: The number of chunks you want to have on the current stage. This value should be 1
in most cases unless you are using virutal pipeline parallelism.
:type num_chunks: int, optional
:param partition_method: This parameter determines how you want to split your model layers into stages,
you can set it as 'layer' or 'parameter'
:type partition_method: str, optional
:param verbose: Whether to print the logs
:type verbose: bool, optional
"""
ori_model = build_model(config)
layers = ori_model.layers_cfg
layer_length = len(layers)
logger = get_dist_logger()
if verbose:
logger.info(f"The total length of layers is {layer_length}", ranks=[0])
pipeline_parallel_size = gpc.get_world_size(ParallelMode.PIPELINE)
pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
method = partition_method.lower()
# Make a partition
if method == 'layer':
num_layers = len(layers)
parts = partition_uniform(num_layers, pipeline_parallel_size, num_chunks)
elif method == 'parameter':
param_counts = count_layer_params(layers)
# print_rank_0(param_counts)
parts = partition_balanced(param_counts, pipeline_parallel_size, num_chunks)
else:
raise ValueError("Method should be a pre-set string in [layer, parameter]")
# Display the partition
if verbose:
log_str = 'Layer allocation after partitioning: \n'
for stage in range(pipeline_parallel_size):
num_layers = 0
for st, ed in parts[stage]:
num_layers += ed - st
log_str += f'\n===== stage={stage}, layers={num_layers} =====\n'
for st, ed in parts[stage]:
for idx, layer in enumerate(layers[st: ed]):
log_str += f'\t{idx + st:2d}: {layer}\n'
logger.info(log_str, ranks=[0])
# Save the partition
interval = parts[pipeline_rank]
models = []
for st, ed in interval:
model = copy.deepcopy(ori_model)
model.build_from_cfg(st, ed)
models.append(model)
return nn.ModuleList(models) if len(models) > 1 else models[0]
def build_pipeline_model(layers: nn.Sequential, num_chunks: int = 1, verbose: bool = False):
"""An intializer to split the model into different stages for pipeline parallelism.
Note that `layer` must be `torch.nn.Sequential`.
:param layers: Layers of model
:type layers: `torch.nn.Sequential`
:param num_chunks: The number of chunks you want to have on the current stage. This value should be 1
in most cases unless you are using virutal pipeline parallelism.
:type num_chunks: int, optional
:param verbose: Whether to print the logs
:type verbose: bool, optional
"""
pipeline_parallel_size = gpc.get_world_size(ParallelMode.PIPELINE)
pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
partitions = partition_uniform(len(layers), pipeline_parallel_size, num_chunks)
module_list = []
for start, end in partitions[pipeline_rank]:
module_list.append(nn.Sequential(*layers[start:end]))
if verbose:
logger = get_dist_logger()
logger.info(f'Total {len(layers)} layers', ranks=[0])
for rank, part in enumerate(partitions):
log_str = f'===== stage={rank} =====\n'
for chunk, (start, end) in enumerate(part):
log_str += f'===== chunk={chunk}, layer=[{start}-{end}] =====\n'
log_str += '\n'.join([str(layer) for layer in layers[start:end]]) + '\n'
logger.info(log_str, ranks=[0])
return nn.ModuleList(module_list) if len(module_list) > 1 else module_list[0]
from .collective import all_gather, reduce_scatter, all_reduce, broadcast, reduce
from .p2p import (send_forward, send_forward_recv_forward,
send_backward_recv_forward, send_backward,
send_backward_recv_backward, send_forward_recv_backward,
send_forward_backward_recv_forward_backward, recv_forward,
recv_backward)
from .ring import ring_forward
from .utils import send_tensor_meta, recv_tensor_meta
__all__ = [
'all_gather', 'reduce_scatter', 'all_reduce', 'broadcast', 'reduce',
'send_forward', 'send_forward_recv_forward',
'send_forward_backward_recv_forward_backward', 'send_backward',
'send_backward_recv_backward', 'send_backward_recv_forward',
'send_forward_recv_backward', 'recv_backward', 'recv_forward',
'ring_forward', 'send_tensor_meta', 'recv_tensor_meta',
]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment