Commit 08f2920e authored by zhuwenwen

init colossalai, support dtk2304

parent da3f0934
import operator
import warnings
from functools import reduce
from typing import Dict, List, Optional, Union
import torch
from colossalai.device.device_mesh import DeviceMesh
from colossalai.tensor.shape_consistency import ShapeConsistencyManager
from colossalai.tensor.sharding_spec import ShardingSpec
from torch.fx.node import Node
from ..constants import INFINITY_COST
__all__ = ['generate_sharding_spec', 'generate_resharding_costs']
def generate_sharding_spec(input_: Union[Node, torch.Tensor], device_mesh: DeviceMesh,
dim_partition_dict: Dict[int, List[int]]) -> ShardingSpec:
"""
Generate the sharding spec of the tensor based on the given dim_partition_dict.
Args:
input_ (Union[Node, torch.Tensor]): the input can be a Node object or a PyTorch tensor. If a node is used, it will look for its meta data associated with this node.
device_mesh (DeviceMesh): a DeviceMesh object which contains the meta information about the cluster.
dim_partition_dict (Dict[int, List[int]]): a dictionary to specify the sharding specs, the key is the tensor dimension and the value is the mesh dimension for sharding.
"""
if isinstance(input_, Node):
assert hasattr(input_, '_meta_data'), f'The given node has no attribute _meta_data'
meta_tensor = input_._meta_data
assert meta_tensor is not None, "The given node's _meta_data attribute is None"
shape = meta_tensor.shape
elif isinstance(input_, torch.Tensor):
shape = input_.shape
else:
raise TypeError(
f'We cannot generate sharding spec for {type(input_)} type, only torch.fx.Node or torch.Tensor is expected.'
)
for dim_index, sharding_index_list in dim_partition_dict.items():
sharding_list = [device_mesh.mesh_shape[sharding_index] for sharding_index in sharding_index_list]
sharding_size = reduce(operator.mul, sharding_list, 1)
assert shape[
dim_index] % sharding_size == 0, f'we cannot shard the {dim_index} dimension of tensor into {sharding_size} partitions.'
sharding_spec = ShardingSpec(device_mesh=device_mesh, entire_shape=shape, dim_partition_dict=dim_partition_dict)
return sharding_spec
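# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of generating a sharding spec for a plain tensor on a 2x2 device mesh.
# The DeviceMesh construction below (a physical mesh id tensor plus a mesh shape) is an
# assumption about its constructor, so the example is kept as a comment rather than executed:
#
#     physical_mesh_id = torch.arange(0, 4)
#     device_mesh = DeviceMesh(physical_mesh_id, mesh_shape=(2, 2))    # assumed signature
#     tensor = torch.empty(8, 16)
#     # shard tensor dim 0 over mesh dim 0 and tensor dim 1 over mesh dim 1
#     spec = generate_sharding_spec(tensor, device_mesh, dim_partition_dict={0: [0], 1: [1]})
#     # each device then holds a (4, 8) shard, since 8 % 2 == 0 and 16 % 2 == 0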
def generate_resharding_costs(nodes: List[Node],
sharding_specs: List[ShardingSpec],
count_backward: Optional[bool] = True,
dtype: Optional[torch.dtype] = None,
index=None):
'''
Compute the resharding costs with this specific strategy.
Args:
nodes (List[Node]): a list of nodes
sharding_specs (List[ShardingSpec]): a list of ShardingSpec objects, one for each node.
count_backward (Optional[bool]): whether to include the cost of resharding in the backward pass, default is True. False can be used for inference.
dtype (Optional[torch.dtype]): the data type for cost calculation, default is None.
index (Optional[int]): the index used to pick a ShardingSpec when a strategy outputs a list of sharding specs, default is None.
'''
# The resharding_cost of weight is counted due to sharing weight cases.
resharding_costs = {}
size_per_elem_bytes = torch.tensor([], dtype=dtype).element_size()
# shape consistency manager is a singleton class
shape_consistency_manager = ShapeConsistencyManager()
for input_node, input_spec in zip(nodes, sharding_specs):
resharding_costs[input_node] = []
for strategy in input_node.strategies_vector:
input_sharding_spec = strategy.output_sharding_spec
if not isinstance(input_sharding_spec, ShardingSpec):
assert isinstance(input_sharding_spec, list), 'only ShardingSpec or List[ShardingSpec] is expected.'
input_sharding_spec = input_sharding_spec[index]
assert isinstance(input_sharding_spec, ShardingSpec), f'The input node should NOT be a tuple of tensor.'
try:
# compute the resharding cost
_, _, total_resharding_cost = shape_consistency_manager.shape_consistency(
input_sharding_spec, input_spec)
# we need to multiply by the element size of the dtype to get the correct communication cost
resharding_cost = total_resharding_cost["total"] * size_per_elem_bytes
except AssertionError as e:
warnings.warn(f'{e}')
resharding_cost = INFINITY_COST
resharding_costs[input_node].append(resharding_cost)
return resharding_costs
import functools
from typing import Any, Callable, Dict, List, Tuple, Type, Union
import torch
from colossalai.logging import get_dist_logger
from colossalai.tensor.sharding_spec import ShardingSpec, ShardingSpecException
__all__ = ['ignore_sharding_exception', 'pytree_map']
def ignore_sharding_exception(func):
"""
A function wrapper to handle the ShardingSpecException in the function.
If ShardingSpecException occurs, this function will return None.
Usage:
# mute the assertion error in the function
@ignore_sharding_exception
def do_something():
...
"""
@functools.wraps(func)
def wrapper(*args, **kwargs):
try:
logger = get_dist_logger()
rst = func(*args, **kwargs)
return rst
except ShardingSpecException as e:
logger.debug(e)
return None
return wrapper
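# --- Usage sketch (illustrative, not part of the original module) ---
# A decorated helper that raises ShardingSpecException simply returns None instead of
# propagating the error; the exception message is only logged at debug level.
# The helper name below is hypothetical.
@ignore_sharding_exception
def _raise_invalid_sharding_demo():
    raise ShardingSpecException('illegal sharding spec for this strategy')


assert _raise_invalid_sharding_demo() is None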
def check_sharding_spec_validity(sharding_spec: ShardingSpec, tensor: torch.Tensor):
"""
This function checks whether the ShardingSpec is valid for the physical tensor.
This check includes 3 items:
1. the sharding spec covers all dimensions of the physical tensor
2. the size of each sharded dimension is divisible by the number of devices it is sharded over.
3. the sharding spec's entire shape must match the tensor shape
"""
# make sure all dims are covered in sharding spec
sharding_len = len(sharding_spec.sharding_sequence)
tensor_num_dim = tensor.dim()
num_devices_in_col = sharding_spec.device_mesh.mesh_shape[0]
num_devices_in_row = sharding_spec.device_mesh.mesh_shape[1]
assert sharding_len == tensor_num_dim, \
f'The ShardingSpec ({sharding_spec.sharding_sequence}) is created for {sharding_len}-dimension tensor, but the given tensor is {tensor_num_dim}-dimension ({tensor.shape}).'
# make sure the sharding is valid for each dim
for i in range(tensor_num_dim):
dim_size = tensor.shape[i]
dim_spec = sharding_spec.sharding_sequence[i]
if str(dim_spec).startswith('S'):
devices_str = str(dim_spec).lstrip('S')
num_devices = 1
if '0' in devices_str:
num_devices *= num_devices_in_col
if '1' in devices_str:
num_devices *= num_devices_in_row
assert dim_size >= num_devices and dim_size % num_devices == 0, \
f'The dimension at index {i} has value {dim_size}, but it is sharded over {num_devices} devices.'
# make sure the entire shape matches the physical tensor shape
assert sharding_spec.entire_shape == tensor.shape, \
f'The entire_shape of the sharding spec {sharding_spec.entire_shape} does not match the tensor shape {tensor.shape}'
def pytree_map(obj: Any, fn: Callable, process_types: Union[Type, Tuple[Type]] = (), map_all: bool = False) -> Any:
"""process object recursively, like pytree
Args:
obj (:class:`Any`): object to process
fn (:class:`Callable`): a function to process subobject in obj
process_types (:class: `type | tuple[type]`): types to determine the type to process
map_all (:class: `bool`): if map_all is True, then any type of element will use fn
Returns:
:class:`Any`: returns have the same structure of `obj` and type in process_types after map of `fn`
"""
if isinstance(obj, dict):
return {k: pytree_map(obj[k], fn, process_types, map_all) for k in obj}
elif isinstance(obj, tuple):
return tuple(pytree_map(o, fn, process_types, map_all) for o in obj)
elif isinstance(obj, list):
return list(pytree_map(o, fn, process_types, map_all) for o in obj)
elif isinstance(obj, process_types):
return fn(obj)
else:
return fn(obj) if map_all else obj
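# --- Usage sketch (illustrative, not part of the original module) ---
# Double every int nested inside dicts, lists and tuples; the string leaf is left
# untouched because map_all defaults to False.
_pytree_demo = pytree_map({'a': [1, 2], 'b': (3, 'keep-me')}, fn=lambda x: x * 2, process_types=int)
assert _pytree_demo == {'a': [2, 4], 'b': (6, 'keep-me')}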
from enum import Enum
from typing import Dict, List, Tuple
import torch
class PreviousStatus(Enum):
"""
This class shows the status of the previous comparison.
"""
RESET = 0
# ORIGIN means the dimension size of the original tensor was larger in the previous comparison.
ORIGIN = 1
# TGT means the dimension size of the target tensor was larger in the previous comparison.
TGT = 2
def detect_reshape_mapping(origin_shape: torch.Size, tgt_shape: torch.Size) -> Dict[Tuple[int], Tuple[int]]:
"""
This method is used to detect the reshape mapping between original tensor and target tensor.
Returns:
reshape_mapping_dict: The dictionary that maps each tuple of origin dims (keys) to the related
target dims (values) during the reshape operation.
Examples:
import torch
origin_shape = torch.Size([4, 4, 4])
tgt_shape = torch.Size([2, 8, 2, 2])
reshape_mapping_dict = detect_reshape_mapping(origin_shape, tgt_shape)
print(reshape_mapping_dict)
Output:
{(2,): (3, 2), (1, 0): (1,), (0,): (0, 1)}
"""
# reverse the shape object
origin_shape = list(origin_shape)
tgt_shape = list(tgt_shape)
origin_shape.reverse()
tgt_shape.reverse()
# initialize arguments
reshape_mapping_dict = {}
origin_len = len(origin_shape)
tgt_len = len(tgt_shape)
origin_index = 0
tgt_index = 0
original_dimension_size = origin_shape[origin_index]
tgt_dimension_size = tgt_shape[tgt_index]
tgt_dims = [tgt_len - tgt_index - 1]
origin_dims = [origin_len - origin_index - 1]
previous_label = PreviousStatus.RESET
while origin_index != len(origin_shape) or tgt_index != len(tgt_shape):
if original_dimension_size == tgt_dimension_size:
reshape_mapping_dict[tuple(origin_dims)] = tuple(tgt_dims)
# if the origin_dims has no element, it means the original tensor has been fully matched.
# Therefore, we do not have to increase the origin_index for that case.
if len(origin_dims) > 0:
origin_index += 1
# if the tgt_dims has no element, it means the target tensor has been fully matched.
# Therefore, we do not have to increase the tgt_index for that case.
if len(tgt_dims) > 0:
tgt_index += 1
# the last iteration of the loop always ends in this equal-size branch,
# so we need to manually skip the preparation for the next step
# once both indices have reached the end.
if origin_index == len(origin_shape) and tgt_index == len(tgt_shape):
continue
# If origin_index equals origin_len, we just need to set the original_dimension_size
# to 1 to match the remaining '1's in the target tensor shape.
if origin_index == len(origin_shape):
original_dimension_size = 1
origin_dims = []
else:
original_dimension_size = origin_shape[origin_index]
origin_dims = [origin_len - origin_index - 1]
# If tgt_index equals tgt_len, we just need to set the tgt_dimension_size
# to 1 to match the remaining '1's in the original tensor shape.
if tgt_index == len(tgt_shape):
tgt_dimension_size = 1
tgt_dims = []
else:
tgt_dimension_size = tgt_shape[tgt_index]
tgt_dims = [tgt_len - tgt_index - 1]
previous_label = PreviousStatus.RESET
elif original_dimension_size > tgt_dimension_size:
tgt_index += 1
if previous_label == PreviousStatus.TGT:
# if the target dimension size was larger in the previous comparison, it means
# the origin dimension size has now accumulated past the target dimension size, so
# we need to offload the origin dims and tgt dims into the reshape_mapping_dict.
reshape_mapping_dict[tuple(origin_dims)] = tuple(tgt_dims)
original_dimension_size = original_dimension_size // tgt_dimension_size
origin_dims = [origin_len - origin_index - 1]
tgt_dimension_size = tgt_shape[tgt_index]
tgt_dims = [tgt_len - tgt_index - 1, tgt_len - tgt_index]
# reset the previous_label after offloading the origin dims and tgt dims
previous_label = PreviousStatus.RESET
else:
# accumulate the tgt_dimension_size until it is larger than the original_dimension_size
tgt_dimension_size *= tgt_shape[tgt_index]
tgt_dims.append(tgt_len - tgt_index - 1)
previous_label = PreviousStatus.ORIGIN
else:
origin_index += 1
if previous_label == PreviousStatus.ORIGIN:
# if the origin dimension size was larger in the previous comparison, it means
# the target dimension size has now accumulated past the origin dimension size, so
# we need to offload the origin dims and tgt dims into the reshape_mapping_dict.
reshape_mapping_dict[tuple(origin_dims)] = tuple(tgt_dims)
tgt_dimension_size = tgt_dimension_size // original_dimension_size
tgt_dims = [tgt_len - tgt_index - 1]
original_dimension_size = origin_shape[origin_index]
origin_dims = [origin_len - origin_index - 1, origin_len - origin_index]
# reset the previous_label after offloading the origin dims and tgt dims
previous_label = PreviousStatus.RESET
else:
# accumulate the original_dimension_size until it is larger than the tgt_dimension_size
original_dimension_size *= origin_shape[origin_index]
origin_dims.append(origin_len - origin_index - 1)
previous_label = PreviousStatus.TGT
return reshape_mapping_dict
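# --- Usage sketch (illustrative, not part of the original module) ---
# Reproduce the docstring example above: reshape a (4, 4, 4) tensor into (2, 8, 2, 2).
# The dims are discovered from the reversed traversal, which explains the tuple ordering.
_reshape_demo = detect_reshape_mapping(torch.Size([4, 4, 4]), torch.Size([2, 8, 2, 2]))
assert _reshape_demo == {(2,): (3, 2), (1, 0): (1,), (0,): (0, 1)}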
def check_keep_sharding_status(input_dim_partition_dict: Dict[int, List[int]],
reshape_mapping_dict: Dict[Tuple[int], Tuple[int]]) -> bool:
"""
This method is used to check whether the reshape operation can be implemented without converting
the input to a fully replicated status.
Rule:
For a sharded dimension of the input tensor, if it is not the minimum element of its input tuple,
the function will return False.
To illustrate this rule, there are two cases to analyse:
1. no sharded dims in the input tuple: we could do the reshape operation safely, just as for a normal
operation without a distributed tensor.
2. sharded dims in the input tuple: the sharded dim must be the minimum element; then during the shape
consistency process, torch.cat will be applied on the sharded dim, and everything after the sharded
dim gets recovered.
Examples:
# the second dimension of the input has been sharded.
input_dim_partition_dict = {1: [1]}
origin_shape = torch.Size([8, 4, 2])
tgt_shape = torch.Size([2, 4, 8])
reshape_mapping_dict = detect_reshape_mapping(origin_shape, tgt_shape)
# {(2, 1): (2,), (0,): (1, 0)}
# the sharded dim of input is 1, which is the minimum element of the tuple (2, 1),
# so we do not have to convert the input to fully replicated status.
print(check_keep_sharding_status(input_dim_partition_dict, reshape_mapping_dict))
Output:
True
"""
sharded_dims = list(input_dim_partition_dict.keys())
for input_dims in reshape_mapping_dict.keys():
# if input_dims has no element, we could just skip this iteration.
if len(input_dims) == 0:
continue
min_element = min(input_dims)
for dim in input_dims:
if dim in sharded_dims and dim != min_element:
return False
return True
def infer_output_dim_partition_dict(input_dim_partition_dict: Dict[int, List[int]],
reshape_mapping_dict: Dict[Tuple[int], Tuple[int]]) -> Dict[Tuple[int], Tuple[int]]:
"""
This method is used to infer the output dim partition dict for a reshape operation,
given the input dim partition dict and reshape mapping dict.
"""
assert check_keep_sharding_status(input_dim_partition_dict, reshape_mapping_dict), \
'we only infer the output dim partition dict for reshape operations that can keep the sharding spec.'
sharded_dims = list(input_dim_partition_dict.keys())
output_dim_partition_dict = {}
for input_dims, output_dims in reshape_mapping_dict.items():
for dim in input_dims:
if dim in sharded_dims:
output_dim_partition_dict[min(output_dims)] = input_dim_partition_dict[dim]
# we could break because input dims cannot contain two sharded dims, otherwise
# the keep sharding status check will fail.
break
return output_dim_partition_dict
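# --- Usage sketch (illustrative, not part of the original module) ---
# With the second input dimension sharded over mesh dim 1, the reshape from (8, 4, 2) to
# (2, 4, 8) keeps its sharding spec, and the sharding moves to output dim 2, the minimum
# of the matching output dim tuple.
_mapping_demo = detect_reshape_mapping(torch.Size([8, 4, 2]), torch.Size([2, 4, 8]))
assert check_keep_sharding_status({1: [1]}, _mapping_demo)
assert infer_output_dim_partition_dict({1: [1]}, _mapping_demo) == {2: [1]}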
import operator
from copy import deepcopy
from functools import reduce
from typing import Dict
import torch
from colossalai.tensor.sharding_spec import ShardingSpec
__all__ = [
'transpose_partition_dim', 'update_partition_dim', 'enumerate_all_possible_1d_sharding',
'enumerate_all_possible_2d_sharding', 'generate_sharding_size'
]
def transpose_partition_dim(sharding_spec: ShardingSpec, dim1: int, dim2: int) -> ShardingSpec:
"""
Switch the sharding mesh dimensions for two tensor dimensions. This operation is in-place.
Args:
sharding_spec (ShardingSpec): the sharding spec for which partition dims are switched
dim1 (int): the tensor dimension to switch
dim2 (int): the tensor dimension to switch
"""
assert len(sharding_spec.entire_shape) >= 2, \
'The entire_shape of the sharding spec must have at least 2 dimensions'
dim_partition_dict = sharding_spec.dim_partition_dict
# transpose the dim partition
dim1_partition = dim_partition_dict.pop(dim1, None)
dim2_partition = dim_partition_dict.pop(dim2, None)
if dim1_partition:
dim_partition_dict[dim2] = dim1_partition
if dim2_partition:
dim_partition_dict[dim1] = dim2_partition
# get the transposed shape
new_shape = list(sharding_spec.entire_shape[:])
new_shape[dim2], new_shape[dim1] = new_shape[dim1], new_shape[dim2]
new_shape = torch.Size(new_shape)
# re-init the sharding spec
sharding_spec.__init__(sharding_spec.device_mesh, new_shape, dim_partition_dict)
return sharding_spec
def update_partition_dim(sharding_spec: ShardingSpec,
dim_mapping: Dict[int, int],
physical_shape: torch.Size,
inplace: bool = False):
"""
This method is used to update the partition dim dict from the logical one to the physical one.
Args:
sharding_spec (ShardingSpec): the sharding spec for which partition dims are updated
dim_mapping (Dict[int, int]): the mapping from the logical tensor dimension to the physical tensor dimension
physical_shape (torch.Size): the physical shape for the tensor
inplace (bool): whether to modify the given sharding spec in place, default is False
"""
if inplace:
current_sharding_spec = sharding_spec
else:
current_sharding_spec = deepcopy(sharding_spec)
old_dim_partition_dict = current_sharding_spec.dim_partition_dict
new_dim_partition_dict = {}
# assign new dim
for old_dim, new_dim in dim_mapping.items():
mesh_dims = old_dim_partition_dict.pop(old_dim)
new_dim_partition_dict[new_dim] = mesh_dims
for tensor_dim, mesh_dims in old_dim_partition_dict.items():
if tensor_dim in new_dim_partition_dict:
raise KeyError(f"There are duplicated entries for the tensor sharding dimension {tensor_dim}")
else:
new_dim_partition_dict[tensor_dim] = mesh_dims
# update sharding spec
current_sharding_spec.__init__(device_mesh=sharding_spec.device_mesh,
entire_shape=physical_shape,
dim_partition_dict=new_dim_partition_dict)
return current_sharding_spec
def enumerate_all_possible_2d_sharding(mesh_dim_0, mesh_dim_1, dim_size):
dim_partition_list = []
# enumerate all the 2D sharding cases
for i in range(dim_size):
for j in range(i + 1, dim_size):
dim_partition_dict_0 = {i: [mesh_dim_0], j: [mesh_dim_1]}
dim_partition_dict_1 = {i: [mesh_dim_1], j: [mesh_dim_0]}
dim_partition_list.append(dim_partition_dict_0)
dim_partition_list.append(dim_partition_dict_1)
for i in range(dim_size):
dim_partition_dict_flatten = {i: [mesh_dim_0, mesh_dim_1]}
dim_partition_list.append(dim_partition_dict_flatten)
return dim_partition_list
def enumerate_all_possible_1d_sharding(mesh_dim_0, dim_size):
dim_partition_list = []
# enumerate all the 1D sharding cases
for i in range(dim_size):
dim_partition_dict_0 = {i: [mesh_dim_0]}
dim_partition_list.append(dim_partition_dict_0)
return dim_partition_list
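# --- Usage sketch (illustrative, not part of the original module) ---
# Enumerate sharding candidates for a 3-dimensional tensor over mesh dims 0 and 1.
# 1D sharding yields one candidate per tensor dim, while 2D sharding also covers the
# flattened case where one tensor dim uses both mesh dims.
assert enumerate_all_possible_1d_sharding(0, 3) == [{0: [0]}, {1: [0]}, {2: [0]}]
_2d_demo = enumerate_all_possible_2d_sharding(0, 1, 3)
assert {0: [0], 1: [1]} in _2d_demo and {2: [0, 1]} in _2d_demo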
def generate_sharding_size(dim_partition_dict, device_mesh):
total_sharding_size = 1
for mesh_dim_list in dim_partition_dict.values():
mesh_dim_sharding_size = [device_mesh.shape[mesh_dim] for mesh_dim in mesh_dim_list]
sharding_size = reduce(operator.mul, mesh_dim_sharding_size)
total_sharding_size *= sharding_size
return total_sharding_size
from .builder import build_from_config, build_from_registry, build_gradient_handler
__all__ = ['build_gradient_handler', 'build_from_config', 'build_from_registry']
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import inspect
from colossalai.registry import *
def build_from_config(module, config: dict):
"""Returns an object of :class:`module` constructed from `config`.
Args:
module: A python or user-defined class
config: A python dict containing information used in the construction of the return object
Returns: An ``object`` of interest
Raises:
AssertionError: Raises an AssertionError if `module` is not a class
"""
assert inspect.isclass(module), 'module must be a class'
return module(**config)
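# --- Usage sketch (illustrative, not part of the original module) ---
# Build an instance of a locally defined class from a config dict. The class and
# field names below are hypothetical.
class _DemoLayerConfig:
    def __init__(self, in_features: int, out_features: int):
        self.in_features = in_features
        self.out_features = out_features


_demo_layer = build_from_config(_DemoLayerConfig, {'in_features': 16, 'out_features': 32})
assert _demo_layer.out_features == 32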
def build_from_registry(config, registry: Registry):
r"""Returns an object constructed from `config`, the type of the object
is specified by `registry`.
Note:
the `config` is used to construct the return object of types such as `LAYERS`, `OPTIMIZERS`
and other supported types in `registry`. The `config` should contain
all required parameters of the corresponding object. The details of the supported
types in `registry` and the `mod_type` in `config` can be found in
`registry <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/registry/__init__.py>`_.
Args:
config (dict or :class:`colossalai.context.Config`): information
used in the construction of the return object.
registry (:class:`Registry`): A registry specifying the type of the return object
Returns:
A Python object specified by `registry`.
Raises:
Exception: Raises an Exception if an error occurred when building from registry.
"""
config_ = config.copy() # keep the original config untouched
assert isinstance(registry, Registry), f'Expected type Registry but got {type(registry)}'
mod_type = config_.pop('type')
assert registry.has(mod_type), f'{mod_type} is not found in registry {registry.name}'
try:
obj = registry.get_module(mod_type)(**config_)
except Exception as e:
print(f'An error occurred when building {mod_type} from registry {registry.name}', flush=True)
raise e
return obj
def build_gradient_handler(config, model, optimizer):
"""Returns a gradient handler object of :class:`BaseGradientHandler` constructed from `config`,
`model` and `optimizer`.
Args:
config (dict or :class:`colossalai.context.Config`): A python dict or
a :class:`colossalai.context.Config` object containing information
used in the construction of the ``GRADIENT_HANDLER``.
model (:class:`nn.Module`): A model containing parameters for the gradient handler
optimizer (:class:`torch.optim.Optimizer`): An optimizer object containing parameters for the gradient handler
Returns:
An object of :class:`colossalai.engine.BaseGradientHandler`
"""
config_ = config.copy()
config_['model'] = model
config_['optimizer'] = optimizer
return build_from_registry(config_, GRADIENT_HANDLER)
from .cli import cli
__all__ = ['cli']
import click
from .utils import *
from .benchmark import run_benchmark
from colossalai.context import Config
__all__ = ['benchmark']
@click.command()
@click.option("-g", "--gpus", type=int, default=None, help="Total number of devices to use.")
@click.option("-b", "--batch_size", type=int, default=8, help="Batch size of the input tensor.")
@click.option("-s", "--seq_len", type=int, default=512, help="Sequence length of the input tensor.")
@click.option("-d", "--dimension", type=int, default=1024, help="Hidden dimension of the input tensor.")
@click.option("-w", "--warmup_steps", type=int, default=10, help="The number of warmup steps.")
@click.option("-p", "--profile_steps", type=int, default=50, help="The number of profiling steps.")
@click.option("-l", "--layers", type=int, default=2)
@click.option("-m",
"--model",
type=click.Choice(['mlp'], case_sensitive=False),
default='mlp',
help="Select the model to benchmark, currently only supports MLP")
def benchmark(gpus: int, batch_size: int, seq_len: int, dimension: int, warmup_steps: int, profile_steps: int,
layers: int, model: str):
args_dict = locals()
args = Config(args_dict)
run_benchmark(args)
import colossalai
import click
import torch.multiprocessing as mp
from functools import partial
from typing import List, Dict
from colossalai.context import Config
from colossalai.context.random import reset_seeds
from colossalai.core import global_context as gpc
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.utils import free_port, MultiTimer
from colossalai.cli.benchmark.utils import find_all_configs, profile_model, get_batch_data
from .models import MLP
def run_benchmark(args: Config) -> None:
"""
Run benchmarking with torch.multiprocessing.
"""
# sanity checks
if args.gpus is None:
click.echo("Error: --num_gpus is not given")
exit()
if args.gpus <= 1:
click.echo("Warning: tensor parallel will be activated with at least 2 devices.")
click.echo("=== Benchmarking Parameters ===")
for k, v in args.items():
click.echo(f'{k}: {v}')
click.echo('')
config_list = find_all_configs(args.gpus)
avail_ports = [free_port() for _ in range(len(config_list))]
run_func = partial(run_dist_profiling,
world_size=args.gpus,
port_list=avail_ports,
config_list=config_list,
hyperparams=args)
mp.spawn(run_func, nprocs=args.gpus)
def run_dist_profiling(rank: int, world_size: int, port_list: List[int], config_list: List[Dict],
hyperparams: Config) -> None:
"""
A function executed for profiling; it should be spawned by torch.multiprocessing.
Args:
rank (int): rank of the process
world_size (int): the number of processes
port_list (List[int]): a list of free ports for initializing distributed networks
config_list (List[Dict]): a list of configurations
hyperparams (Config): the hyperparameters given by the user
"""
# disable logging for clean output
disable_existing_loggers()
logger = get_dist_logger()
logger.set_level('WARNING')
for config, port in zip(config_list, port_list):
colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
timer = MultiTimer()
# 1D parallel should be skipped if in_features or out_features is not divisible by the 1D parallel size.
if config.parallel.tensor.mode == '1d' and hyperparams.dimension % config.parallel.tensor.size != 0:
click.echo(
"1D parallel will be skipped because in_features or out_features is not divisible by the 1D parallel size."
)
continue
if hyperparams.model == 'mlp':
model = MLP(dim=hyperparams.dimension, layers=hyperparams.layers)
else:
if gpc.get_global_rank() == 0:
click.echo("Error: Invalid argument for --model")
exit()
data_func = partial(get_batch_data,
dim=hyperparams.dimension,
batch_size=hyperparams.batch_size,
seq_length=hyperparams.seq_len,
mode=config.parallel.tensor.mode)
fwd_time, bwd_time, max_allocated, max_cached = profile_model(model=model,
warmup_steps=hyperparams.warmup_steps,
profile_steps=hyperparams.profile_steps,
data_func=data_func,
timer=timer)
gpc.destroy()
reset_seeds()
if gpc.get_global_rank() == 0:
config_str = ', '.join([f'{k}: {v}' for k, v in config.parallel.tensor.items()])
click.echo(f"=== {config_str} ===")
click.echo(f"Average forward time: {fwd_time}")
click.echo(f"Average backward time: {bwd_time}")
click.echo(f"Max allocated GPU memory: {max_allocated}")
click.echo(f"Max cached GPU memory: {max_cached}\n")
import torch
import colossalai.nn as col_nn
class MLP(torch.nn.Module):
def __init__(self, dim: int, layers: int):
super().__init__()
self.layers = torch.nn.ModuleList()
for _ in range(layers):
self.layers.append(col_nn.Linear(dim, dim))
def forward(self, x):
for layer in self.layers:
x = layer(x)
return x
import math
import time
import torch
from colossalai.utils import MultiTimer
from colossalai.context import ParallelMode, Config
from typing import List, Dict, Tuple, Callable
def get_time_stamp() -> float:
"""
Return the time stamp for profiling.
Returns:
time_stamp (float): the time given by time.time()
"""
torch.cuda.synchronize()
time_stamp = time.time()
return time_stamp
def get_memory_states() -> Tuple[float]:
"""
Return the memory statistics.
Returns:
max_allocated (float): the maximum allocated CUDA memory in GB
max_cached (float): the maximum reserved (cached) CUDA memory in GB
"""
max_allocated = torch.cuda.max_memory_allocated() / (1024**3)
max_cached = torch.cuda.max_memory_reserved() / (1024**3)
torch.cuda.reset_peak_memory_stats()
torch.cuda.empty_cache()
return max_allocated, max_cached
def find_all_configs(device_cnt: int) -> List[Dict]:
"""
Find all possible configurations for tensor parallelism
Args:
device_cnt (int): the number of devices
Returns:
config_list (List[Dict]): a list of configurations
"""
def _is_square(num):
# 2D parallel should be implemented with at least 2 devices.
if num <= 1:
return False
return math.floor(math.sqrt(num))**2 == num
def _is_cube(num):
# 3D parallel should be implemented with at least 2 devices.
if num <= 1:
return False
return math.floor(num**(1. / 3.))**3 == num
config_list = []
# add non-parallel config
config = dict(parallel=dict(tensor=dict(size=device_cnt, mode=None)))
config_list.append(config)
# add 1D config
config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='1d')))
config_list.append(config)
# add 2D config only if device_cnt is a square
if _is_square(device_cnt):
config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='2d')))
config_list.append(config)
# check for 2.5D
# iterate over depth
for depth in range(1, device_cnt):
if device_cnt % depth == 0 and _is_square(device_cnt // depth):
config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='2.5d', depth=depth)))
config_list.append(config)
# check for 3D if device_cnt is a cube
if _is_cube(device_cnt):
config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='3d')))
config_list.append(config)
config_list = [Config(cfg) for cfg in config_list]
return config_list
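# --- Usage sketch (illustrative, not part of the original module) ---
# With 4 devices we expect the non-parallel, 1D, 2D and 2.5D (depth=1) tensor-parallel
# configurations, but no 3D one since 4 is a square but not a cube.
_demo_configs = find_all_configs(device_cnt=4)
assert [cfg.parallel.tensor.mode for cfg in _demo_configs] == [None, '1d', '2d', '2.5d']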
def profile_model(model: torch.nn.Module, warmup_steps: int, profile_steps: int, data_func: Callable,
timer: MultiTimer) -> Tuple[float]:
"""
Profile the forward and backward of a model
Args:
model (torch.nn.Module): a PyTorch model
warmup_steps (int): the number of steps for warmup
profile_steps (int): the number of steps for profiling
data_func (Callable): a function to generate random data
timer (colossalai.utils.MultiTimer): a timer instance for time recording
Returns:
fwd_time (float): the average time taken by the forward pass in seconds
bwd_time (float): the average time taken by the backward pass in seconds
max_allocated (float): the maximum GPU memory allocated in GB
max_cached (float): the maximum GPU memory cached in GB
"""
def _run_step(data):
timer.start('forward')
out = model(data)
timer.stop('forward', keep_in_history=True)
timer.start('backward')
out.mean().backward()
timer.stop('backward', keep_in_history=True)
data_list = [data_func() for _ in range(warmup_steps)]
for data in data_list:
_run_step(data)
timer.reset('forward')
timer.reset('backward')
for _ in range(profile_steps):
data = data_func()
_run_step(data)
max_allocated, max_cached = get_memory_states()
fwd_time = timer.get_timer('forward').get_history_mean()
bwd_time = timer.get_timer('backward').get_history_mean()
return fwd_time, bwd_time, max_allocated, max_cached
def get_batch_data(dim: int, batch_size: int, seq_length: int, mode: ParallelMode) -> torch.Tensor:
"""
Return random data of shape (batch_size, seq_length, dim) for profiling.
Args:
dim (int): hidden size
batch_size (int): the number of data samples
seq_length (int): the number of tokens
mode (ParallelMode): Colossal-AI ParallelMode enum
Returns:
data (torch.Tensor): random data
"""
if mode in ['2d', '2.5d']:
batch_size = batch_size // 2
dim = dim // 2
elif mode == '3d':
batch_size = batch_size // 4
dim = dim // 2
data = torch.rand(batch_size, seq_length, dim).cuda()
return data
import click
from .check_installation import check_installation
__all__ = ['check']
@click.command(help="Check if Colossal-AI is correct based on the given option")
@click.option('-i', '--installation', is_flag=True, help="Check if Colossal-AI is built correctly")
def check(installation):
if installation:
check_installation()
return
click.echo("No option is given")
import subprocess
import click
import torch
from torch.utils.cpp_extension import CUDA_HOME
import colossalai
def check_installation():
cuda_ext_installed = _check_cuda_extension_installed()
cuda_version, torch_version, torch_cuda_version = _check_cuda_torch()
colossalai_version, torch_version_required, cuda_version_required = _parse_colossalai_version()
cuda_compatibility = _get_compatibility_string([cuda_version, torch_cuda_version, cuda_version_required])
torch_compatibility = _get_compatibility_string([torch_version, torch_version_required])
click.echo(f'#### Installation Report ####\n')
click.echo(f"Colossal-AI version: {colossalai_verison}")
click.echo(f'----------------------------')
click.echo(f"PyTorch Version: {torch_version}")
click.echo(f"PyTorch Version required by Colossal-AI: {torch_version_required}")
click.echo(f'PyTorch version match: {torch_compatibility}')
click.echo(f'----------------------------')
click.echo(f"System CUDA Version: {cuda_version}")
click.echo(f"CUDA Version required by PyTorch: {torch_cuda_version}")
click.echo(f"CUDA Version required by Colossal-AI: {cuda_version_required}")
click.echo(f"CUDA Version Match: {cuda_compatibility}")
click.echo(f'----------------------------')
click.echo(f"CUDA Extension: {cuda_ext_installed}")
def _get_compatibility_string(versions):
# split version into [major, minor, patch]
versions = [version.split('.') for version in versions]
for version in versions:
if len(version) == 2:
# x means unknown
version.append('x')
for idx, version_values in enumerate(zip(*versions)):
equal = len(set(version_values)) == 1
if idx in [0, 1] and not equal:
# if the major/minor versions do not match
# return a cross
return 'x'
elif idx == 1:
# if the minor versions match
# return a tick
return u'\u2713'
else:
continue
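# --- Usage sketch (illustrative, not part of the original module) ---
# Matching major.minor versions give a tick, while a major or minor mismatch gives an 'x'.
assert _get_compatibility_string(['11.3', '11.3', '11.3']) == u'\u2713'
assert _get_compatibility_string(['11.3', '10.2']) == 'x'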
def _parse_colossalai_version():
colossalai_version = colossalai.__version__.split('+')[0]
torch_version_required = colossalai.__version__.split('torch')[1].split('cu')[0]
cuda_version_required = colossalai.__version__.split('cu')[1]
return colossalai_version, torch_version_required, cuda_version_required
def _check_cuda_extension_installed():
try:
import colossalai._C.fused_optim
is_cuda_extension_installed = u'\u2713'
except ImportError:
is_cuda_extension_installed = 'x'
return is_cuda_extension_installed
def _check_cuda_torch():
# get cuda version
if CUDA_HOME is None:
cuda_version = 'N/A (CUDA_HOME is not set)'
else:
raw_output = subprocess.check_output([CUDA_HOME + "/bin/nvcc", "-V"], universal_newlines=True)
output = raw_output.split()
release_idx = output.index("release") + 1
release = output[release_idx].split(".")
bare_metal_major = release[0]
bare_metal_minor = release[1][0]
cuda_version = f'{bare_metal_major}.{bare_metal_minor}'
# get torch version
torch_version = torch.__version__.split('+')[0]
# get cuda version in pytorch build
torch_cuda_major = torch.version.cuda.split(".")[0]
torch_cuda_minor = torch.version.cuda.split(".")[1]
torch_cuda_version = f'{torch_cuda_major}.{torch_cuda_minor}'
return cuda_version, torch_version, torch_cuda_version
import click
from .launcher import run
from .check import check
from .benchmark import benchmark
class Arguments():
def __init__(self, arg_dict):
for k, v in arg_dict.items():
self.__dict__[k] = v
@click.group()
def cli():
pass
cli.add_command(run)
cli.add_command(check)
cli.add_command(benchmark)
if __name__ == '__main__':
cli()
import click
from .run import launch_multi_processes
from colossalai.context import Config
@click.command(help="Launch distributed training on a single node or multiple nodes",
context_settings=dict(ignore_unknown_options=True))
@click.option("-H",
"-host",
"--host",
type=str,
default=None,
help="the list of hostnames to launch in the format <host1>,<host2>")
@click.option(
"--hostfile",
type=str,
default=None,
help="Hostfile path that defines the device pool available to the job, each line in the file is a hostname")
@click.option("--include",
type=str,
default=None,
help="Specify computing devices to use during execution. String format is <host1>,<host2>,"
" only effective when used with --hostfile.")
@click.option(
"--exclude",
type=str,
default=None,
help=
"Specify computing devices to NOT use during execution. Mutually exclusive with --include. Formatting is the same as --includ,"
" only effective when used with --hostfile.")
@click.option("--num_nodes",
type=int,
default=-1,
help="Total number of worker nodes to use, only effective when used with --hostfile.")
@click.option("--nproc_per_node", type=int, default=None, help="Number of GPUs to use on each node.")
@click.option("--master_port",
type=int,
default=29500,
help="(optional) Port used by PyTorch distributed for communication during distributed training.")
@click.option("--master_addr",
type=str,
default="127.0.0.1",
help="(optional) IP address of node 0, will be inferred via 'hostname -I' if not specified.")
@click.option(
"--extra_launch_args",
type=str,
default=None,
help=
"Set additional torch distributed launcher arguments such as --standalone. The format is --extra_launch_args arg1=1,arg2=2. "
"This will be converted to --arg1=1 --arg2=2 during execution")
@click.option("--ssh-port", type=int, default=None, help="(optional) the port used for ssh connection")
@click.argument("user_script", type=str)
@click.argument('user_args', nargs=-1)
def run(host: str, hostfile: str, num_nodes: int, nproc_per_node: int, include: str, exclude: str, master_addr: str,
master_port: int, extra_launch_args: str, ssh_port: int, user_script: str, user_args: str) -> None:
"""
To launch multiple processes on a single node or multiple nodes via command line.
Usage::
# run with 4 GPUs on the current node using the default port 29500
colossalai run --nproc_per_node 4 train.py
# run with 2 GPUs on the current node at port 29550
colossalai run --nproc_per_node 2 --master_port 29550 train.py
# run on two nodes
colossalai run --host <host1>,<host2> --master_addr host1 --nproc_per_node 4 train.py
# run with hostfile
colossalai run --hostfile <file_path> --master_addr <host> --nproc_per_node 4 train.py
# run with hostfile with only included hosts
colossalai run --hostfile <file_path> --master_addr host1 --include host1,host2 --nproc_per_node 4 train.py
# run with hostfile excluding the hosts selected
colossalai run --hostfile <file_path> --master_addr host1 --exclude host2 --nproc_per_node 4 train.py
"""
if not user_script.endswith('.py'):
click.echo(f'Error: invalid Python file {user_script}. Did you use a wrong option? Try colossalai run --help')
exit()
args_dict = locals()
args = Config(args_dict)
args.user_args = list(args.user_args)
launch_multi_processes(args)
from typing import List
import socket
class HostInfo:
"""
A data class to store host connection-related data.
Args:
hostname (str): name or IP address of the host
port (str): the port for ssh connection
"""
def __init__(
self,
hostname: str,
port: str = None,
):
self.hostname = hostname
self.port = port
self.is_local_host = HostInfo.is_host_localhost(hostname, port)
@staticmethod
def is_host_localhost(hostname: str, port: str = None) -> bool:
"""
Check if the host refers to the local machine.
Args:
hostname (str): name or IP address of the host
port (str): the port for ssh connection
Returns:
bool: True if it is local, False otherwise
"""
if port is None:
port = 22 # no port specified, let's just use the default ssh port
hostname = socket.getfqdn(hostname)
if hostname in ("localhost", "127.0.0.1", "0.0.0.0"):
return True
localhost = socket.gethostname()
localaddrs = socket.getaddrinfo(localhost, port)
targetaddrs = socket.getaddrinfo(hostname, port)
for (family, socktype, proto, canonname, sockaddr) in localaddrs:
for (rfamily, rsocktype, rproto, rcanonname, rsockaddr) in targetaddrs:
if rsockaddr[0] == sockaddr[0]:
return True
return False
def __str__(self):
return f'hostname: {self.hostname}, port: {self.port}'
def __repr__(self):
return self.__str__()
class HostInfoList:
"""
A data class to store a list of HostInfo objects.
"""
def __init__(self):
self.hostinfo_list = []
def append(self, hostinfo: HostInfo) -> None:
"""
Add a HostInfo object to the list.
Args:
hostinfo (HostInfo): host information
"""
self.hostinfo_list.append(hostinfo)
def remove(self, hostname: str) -> None:
"""
Remove a HostInfo object from the list by its hostname.
Args:
hostname (str): the name of the host
"""
hostinfo = self.get_hostinfo(hostname)
self.hostinfo_list.remove(hostinfo)
def get_hostinfo(self, hostname: str) -> HostInfo:
"""
Return the HostInfo object which matches with the hostname.
Args:
hostname (str): the name of the host
Returns:
hostinfo (HostInfo): the HostInfo object which matches with the hostname
"""
for hostinfo in self.hostinfo_list:
if hostinfo.hostname == hostname:
return hostinfo
raise Exception(f"Hostname {hostname} is not found")
def has(self, hostname: str) -> bool:
"""
Check if the hostname has been added.
Args:
hostname (str): the name of the host
Returns:
bool: True if added, False otherwise
"""
for hostinfo in self.hostinfo_list:
if hostinfo.hostname == hostname:
return True
return False
def __iter__(self):
return iter(self.hostinfo_list)
def __len__(self):
return len(self.hostinfo_list)
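# --- Usage sketch (illustrative, not part of the original module) ---
# 'localhost' is recognised as the local machine without an ssh lookup on most systems,
# so a single-entry pool can be built offline.
_demo_pool = HostInfoList()
_demo_pool.append(HostInfo(hostname='localhost'))
assert _demo_pool.has('localhost') and _demo_pool.get_hostinfo('localhost').is_local_host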
import fabric
from .hostinfo import HostInfo, HostInfoList
from multiprocessing import Pipe, Process
from multiprocessing import connection as mp_connection
import click
def run_on_host(hostinfo: HostInfo, workdir: str, recv_conn: mp_connection.Connection,
send_conn: mp_connection.Connection, env: dict) -> None:
"""
Use fabric connection to execute command on local or remote hosts.
Args:
hostinfo (HostInfo): host information
workdir (str): the directory to execute the command
recv_conn (multiprocessing.connection.Connection): receive messages from the master sender
send_conn (multiprocessing.connection.Connection): send messages to the master receiver
env (dict): a dictionary for environment variables
"""
fab_conn = fabric.Connection(hostinfo.hostname, port=hostinfo.port)
finish = False
env_msg = ' '.join([f'{k}=\"{v}\"' for k, v in env.items()])
# keep listening until exit
while not finish:
# receive cmd
cmds = recv_conn.recv()
if cmds == 'exit':
# exit from the loop
finish = True
break
else:
# execute the commands
try:
# cd to execute directory
with fab_conn.cd(workdir):
# propagate the runtime environment
with fab_conn.prefix(f"export {env_msg}"):
if hostinfo.is_local_host:
# execute on the local machine
fab_conn.local(cmds, hide=False)
else:
# execute on the remote machine
fab_conn.run(cmds, hide=False)
send_conn.send('success')
except Exception:
click.echo(f"Error: failed to run {cmds} on {hostinfo.hostname}")
send_conn.send('failure')
# shutdown
send_conn.send("finish")
fab_conn.close()
class MultiNodeRunner:
"""
A runner to execute commands on an array of machines. This runner
is inspired by Nezha (https://github.com/zhuzilin/NeZha).
"""
def __init__(self):
self.processes = {}
self.master_send_conns = {}
self.master_recv_conns = {}
def connect(self, host_info_list: HostInfoList, workdir: str, env: dict) -> None:
"""
Establish connections to a list of hosts
Args:
host_info_list (HostInfoList): a list of HostInfo objects
workdir (str): the directory where command is executed
env (dict): environment variables to propagate to hosts
"""
for hostinfo in host_info_list:
master_send_conn, worker_recv_conn = Pipe()
master_recv_conn, worker_send_conn = Pipe()
p = Process(target=run_on_host, args=(hostinfo, workdir, worker_recv_conn, worker_send_conn, env))
p.start()
self.processes[hostinfo.hostname] = p
self.master_recv_conns[hostinfo.hostname] = master_recv_conn
self.master_send_conns[hostinfo.hostname] = master_send_conn
def send(self, hostinfo: HostInfo, cmd: str) -> None:
"""
Send a command to a local/remote host.
Args:
hostinfo (HostInfo): host information
cmd (str): the command to execute
"""
assert hostinfo.hostname in self.master_send_conns, \
f'{hostinfo} is not found in the current connections'
conn = self.master_send_conns[hostinfo.hostname]
conn.send(cmd)
def stop_all(self) -> None:
"""
Stop connections to all hosts.
"""
for hostname, conn in self.master_send_conns.items():
conn.send('exit')
def recv_from_all(self) -> dict:
"""
Receive messages from all hosts
Returns:
msg_from_node (dict): a dictionary that contains the message from each node
"""
msg_from_node = dict()
for hostname, conn in self.master_recv_conns.items():
msg_from_node[hostname] = conn.recv()
return msg_from_node
import click
import sys
import os
import torch
from colossalai.context import Config
from .multinode_runner import MultiNodeRunner
from .hostinfo import HostInfo, HostInfoList
from typing import List
from packaging import version
# Constants that define our syntax
NODE_SEP = ','
def fetch_hostfile(hostfile_path: str, ssh_port: int) -> HostInfoList:
"""
Parse the hostfile to obtain a list of hosts.
A hostfile should look like:
worker-0
worker-1
worker-2
...
Args:
hostfile_path (str): the path to the hostfile
ssh_port (int): the port to connect to the host
"""
if not os.path.isfile(hostfile_path):
click.echo(f"Error: Unable to find the hostfile, no such file: {hostfile_path}")
exit()
with open(hostfile_path, 'r') as fd:
device_pool = HostInfoList()
for line in fd.readlines():
line = line.strip()
if line == '':
# skip empty lines
continue
# build the HostInfo object
hostname = line.strip()
hostinfo = HostInfo(hostname=hostname, port=ssh_port)
if device_pool.has(hostname):
click.echo(f"Error: found duplicate host {hostname} in the hostfile")
exit()
device_pool.append(hostinfo)
return device_pool
def parse_device_filter(device_pool: HostInfoList, include_str=None, exclude_str=None) -> HostInfoList:
'''Parse an inclusion or exclusion string and filter the device pool accordingly.
Examples:
include_str="worker-0,worker-1" will execute jobs only on worker-0 and worker-1.
exclude_str="worker-1" will use all available devices except worker-1.
Args:
device_pool (HostInfoList): a list of HostInfo objects
include_str (str): --include option passed by user, default None
exclude_str (str): --exclude option passed by user, default None
Returns:
filtered_hosts (HostInfoList): filtered hosts after inclusion/exclusion
'''
# Ensure include/exclude are mutually exclusive
if include_str and exclude_str:
click.echo("--include and --exclude are mutually exclusive, only one can be used")
exit()
# no-op
if include_str is None and exclude_str is None:
return device_pool
# Either build from scratch or remove items
if include_str:
parse_str = include_str
filtered_hosts = HostInfoList()
elif exclude_str:
parse_str = exclude_str
filtered_hosts = device_pool
# foreach node in the list
for node_config in parse_str.split(NODE_SEP):
hostname = node_config
# sanity check hostname
if not device_pool.has(hostname):
click.echo(f"Error: Hostname '{hostname}' not found in hostfile")
exit()
hostinfo = device_pool.get_hostinfo(hostname)
if include_str:
filtered_hosts.append(hostinfo)
elif exclude_str:
filtered_hosts.remove(hostname)
return filtered_hosts
def get_launch_command(
master_addr: str,
master_port: int,
nproc_per_node: int,
user_script: str,
user_args: List[str],
node_rank: int,
num_nodes: int,
extra_launch_args: str = None,
) -> str:
"""
Generate a command for distributed training.
Args:
master_addr (str): the host of the master node
master_port (int): the port of the master node
nproc_per_node (int): the number of processes to launch on each node
user_script (str): the user Python file
user_args (List[str]): the arguments for the user script
node_rank (int): the unique ID for the node
num_nodes (int): the number of nodes to execute jobs
extra_launch_args (str): extra arguments for the torch distributed launcher, default is None
Returns:
cmd (str): the command to start distributed training
"""
def _arg_dict_to_list(arg_dict):
ret = []
for k, v in arg_dict.items():
if v:
ret.append(f'--{k}={v}')
else:
ret.append(f'--{k}')
return ret
if extra_launch_args:
extra_launch_args_dict = dict()
for arg in extra_launch_args.split(','):
if '=' in arg:
k, v = arg.split('=')
extra_launch_args_dict[k] = v
else:
extra_launch_args_dict[arg] = None
extra_launch_args = extra_launch_args_dict
else:
extra_launch_args = dict()
torch_version = version.parse(torch.__version__)
assert torch_version.major == 1
if torch_version.minor < 9:
cmd = [
sys.executable, "-m", "torch.distributed.launch", f"--nproc_per_node={nproc_per_node}",
f"--master_addr={master_addr}", f"--master_port={master_port}", f"--nnodes={num_nodes}",
f"--node_rank={node_rank}"
]
else:
# extra launch args for torch distributed launcher with torch >= 1.9
default_torchrun_rdzv_args = dict(rdzv_backend="c10d",
rdzv_endpoint=f"{master_addr}:{master_port}",
rdzv_id="colossalai-default-job")
# update rdzv arguments
for key in default_torchrun_rdzv_args.keys():
if key in extra_launch_args:
value = extra_launch_args.pop(key)
default_torchrun_rdzv_args[key] = value
if torch_version.minor < 10:
cmd = [
sys.executable, "-m", "torch.distributed.run", f"--nproc_per_node={nproc_per_node}",
f"--nnodes={num_nodes}", f"--node_rank={node_rank}"
]
else:
cmd = [
"torchrun", f"--nproc_per_node={nproc_per_node}", f"--nnodes={num_nodes}", f"--node_rank={node_rank}"
]
cmd += _arg_dict_to_list(default_torchrun_rdzv_args)
cmd += _arg_dict_to_list(extra_launch_args) + [user_script] + user_args
cmd = ' '.join(cmd)
return cmd
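# --- Usage sketch (illustrative, not part of the original module) ---
# On a torch 1.10+ install (the function asserts a torch 1.x major version), a single-node
# launch command is rendered with torchrun and the default c10d rendezvous arguments.
# The script name below is hypothetical.
_demo_cmd = get_launch_command(master_addr='127.0.0.1',
                               master_port=29500,
                               nproc_per_node=4,
                               user_script='train.py',
                               user_args=[],
                               node_rank=0,
                               num_nodes=1)
# e.g. "torchrun --nproc_per_node=4 --nnodes=1 --node_rank=0 --rdzv_backend=c10d
#       --rdzv_endpoint=127.0.0.1:29500 --rdzv_id=colossalai-default-job train.py"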
def launch_multi_processes(args: Config) -> None:
"""
Launch multiple processes on a single node or multiple nodes.
The overall logic can be summarized as the pseudo code below:
if hostfile given:
hostinfo = parse_hostfile(hostfile)
hostinfo = include_or_exclude_hosts(hostinfo)
launch_on_multi_nodes(hostinfo)
elif hosts given:
hostinfo = parse_hosts(hosts)
launch_on_multi_nodes(hostinfo)
else:
launch_on_current_node()
Args:
args (Config): the arguments taken from command line
"""
assert isinstance(args, Config)
if args.nproc_per_node is None:
click.echo("--nproc_per_node did not receive any value")
exit()
# cannot accept hosts and hostfile at the same time
if args.host and args.hostfile:
click.echo("Error: hostfile and hosts are mutually exclusive, only one is required")
# check if hostfile is given
if args.hostfile:
device_pool = fetch_hostfile(args.hostfile, ssh_port=args.ssh_port)
active_device_pool = parse_device_filter(device_pool, args.include, args.exclude)
if args.num_nodes > 0:
# only keep the first num_nodes to execute jobs
updated_active_device_pool = HostInfoList()
for count, hostinfo in enumerate(active_device_pool):
if args.num_nodes == count:
break
updated_active_device_pool.append(hostinfo)
active_device_pool = updated_active_device_pool
else:
active_device_pool = None
env = os.environ.copy()
# use hosts if hostfile is not given
if args.host and active_device_pool is None:
active_device_pool = HostInfoList()
host_list = args.host.strip().split(NODE_SEP)
for hostname in host_list:
hostinfo = HostInfo(hostname=hostname, port=args.ssh_port)
active_device_pool.append(hostinfo)
if not active_device_pool:
# run on the local node if neither --host nor --hostfile is given
# add local node to host info list
active_device_pool = HostInfoList()
localhost_info = HostInfo(hostname='127.0.0.1', port=args.ssh_port)
active_device_pool.append(localhost_info)
# launch distributed processes
runner = MultiNodeRunner()
curr_path = os.path.abspath('.')
# collect environment variables from the current process
env = dict()
for k, v in os.environ.items():
# do not support multi-line env var
if v and '\n' not in v:
env[k] = v
# establish remote connection
runner.connect(host_info_list=active_device_pool, workdir=curr_path, env=env)
# execute distributed launching command
for node_id, hostinfo in enumerate(active_device_pool):
cmd = get_launch_command(master_addr=args.master_addr,
master_port=args.master_port,
nproc_per_node=args.nproc_per_node,
user_script=args.user_script,
user_args=args.user_args,
node_rank=node_id,
num_nodes=len(active_device_pool),
extra_launch_args=args.extra_launch_args)
runner.send(hostinfo=hostinfo, cmd=cmd)
runner.recv_from_all()
runner.stop_all()
runner.recv_from_all()
from .collective import all_gather, reduce_scatter, all_reduce, broadcast, reduce
from .p2p import (send_forward, send_forward_recv_forward, send_backward_recv_forward, send_backward,
send_backward_recv_backward, send_forward_recv_backward, send_forward_backward_recv_forward_backward,
recv_forward, recv_backward)
from .ring import ring_forward
from .utils import send_obj_meta, recv_obj_meta
__all__ = [
'all_gather',
'reduce_scatter',
'all_reduce',
'broadcast',
'reduce',
'send_forward',
'send_forward_recv_forward',
'send_forward_backward_recv_forward_backward',
'send_backward',
'send_backward_recv_backward',
'send_backward_recv_forward',
'send_forward_recv_backward',
'recv_backward',
'recv_forward',
'ring_forward',
'send_obj_meta',
'recv_obj_meta',
]
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import torch
import torch.distributed as dist
from torch.distributed import ReduceOp
from torch import Tensor
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
def all_gather(tensor: Tensor, dim: int, parallel_mode: ParallelMode, async_op: bool = False) -> Tensor:
r"""Gathers all tensors from the parallel group and concatenates them in a
specific dimension.
Note:
The parallel_mode should be a mode defined in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
Args:
tensor (:class:`torch.Tensor`): Tensor to be gathered.
dim (int): The dimension to concatenate along.
parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
async_op (bool, optional): Whether operations are asynchronous.
Returns:
Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of all-gather only,
if async_op is set to False. A tuple of the all-gather output and an async work handle, if async_op is set to True.
"""
depth = gpc.get_world_size(parallel_mode)
if depth == 1:
out = tensor
work = None
else:
shape = list(tensor.shape)
shape[0], shape[dim] = shape[dim], shape[0]
shape[0] *= depth
out = torch.empty(shape, dtype=tensor.dtype, device=tensor.device)
temp = list(torch.chunk(out, depth, dim=0))
group = gpc.get_cpu_group(parallel_mode) if tensor.device.type == "cpu" else gpc.get_group(parallel_mode)
work = dist.all_gather(tensor_list=temp,
tensor=tensor.transpose(0, dim).contiguous(),
group=group,
async_op=async_op)
out = torch.transpose(out, 0, dim)
if async_op:
return out, work
else:
return out
def reduce_scatter(tensor: Tensor,
dim: int,
parallel_mode: ParallelMode,
op: ReduceOp = ReduceOp.SUM,
async_op: bool = False) -> Tensor:
r"""Reduces all tensors then scatters it in a specific dimension to all
members in the parallel group.
Note:
The parallel_mode should be a mode defined in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
Args:
tensor (:class:`torch.Tensor`): Tensor to be reduce_scattered.
dim (int): The dimension along which the tensor is scattered.
parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
op (torch.distributed.ReduceOp, optional): The type of reduce operation,
should be included in [SUM, AVG, PRODUCT, MIN, MAX, BAND, BOR, BXOR].
More details about ReduceOp please refer to
`ReduceOp <https://pytorch.org/docs/stable/distributed.html#torch.distributed.ReduceOp>`_.
async_op (bool, optional): Whether operations are asynchronous.
Returns:
Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of reduce_scatter only,
if async_op is set to False. A tuple of the reduce_scatter output and an async work handle, if async_op is set to True.
"""
depth = gpc.get_world_size(parallel_mode)
if depth == 1:
out = tensor
work = None
else:
temp = list(map(lambda x: x.contiguous(), torch.chunk(tensor, depth, dim=dim)))
out = torch.empty(temp[0].shape, dtype=tensor.dtype, device=tensor.device)
group = gpc.get_cpu_group(parallel_mode) if tensor.device.type == "cpu" else gpc.get_group(parallel_mode)
work = dist.reduce_scatter(output=out, input_list=temp, op=op, group=group, async_op=async_op)
if async_op:
return out, work
else:
return out
def all_reduce(tensor: Tensor,
parallel_mode: ParallelMode,
op: ReduceOp = ReduceOp.SUM,
async_op: bool = False) -> Tensor:
r"""Reduces the tensor data across whole parallel group in such a way that all get the final result.
Note:
The parallel_mode should be a mode defined in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
Args:
tensor (:class:`torch.Tensor`): Tensor to be all-reduced.
parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
op (torch.distributed.ReduceOp, optional): The type of reduce operation,
should be included in [SUM, AVG, PRODUCT, MIN, MAX, BAND, BOR, BXOR].
More details about ReduceOp please refer to
`ReduceOp <https://pytorch.org/docs/stable/distributed.html#torch.distributed.ReduceOp>`_.
async_op (bool, optional): Whether operations are asynchronous.
Returns:
Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of all-reduce only,
if async_op is set to False. A tuple of the all-reduce output and an async work handle, if async_op is set to True.
"""
depth = gpc.get_world_size(parallel_mode)
if depth == 1:
out = tensor
work = None
else:
out = tensor.contiguous()
group = gpc.get_cpu_group(parallel_mode) if tensor.device.type == "cpu" else gpc.get_group(parallel_mode)
work = dist.all_reduce(out, op=op, group=group, async_op=async_op)
if async_op:
return out, work
else:
return out
def broadcast(tensor: Tensor, src: int, parallel_mode: ParallelMode, async_op: bool = False):
r"""Broadcast tensors to whole parallel group. Tensor must have the same
number of elements in all processes participating in the collective.
Note:
The parallel_mode should be a mode defined in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
Args:
tensor (:class:`torch.Tensor`): Tensor to be broadcast.
src (int): Source rank.
parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
async_op (bool, optional): Whether operations are asynchronous.
Returns:
Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The broadcast tensor only,
if async_op is set to False. A tuple of the broadcast tensor and an async work handle, if async_op is set to True.
"""
depth = gpc.get_world_size(parallel_mode)
if depth == 1:
out = tensor
work = None
else:
out = tensor.contiguous()
group = gpc.get_cpu_group(parallel_mode) if tensor.device.type == "cpu" else gpc.get_group(parallel_mode)
work = dist.broadcast(out, src=src, group=group, async_op=async_op)
if async_op:
return out, work
else:
return out
def reduce(tensor: Tensor, dst: int, parallel_mode: ParallelMode, op: ReduceOp = ReduceOp.SUM, async_op: bool = False):
r"""Reduce tensors across whole parallel group. Only the process with
rank ``dst`` is going to receive the final result.
Note:
The parallel_mode should be a mode defined in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
Args:
tensor (:class:`torch.Tensor`): Tensor to be reduced.
dst (int): Destination rank.
parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
async_op (bool, optional): Whether operations are asynchronous.
Returns:
Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of reduce only,
if async_op is set to False. A tuple of the reduce output and an async work handle, if async_op is set to True.
"""
depth = gpc.get_world_size(parallel_mode)
if depth == 1:
out = tensor
work = None
else:
out = tensor.contiguous()
group = gpc.get_cpu_group(parallel_mode) if tensor.device.type == "cpu" else gpc.get_group(parallel_mode)
work = dist.reduce(out, dst=dst, op=op, group=group, async_op=async_op)
if async_op:
return out, work
else:
return out
def scatter_object_list(scatter_object_output_list, scatter_object_input_list, src=0, group=None) -> None:
r"""Modified from `torch.distributed.scatter_object_list <https://pytorch.org/docs/stable/_modules/torch/distributed/distributed_c10d.html#scatter_object_list>` to fix issues
"""
if dist.distributed_c10d._rank_not_in_group(group):
return
if (not isinstance(scatter_object_output_list, list) or len(scatter_object_output_list) < 1):
raise RuntimeError("Expected argument scatter_object_output_list to be a list of size at least 1.")
# set tensor device to cuda if backend is nccl
device = torch.cuda.current_device() if dist.get_backend(group) == 'nccl' else torch.device("cpu")
my_rank = dist.get_rank() # use global rank
if my_rank == src:
tensor_list, tensor_sizes = zip(
*[dist.distributed_c10d._object_to_tensor(obj) for obj in scatter_object_input_list])
tensor_list = list(map(lambda x: x.to(device), tensor_list))
tensor_sizes = list(map(lambda x: x.to(device), tensor_sizes))
# Src rank broadcasts the maximum tensor size. This is because all ranks are
# expected to call into scatter() with equal-sized tensors.
if my_rank == src:
max_tensor_size = max(tensor_sizes)
for tensor in tensor_list:
tensor.resize_(max_tensor_size)
else:
max_tensor_size = torch.tensor([0], dtype=torch.long).to(device)
dist.broadcast(max_tensor_size, src=src, group=group)
# Scatter actual serialized objects
output_tensor = torch.empty(max_tensor_size.item(), dtype=torch.uint8).to(device)
dist.scatter(
output_tensor,
scatter_list=None if my_rank != src else tensor_list,
src=src,
group=group,
)
# Scatter per-object sizes to trim tensors when deserializing back to object
obj_tensor_size = torch.tensor([0], dtype=torch.long).to(device)
dist.scatter(
obj_tensor_size,
scatter_list=None if my_rank != src else tensor_sizes,
src=src,
group=group,
)
output_tensor, obj_tensor_size = output_tensor.cpu(), obj_tensor_size.cpu()
# Deserialize back to object
scatter_object_output_list[0] = dist.distributed_c10d._tensor_to_object(output_tensor, obj_tensor_size)