Commit 7bc5a8e3 authored by zhuwenwen's avatar zhuwenwen
Browse files
parents e6748d82 0f785cb1
# coding=utf-8
from pathlib import Path
import torch
import torch.nn as nn
from typing import List, Dict, Mapping, OrderedDict, Optional, Tuple
from colossalai.tensor.d_tensor.d_tensor import DTensor
import re
SAFE_WEIGHTS_NAME = "model.safetensors"
WEIGHTS_NAME = "pytorch_model.bin"
SAFE_WEIGHTS_INDEX_NAME = "model.safetensors.index.json"
WEIGHTS_INDEX_NAME = "pytorch_model.bin.index.json"
# ======================================
# General helper functions
# ======================================
def calculate_tensor_size(tensor: torch.Tensor) -> float:
    """
    Compute the storage size of a tensor in MB.

    The result is used to decide whether a group of params exceeds the shard
    size, in which case a new shard should be created.

    Args:
        tensor (torch.Tensor): the tensor to calculate size for.

    Returns:
        float: size of the tensor in MB.
    """
    # number of elements times bytes per element gives the size in bytes
    size_in_bytes = tensor.numel() * tensor.element_size()
    return size_in_bytes / 1024 / 1024
def is_safetensors_available() -> bool:
    """
    Report whether the `safetensors` package can be imported.

    Returns:
        bool: True if `import safetensors` succeeds, False if it raises ImportError.
    """
    try:
        import safetensors    # noqa: F401 -- imported only to probe availability
    except ImportError:
        return False
    return True
def is_dtensor_checkpoint(checkpoint_file_path: str) -> bool:
    """
    Check whether the checkpoint file is a dtensor checkpoint.

    A dtensor checkpoint is recognised by the literal `.*.safetensors` or
    `.*.bin` ending (the `*` is a literal character, not a wildcard).

    Args:
        checkpoint_file_path (str): path to the checkpoint file. `pathlib.Path`
            objects are accepted as well and are converted to `str` first.

    Returns:
        bool: whether the checkpoint file is a dtensor checkpoint.
    """
    # `Path` has no `endswith`; normalise so both str and Path callers work.
    path_str = str(checkpoint_file_path)
    return path_str.endswith(('.*.safetensors', '.*.bin'))
def is_safetensor_checkpoint(checkpoint_file_path: str) -> bool:
    """
    Check whether the checkpoint file is a safetensor checkpoint.

    Args:
        checkpoint_file_path (str): path to the checkpoint file. `pathlib.Path`
            objects are accepted as well and are converted to `str` first.

    Returns:
        bool: True if the path ends with `.safetensors`, False otherwise.
    """
    # `Path` has no `endswith`; normalise so both str and Path callers work.
    return str(checkpoint_file_path).endswith('.safetensors')
# ======================================
# Helper functions for saving shard file
# ======================================
def shard_checkpoint(state_dict: Mapping[str, torch.Tensor], max_shard_size: int = 1024, weights_name: str = WEIGHTS_NAME):
    """
    Splits a model state dictionary in sub-checkpoints so that the final size of each sub-checkpoint does not exceed a
    given size. A single weight larger than the limit is placed in a shard of its own.

    Args:
        state_dict (Mapping[str, torch.Tensor]): mapping from parameter name to tensor.
        max_shard_size (int): maximum size of one shard, in MB (same unit as `calculate_tensor_size`).
        weights_name (str): base file name; its `.bin` / `.safetensors` ending receives the
            `-XXXXX-of-YYYYY` shard tag.

    Returns:
        tuple: `(shards, index)`. `shards` maps a shard file name to its sub state dict. `index` is
        `None` when everything fits in one shard, otherwise a dict with `metadata` (containing
        `total_size` in MB) and `weight_map` (parameter name -> shard file name).
    """
    sharded_state_dicts = []
    current_block = {}
    current_block_size = 0
    total_size = 0

    for key, weight in state_dict.items():
        # DTensor entries are not sharded here.
        # NOTE(review): presumably they are written through `save_dtensor` -- confirm with the caller.
        if isinstance(weight, DTensor):
            continue

        weight_size = calculate_tensor_size(weight)

        # If this weight is going to tip up over the maximal size, we split.
        # The `current_block` guard avoids emitting an empty shard when the very
        # first weight is already larger than `max_shard_size`.
        if current_block and current_block_size + weight_size > max_shard_size:
            sharded_state_dicts.append(current_block)
            current_block = {}
            current_block_size = 0

        current_block[key] = weight
        current_block_size += weight_size
        total_size += weight_size

    # Add the last block
    sharded_state_dicts.append(current_block)

    # If we only have one shard, we return it
    if len(sharded_state_dicts) == 1:
        return {weights_name: sharded_state_dicts[0]}, None

    # Otherwise, let's build the index
    num_shards = len(sharded_state_dicts)
    weight_map = {}
    shards = {}
    for idx, shard in enumerate(sharded_state_dicts):
        shard_tag = f"-{idx + 1:05d}-of-{num_shards:05d}"
        shard_file = weights_name.replace(".bin", f"{shard_tag}.bin")
        shard_file = shard_file.replace(".safetensors", f"{shard_tag}.safetensors")
        shards[shard_file] = shard
        for key in shard:
            weight_map[key] = shard_file

    # Add the metadata
    metadata = {"total_size": total_size}
    index = {"metadata": metadata, "weight_map": weight_map}
    return shards, index
def load_shard_state_dict(checkpoint_file: Path, use_safetensors: bool = False):
    """
    Load the state dict stored in one shard file.

    Args:
        checkpoint_file (Path): path to the shard file. A plain `str` is accepted too.
        use_safetensors (bool): load with `safetensors` instead of `torch.load`.

    Returns:
        The loaded state dict.

    Raises:
        Exception: `use_safetensors` is set but the file does not have a `.safetensors` suffix.
        NotImplementedError: the safetensors archive does not declare `format == "pt"` in its
            metadata (including the case where it carries no metadata at all).
    """
    checkpoint_file = Path(checkpoint_file)

    if not use_safetensors:
        return torch.load(checkpoint_file)

    if checkpoint_file.suffix != ".safetensors":
        raise Exception("load the model using `safetensors`, but no file endwith .safetensors")

    from safetensors.torch import safe_open
    from safetensors.torch import load_file as safe_load_file

    with safe_open(checkpoint_file, framework="pt") as f:
        metadata = f.metadata()

    # `metadata()` can be None for archives saved without metadata; treat that as
    # "unknown format" instead of crashing with a TypeError on subscripting None.
    archive_format = metadata.get("format") if metadata else None
    if archive_format != "pt":
        raise NotImplementedError(
            f"Conversion from a {archive_format} safetensors archive to PyTorch is not implemented yet."
        )
    return safe_load_file(checkpoint_file)
def load_state_dict_into_model(model: nn.Module, state_dict: Mapping, missing_keys: List, strict: bool = False):
    r"""Copies parameters and buffers from :attr:`state_dict` into
    this module and its descendants.

    Args:
        model (nn.Module): the module to load into.
        state_dict (dict): a dict containing parameters and
            persistent buffers.
        missing_keys (List): updated **in place**. On return it only keeps the entries that
            were also reported as missing while loading this ``state_dict``; keys that were
            found are removed.
        strict (bool): if True, raise when ``state_dict`` contains keys the model does not expect.

    Raises:
        TypeError: ``state_dict`` is not a mapping.
        RuntimeError: ``strict`` is True and unexpected keys were found.
    """
    if not isinstance(state_dict, Mapping):
        raise TypeError("Expected state_dict to be dict-like, got {}.".format(type(state_dict)))

    unexpected_keys: List[str] = []
    sub_missing_keys: List[str] = []
    error_msgs: List[str] = []

    # copy state_dict so _load_from_state_dict can modify it
    metadata = getattr(state_dict, '_metadata', None)
    state_dict = OrderedDict(state_dict)
    if metadata is not None:
        state_dict._metadata = metadata

    def load(module: nn.Module, state_dict, prefix=""):
        local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
        # Share the accumulator lists with every module so missing / unexpected
        # keys are actually collected (throw-away lists would discard them).
        args = (state_dict, prefix, local_metadata, True, sub_missing_keys, unexpected_keys, error_msgs)
        # Parameters of module and children will start with prefix. We can exit early if there are none in this
        # state_dict
        # NOTE(review): a module skipped by this early exit never reports its own keys as
        # missing -- confirm this is acceptable for the caller's bookkeeping.
        if any(key.startswith(prefix) for key in state_dict):
            module._load_from_state_dict(*args)

        for name, child in module._modules.items():
            if child is not None:
                load(child, state_dict, prefix + name + ".")

    load(model, state_dict, "")
    del load

    # deal with missing key: keep only the keys that are still missing after this load
    if missing_keys:
        still_missing = set(sub_missing_keys)
        missing_keys[:] = [key for key in missing_keys if key in still_missing]

    if strict and unexpected_keys:
        # Prepend to the list; rebinding `error_msgs` to a str would make the join
        # below iterate over single characters.
        error_msgs.insert(
            0, 'Unexpected key(s) in state_dict: {}. '.format(', '.join('"{}"'.format(k) for k in unexpected_keys)))
        raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
            model.__class__.__name__, "\n\t".join(error_msgs)))
# ======================================
# Helper functions for saving state dict
# ======================================
def save_state_dict(state_dict: dict, checkpoint_file_path: str, use_safetensors: bool) -> None:
    """
    Write a state dict to a checkpoint file.

    Args:
        state_dict (dict): state dict.
        checkpoint_file_path (str): path to the checkpoint file.
        use_safetensors (bool): whether to use safetensors to save the checkpoint.
            When set, the path must end with `.safetensors`.
    """
    if not use_safetensors:
        torch.save(state_dict, checkpoint_file_path)
        return

    assert is_safetensors_available(), "safetensors is not available."
    assert checkpoint_file_path.endswith('.safetensors'), \
        "safetensors only supports .safetensors suffix for checkpoint file."

    # imported lazily so the module still loads without safetensors installed
    from safetensors.torch import save_file as safe_save_file
    safe_save_file(state_dict, checkpoint_file_path, metadata={"format": "pt"})
def save_dtensor(name: str, tensor: torch.Tensor, index_file: "CheckpointIndexFile", use_safetensors: bool) -> None:
    """
    Save distributed tensor to checkpoint. This checkpoint will be a dictionary which contains
    only one tensor, written under the `dtensor` sub-directory of the index file's root path.

    Args:
        name (str): name of the tensor; used as the key in the saved dict and in the file name.
        tensor (Tensor): tensor to be saved.
        index_file (CheckpointIndexFile): index file object; provides `root_path` and receives
            the weight-map entry for this tensor.
        use_safetensors (bool): whether to use safetensors to save the checkpoint.
    """
    # create the output directory next to the index file
    dtensor_dir = index_file.root_path.joinpath('dtensor')
    dtensor_dir.mkdir(exist_ok=True)

    # TODO(YuliangLiu): get index of the tensor shard
    shard_index = 0

    # dtensor ckpt file always contains only one tensor
    shard_file_path = dtensor_dir.joinpath(generate_dtensor_file_name(name, shard_index, use_safetensors))
    save_state_dict({name: tensor}, str(shard_file_path), use_safetensors)

    # update the weight map; `*` stands for all shards of this tensor
    weight_map_entry = 'dtensor/' + generate_dtensor_file_name(name, '*', use_safetensors)
    index_file.append_weight_map(name, weight_map_entry)
def get_checkpoint_file_suffix(use_safetensors: bool) -> str:
    """
    Get checkpoint file suffix.

    Args:
        use_safetensors (bool): whether to use safetensors to save the checkpoint.

    Returns:
        str: checkpoint file suffix, including the leading dot
        (`.safetensors` or `.bin`).
    """
    return '.safetensors' if use_safetensors else '.bin'


def generate_checkpoint_shard_file_name(index: int,
                                        total_number: int,
                                        use_safetensors: bool,
                                        prefix: Optional[str] = None) -> str:
    """
    Generate checkpoint shard file name.

    Args:
        index (int): index of the shard.
        total_number (int): total number of shards.
        use_safetensors (bool): whether to use safetensors to save the checkpoint.
        prefix (str): prefix of the shard file name. Default: None.

    Returns:
        str: checkpoint shard file name, e.g. `model-00001-of-00003.bin`.
    """
    # The suffix already carries its leading dot, so it is appended directly;
    # adding another "." would yield names such as "00001-of-00003..bin".
    suffix = get_checkpoint_file_suffix(use_safetensors)
    shard_tag = f"{index:05d}-of-{total_number:05d}{suffix}"

    if prefix is None:
        return shard_tag
    return f"{prefix}-{shard_tag}"


def generate_dtensor_file_name(param_name: str, index: int, use_safetensors: bool) -> str:
    """
    Generate dtensor file name.

    Args:
        param_name (str): name of the distributed parameter.
        index (int): index of the shard. `save_dtensor` also passes the literal
            string `'*'` to denote "all shards" in the weight map.
        use_safetensors (bool): whether to use safetensors to save the checkpoint.

    Returns:
        str: dtensor file name, e.g. `weight.0.bin` or `weight.*.safetensors`.
    """
    # The suffix already carries its leading dot (see get_checkpoint_file_suffix).
    suffix = get_checkpoint_file_suffix(use_safetensors)
    return f'{param_name}.{index}{suffix}'
def save_state_dict_as_shard(
    state_dict: dict,
    checkpoint_path: str,
    index: int,
    total_number: int,
    use_safetensors: bool,
    prefix: str = None,
) -> None:
    """
    Write one shard of a state dict into the checkpoint directory.

    Args:
        state_dict (dict): state dict.
        checkpoint_path (str): directory in which the shard file is created.
        index (int): index of the shard.
        total_number (int): total number of shards.
        use_safetensors (bool): whether to use safetensors to save the checkpoint.
        prefix (str): prefix of the shard file name.
    """
    # build the absolute path of the shard file from its generated name
    file_name = generate_checkpoint_shard_file_name(index, total_number, use_safetensors, prefix)
    target = Path(checkpoint_path).joinpath(file_name).absolute()

    # save the shard
    save_state_dict(state_dict, str(target), use_safetensors)
# ========================================
# Helper functions for loading state dict
# ========================================
def has_index_file(checkpoint_path: str) -> Tuple[bool, Optional[Path]]:
    """
    Check whether the checkpoint has an index file.

    Args:
        checkpoint_path (str): path to the checkpoint. It may point at an index
            file directly or at a directory that contains one.

    Returns:
        Tuple[bool, Optional[Path]]: a tuple of (has_index_file, index_file_path).
        `(False, None)` is returned when no index file is found, including when
        the path is neither an existing file nor an existing directory.

    Raises:
        AssertionError: the directory contains more than one index file.
    """
    checkpoint_path = Path(checkpoint_path)

    if checkpoint_path.is_file():
        # check if it is .index.json (optionally with a variant, e.g. .index.fp16.json);
        # raw string with escaped dots so that "." only matches a literal dot
        index_name_pattern = re.compile(r"(.*?)\.index((\..*)?)\.json")
        if index_name_pattern.fullmatch(checkpoint_path.name) is not None:
            return True, checkpoint_path
        return False, None

    if checkpoint_path.is_dir():
        # check if there is exactly one file ending with .index.json in this directory
        index_files = list(checkpoint_path.glob('*.index.*json'))
        if not index_files:
            return False, None

        # if we found a .index.json file, make sure there is only one
        assert len(
            index_files
        ) == 1, f'Expected to find one .index.json file in {checkpoint_path}, but found {len(index_files)}'
        return True, index_files[0]

    # neither a file nor a directory: honour the declared tuple return type
    return False, None
def load_state_dict(checkpoint_file_path: Path):
    """
    Load state dict from checkpoint.

    Args:
        checkpoint_file_path (Path): path to the checkpoint file (a `str` works as well).

    Returns:
        dict: state dict.

    Raises:
        AssertionError: the file is a dtensor checkpoint, or it is a safetensors
            checkpoint and safetensors is not installed.
    """
    # The suffix helpers use `str.endswith`, which `Path` does not provide,
    # so the checks are done on the string form of the path.
    checkpoint_file_name = str(checkpoint_file_path)

    assert not is_dtensor_checkpoint(checkpoint_file_name), \
        f'Cannot load state dict from dtensor checkpoint {checkpoint_file_path}, you should convert the distributed tensors to gathered tensors with our CLI offline.'

    if not is_safetensor_checkpoint(checkpoint_file_name):
        # load with torch
        return torch.load(checkpoint_file_path)

    assert is_safetensors_available(), \
        f'Cannot load state dict from safetensor checkpoint {checkpoint_file_path}, because safetensors is not available. Please install safetensors first with pip install safetensors.'

    # load with safetensors
    from safetensors import safe_open
    state_dict = {}
    with safe_open(checkpoint_file_path, framework="pt", device="cpu") as f:
        for k in f.keys():
            state_dict[k] = f.get_tensor(k)
    return state_dict
def add_variant(weights_name: str, variant: Optional[str] = None) -> str:
    """
    Insert `variant` in front of the last dot-separated component of `weights_name`.

    Args:
        weights_name (str): file name, e.g. `pytorch_model.bin`.
        variant (Optional[str]): variant tag; `None` or an empty string leaves
            the name unchanged.

    Returns:
        str: the name with the variant inserted, e.g. `pytorch_model.fp16.bin`.
    """
    if not variant:
        return weights_name

    *leading_parts, last_part = weights_name.split(".")
    return ".".join([*leading_parts, variant, last_part])
from .cli import cli
__all__ = ['cli']
import click
from colossalai.context import Config
from .benchmark import run_benchmark
from .utils import *
__all__ = ['benchmark']
@click.command()
@click.option("-g", "--gpus", type=int, default=None, help="Total number of devices to use.")
@click.option("-b", "--batch_size", type=int, default=8, help="Batch size of the input tensor.")
@click.option("-s", "--seq_len", type=int, default=512, help="Sequence length of the input tensor.")
@click.option("-d", "--dimension", type=int, default=1024, help="Hidden dimension of the input tensor.")
@click.option("-w", "--warmup_steps", type=int, default=10, help="The number of warmup steps.")
@click.option("-p", "--profile_steps", type=int, default=50, help="The number of profiling steps.")
@click.option("-l", "--layers", type=int, default=2)
@click.option("-m",
              "--model",
              type=click.Choice(['mlp'], case_sensitive=False),
              default='mlp',
              help="Select the model to benchmark, currently only supports MLP")
def benchmark(gpus: int, batch_size: int, seq_len: int, dimension: int, warmup_steps: int, profile_steps: int,
              layers: int, model: str):
    # `locals()` is evaluated before any other local name is bound, so the
    # Config holds exactly the command-line options declared above.
    # (No docstring on purpose: click would expose it as the command help text.)
    run_benchmark(Config(locals()))
from functools import partial
from typing import Dict, List
import click
import torch.multiprocessing as mp
import colossalai
from colossalai.cli.benchmark.utils import find_all_configs, get_batch_data, profile_model
from colossalai.context import Config
from colossalai.context.random import reset_seeds
from colossalai.core import global_context as gpc
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.testing import free_port
from colossalai.utils import MultiTimer
from .models import MLP
def run_benchmark(args: Config) -> None:
    """
    Run benchmarking with torch.multiprocessing.

    One process per device is spawned; each process runs `run_dist_profiling`
    over every configuration returned by `find_all_configs`.

    Args:
        args (Config): benchmark options; this function reads `args.gpus` and
            forwards the whole object to the workers as hyperparameters.
    """

    # sanity checks
    if args.gpus is None:
        # the CLI option is called --gpus, so name it correctly in the message
        click.echo("Error: --gpus is not given")
        exit()
    if args.gpus <= 1:
        click.echo("Warning: tensor parallel will be activated with at least 2 devices.")

    click.echo("=== Benchmarking Parameters ===")
    for k, v in args.items():
        click.echo(f'{k}: {v}')
    click.echo('')

    config_list = find_all_configs(args.gpus)

    # one free port per configuration, since every configuration is launched separately
    avail_ports = [free_port() for _ in range(len(config_list))]
    run_func = partial(run_dist_profiling,
                       world_size=args.gpus,
                       port_list=avail_ports,
                       config_list=config_list,
                       hyperparams=args)
    mp.spawn(run_func, nprocs=args.gpus)
def run_dist_profiling(rank: int, world_size: int, port_list: List[int], config_list: List[Dict],
                       hyperparams: Config) -> None:
    """
    A function executed for profiling, this function should be spawn by torch.multiprocessing.

    For every (config, port) pair the distributed environment is launched, the model is built and
    profiled with `profile_model`, the environment is destroyed again, and the process whose
    global rank is 0 prints the measured timings and memory numbers.

    Args:
        rank (int): rank of the process
        world_size (int): the number of processes
        port_list (List[int]): a list of free ports for initializing distributed networks
        config_list (List[Dict]): a list of configuration
        hyperparams (Config): the hyperparameters given by the user
    """

    # disable logging for clean output
    disable_existing_loggers()
    logger = get_dist_logger()
    logger.set_level('WARNING')

    # configs and ports are paired positionally: each config is launched on its own port
    for config, port in zip(config_list, port_list):
        colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
        timer = MultiTimer()

        # 1D parallel should be skipped if in_features or out_features is not able to be divided exactly by 1D parallel size.
        # NOTE(review): this `continue` runs after `colossalai.launch` but before `gpc.destroy()`,
        # so the next iteration launches without an explicit destroy -- confirm this is safe.
        if config.parallel.tensor.mode == '1d' and hyperparams.dimension % config.parallel.tensor.size != 0:
            click.echo(
                "1D parallel will be skipped because in_features or out_features is not able to be divided exactly by 1D parallel size."
            )
            continue

        if hyperparams.model == 'mlp':
            model = MLP(dim=hyperparams.dimension, layers=hyperparams.layers)
        else:
            # only the process with global rank 0 prints the error, but every process exits
            if gpc.get_global_rank() == 0:
                click.echo("Error: Invalid argument for --model")
            exit()

        # bind the data shape so `profile_model` can request a fresh batch without arguments
        data_func = partial(get_batch_data,
                            dim=hyperparams.dimension,
                            batch_size=hyperparams.batch_size,
                            seq_length=hyperparams.seq_len,
                            mode=config.parallel.tensor.mode)

        fwd_time, bwd_time, max_allocated, max_cached = profile_model(model=model,
                                                                      warmup_steps=hyperparams.warmup_steps,
                                                                      profile_steps=hyperparams.profile_steps,
                                                                      data_func=data_func,
                                                                      timer=timer)

        # tear down this configuration before the next one is launched
        gpc.destroy()
        reset_seeds()

        # NOTE(review): the global rank is queried after `gpc.destroy()`; presumably it
        # survives the destroy call -- verify against the ParallelContext implementation.
        if gpc.get_global_rank() == 0:
            config_str = ', '.join([f'{k}: {v}' for k, v in config.parallel.tensor.items()])
            click.echo(f"=== {config_str} ===")
            click.echo(f"Average forward time: {fwd_time}")
            click.echo(f"Average backward time: {bwd_time}")
            click.echo(f"Max allocated GPU memory: {max_allocated}")
            click.echo(f"Max cached GPU memory: {max_cached}\n")
import torch
import colossalai.nn as col_nn
class MLP(torch.nn.Module):
    """
    A stack of `layers` square `col_nn.Linear(dim, dim)` layers applied one after another.

    Args:
        dim (int): input and output feature size of every linear layer.
        layers (int): number of linear layers.
    """

    def __init__(self, dim: int, layers: int):
        super().__init__()
        # layers are created in order, one at a time, by the generator
        self.layers = torch.nn.ModuleList(col_nn.Linear(dim, dim) for _ in range(layers))

    def forward(self, x):
        # feed the output of each layer into the next one
        hidden = x
        for linear in self.layers:
            hidden = linear(hidden)
        return hidden
import math
import time
import torch
from colossalai.utils import MultiTimer
from colossalai.context import ParallelMode, Config
from typing import List, Dict, Tuple, Callable
def get_time_stamp() -> float:
    """
    Return the time stamp for profiling.

    `torch.cuda.synchronize()` is called first, then the wall-clock time is read.

    Returns:
        time_stamp (float): the time given by time.time()
    """
    torch.cuda.synchronize()
    return time.time()
def get_memory_states() -> Tuple[float, float]:
    """
    Return the memory statistics and reset them.

    Both values are read from `torch.cuda.max_memory_allocated()` and
    `torch.cuda.max_memory_reserved()` and divided by 1024**3. Afterwards the
    peak statistics are reset and the CUDA cache is emptied.

    Returns:
        max_allocated (float): the maximum allocated CUDA memory
        max_cached (float): the maximum cached (reserved) CUDA memory
    """
    bytes_per_unit = 1024**3
    max_allocated = torch.cuda.max_memory_allocated() / bytes_per_unit
    max_cached = torch.cuda.max_memory_reserved() / bytes_per_unit

    # reset after reading so the numbers are not carried over to the next call
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.empty_cache()
    return max_allocated, max_cached
def find_all_configs(device_cnt: int) -> List[Dict]:
    """
    Find all possible configurations for tensor parallelism

    The list always contains a non-parallel (`mode=None`) and a 1D config. A 2D
    config is added when `device_cnt` is a perfect square, one 2.5D config per
    depth `d` for which `device_cnt // d` is a perfect square, and a 3D config
    when `device_cnt` is a perfect cube.

    Args:
        device_cnt (int): the number of devices

    Returns:
        config_list (List[Dict]): a list of configurations
    """

    def _is_square(num):
        # 2D parallel should be implemented with at least 2 devices.
        if num <= 1:
            return False
        return round(math.sqrt(num))**2 == num

    def _is_cube(num):
        # 3D parallel should be implemented with at least 2 devices.
        if num <= 1:
            return False
        # Round instead of floor: the float cube root of a perfect cube can land
        # just below the integer (64 ** (1 / 3) == 3.9999999999999996).
        return round(num**(1. / 3.))**3 == num

    config_list = []

    # add non-parallel config
    config = dict(parallel=dict(tensor=dict(size=device_cnt, mode=None)))
    config_list.append(config)

    # add 1D config
    config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='1d')))
    config_list.append(config)

    # add 2D config only if device_cnt is a square
    if _is_square(device_cnt):
        config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='2d')))
        config_list.append(config)

    # check for 2.5D
    # iterate over depth
    for depth in range(1, device_cnt):
        if device_cnt % depth == 0 and _is_square(device_cnt // depth):
            config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='2.5d', depth=depth)))
            config_list.append(config)

    # check for 3D if device_cnt is a cube
    if _is_cube(device_cnt):
        config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='3d')))
        config_list.append(config)

    config_list = [Config(cfg) for cfg in config_list]
    return config_list
def profile_model(model: torch.nn.Module, warmup_steps: int, profile_steps: int, data_func: Callable,
                  timer: MultiTimer) -> Tuple[float, float, float, float]:
    """
    Profile the forward and backward of a model

    Args:
        model (torch.nn.Module): a PyTorch model
        warmup_steps (int): the number of steps for warmup
        profile_steps (int): the number of steps for profiling
        data_func (Callable): a function to generate random data
        timer (colossalai.utils.Multitimer): a timer instance for time recording

    Returns:
        fwd_time (float): the average time taken by the forward pass in second
        bwd_time (float): the average time taken by the backward pass in second
        max_allocated (float): the maximum GPU memory allocated in GB
        max_cached (float): the maximum GPU memory cached in GB
    """

    def _run_step(data):
        # time forward and backward separately; both are kept in the timer history
        timer.start('forward')
        out = model(data)
        timer.stop('forward', keep_in_history=True)
        timer.start('backward')
        out.mean().backward()
        timer.stop('backward', keep_in_history=True)

    # warmup: all warmup batches are generated up front, then run
    data_list = [data_func() for _ in range(warmup_steps)]
    for data in data_list:
        _run_step(data)

    # drop the warmup measurements so only the profiled steps are averaged
    timer.reset('forward')
    timer.reset('backward')

    for _ in range(profile_steps):
        data = data_func()
        _run_step(data)

    max_allocated, max_cached = get_memory_states()
    fwd_time = timer.get_timer('forward').get_history_mean()
    bwd_time = timer.get_timer('backward').get_history_mean()
    return fwd_time, bwd_time, max_allocated, max_cached
def get_batch_data(dim: int, batch_size: int, seq_length: int, mode: ParallelMode) -> torch.Tensor:
    """
    Return a random CUDA tensor of shape (batch_size, seq_length, dim) for profiling.

    The batch size and hidden size are reduced first, depending on `mode`:
    halved for '2d' and '2.5d'; for '3d' the batch size is quartered and the
    hidden size halved. Any other mode leaves them unchanged.

    Args:
        dim (int): hidden size
        batch_size (int): the number of data samples
        seq_length (int): the number of tokens
        mode (ParallelMode): tensor parallel mode; it is compared against the
            strings '2d', '2.5d' and '3d'

    Returns:
        data (torch.Tensor): random data
    """
    if mode == '3d':
        batch_size, dim = batch_size // 4, dim // 2
    elif mode in ['2d', '2.5d']:
        batch_size, dim = batch_size // 2, dim // 2

    return torch.rand(batch_size, seq_length, dim).cuda()
import click
from .check_installation import check_installation
__all__ = ['check']
@click.command(help="Check if Colossal-AI is correct based on the given option")
@click.option('-i', '--installation', is_flag=True, help="Check if Colossal-AI is built correctly")
def check(installation):
    # Without the -i/--installation flag there is nothing to check.
    if not installation:
        click.echo("No option is given")
        return

    check_installation()
import subprocess
import click
import torch
from torch.utils.cpp_extension import CUDA_HOME
import colossalai
def to_click_output(val):
    """
    Map an installation-check result to a readable symbol.

    True, False and None are rendered as a check mark, 'x' and 'N/A'
    respectively; any other value is returned unchanged.
    """
    symbol_for = {True: u'\u2713', False: 'x', None: 'N/A'}
    # fall back to the value itself when it has no symbol
    return symbol_for.get(val, val)
def check_installation():
    """
    This function will check the installation of colossalai, specifically, the version compatibility of
    colossalai, pytorch and cuda.

    The report is printed with `click.echo` in three sections (Environment, CUDA Extensions AOT
    Compilation, Compatibility), each followed by explanatory notes. Nothing is returned.
    """
    # gather the raw facts first
    found_aot_cuda_ext = _check_aot_built_cuda_extension_installed()
    cuda_version = _check_cuda_version()
    torch_version, torch_cuda_version = _check_torch_version()
    colossalai_verison, prebuilt_torch_version_required, prebuilt_cuda_version_required = _parse_colossalai_version()

    # if cuda_version is None, that means either
    # CUDA_HOME is not found, thus cannot compare the version compatibility
    if not cuda_version:
        sys_torch_cuda_compatibility = None
    else:
        sys_torch_cuda_compatibility = _is_compatible([cuda_version, torch_cuda_version])

    # if cuda_version or cuda_version_required is None, that means either
    # CUDA_HOME is not found or AOT compilation is not enabled
    # thus, there is no need to compare the version compatibility at all
    if not cuda_version or not prebuilt_cuda_version_required:
        sys_colossalai_cuda_compatibility = None
    else:
        sys_colossalai_cuda_compatibility = _is_compatible([cuda_version, prebuilt_cuda_version_required])

    # if torch_version_required is None, that means AOT compilation is not enabled
    # thus there is no need to compare the versions
    if prebuilt_torch_version_required is None:
        torch_compatibility = None
    else:
        torch_compatibility = _is_compatible([torch_version, prebuilt_torch_version_required])

    # section 1: versions found in the current environment
    click.echo(f'#### Installation Report ####')
    click.echo(f'\n------------ Environment ------------')
    click.echo(f"Colossal-AI version: {to_click_output(colossalai_verison)}")
    click.echo(f"PyTorch version: {to_click_output(torch_version)}")
    click.echo(f"System CUDA version: {to_click_output(cuda_version)}")
    click.echo(f"CUDA version required by PyTorch: {to_click_output(torch_cuda_version)}")
    click.echo("")
    click.echo(f"Note:")
    click.echo(f"1. The table above checks the versions of the libraries/tools in the current environment")
    click.echo(f"2. If the System CUDA version is N/A, you can set the CUDA_HOME environment variable to locate it")
    click.echo(
        f"3. If the CUDA version required by PyTorch is N/A, you probably did not install a CUDA-compatible PyTorch. This value is give by torch.version.cuda and you can go to https://pytorch.org/get-started/locally/ to download the correct version."
    )

    # section 2: what the ahead-of-time build (if any) was compiled against
    click.echo(f'\n------------ CUDA Extensions AOT Compilation ------------')
    click.echo(f"Found AOT CUDA Extension: {to_click_output(found_aot_cuda_ext)}")
    click.echo(f"PyTorch version used for AOT compilation: {to_click_output(prebuilt_torch_version_required)}")
    click.echo(f"CUDA version used for AOT compilation: {to_click_output(prebuilt_cuda_version_required)}")
    click.echo("")
    click.echo(f"Note:")
    click.echo(
        f"1. AOT (ahead-of-time) compilation of the CUDA kernels occurs during installation when the environment variable CUDA_EXT=1 is set"
    )
    click.echo(f"2. If AOT compilation is not enabled, stay calm as the CUDA kernels can still be built during runtime")

    # section 3: pairwise compatibility results computed above (None is shown as N/A)
    click.echo(f"\n------------ Compatibility ------------")
    click.echo(f'PyTorch version match: {to_click_output(torch_compatibility)}')
    click.echo(f"System and PyTorch CUDA version match: {to_click_output(sys_torch_cuda_compatibility)}")
    click.echo(f"System and Colossal-AI CUDA version match: {to_click_output(sys_colossalai_cuda_compatibility)}")
    click.echo(f"")
    click.echo(f"Note:")
    click.echo(f"1. The table above checks the version compatibility of the libraries/tools in the current environment")
    click.echo(
        f" - PyTorch version mismatch: whether the PyTorch version in the current environment is compatible with the PyTorch version used for AOT compilation"
    )
    click.echo(
        f" - System and PyTorch CUDA version match: whether the CUDA version in the current environment is compatible with the CUDA version required by PyTorch"
    )
    click.echo(
        f" - System and Colossal-AI CUDA version match: whether the CUDA version in the current environment is compatible with the CUDA version used for AOT compilation"
    )
def _is_compatible(versions):
"""
Compare the list of versions and return whether they are compatible.
"""
if None in versions:
return False
# split version into [major, minor, patch]
versions = [version.split('.') for version in versions]
for version in versions:
if len(version) == 2:
# x means unknown
version.append('x')
for idx, version_values in enumerate(zip(*versions)):
equal = len(set(version_values)) == 1
if idx in [0, 1] and not equal:
return False
elif idx == 1:
return True
else:
continue
def _parse_colossalai_version():
    """
    Get the Colossal-AI version information.

    Returns:
        colossalai_version: Colossal-AI version.
        torch_version_for_aot_build: PyTorch version used for AOT compilation of CUDA kernels,
            or None if the version string carries no such information.
        cuda_version_for_aot_build: CUDA version used for AOT compilation of CUDA kernels,
            or None if the version string carries no such information.
    """
    # colossalai version can be in two formats
    # 1. X.X.X+torchX.XXcuXX.X (when colossalai is installed with CUDA extensions)
    # 2. X.X.X (when colossalai is not installed with CUDA extensions)
    # where X represents an integer.
    full_version = colossalai.__version__
    colossalai_version = full_version.split('+')[0]

    try:
        torch_version_for_aot_build = full_version.split('torch')[1].split('cu')[0]
        cuda_version_for_aot_build = full_version.split('cu')[1]
    except IndexError:
        # format 2: the 'torch' / 'cu' markers are absent, so indexing the split result fails
        torch_version_for_aot_build = None
        cuda_version_for_aot_build = None
    return colossalai_version, torch_version_for_aot_build, cuda_version_for_aot_build
def _check_aot_built_cuda_extension_installed():
"""
According to `op_builder/README.md`, the CUDA extension can be built with either
AOT (ahead-of-time) or JIT (just-in-time) compilation.
AOT compilation will build CUDA extensions to `colossalai._C` during installation.
JIT (just-in-time) compilation will build CUDA extensions to `~/.cache/colossalai/torch_extensions` during runtime.
"""
try:
import colossalai._C.fused_optim
found_aot_cuda_ext = True
except ImportError:
found_aot_cuda_ext = False
return found_aot_cuda_ext
def _check_torch_version():
"""
Get the PyTorch version information.
Returns:
torch_version: PyTorch version.
torch_cuda_version: CUDA version required by PyTorch.
"""
# get torch version
# torch version can be of two formats
# - 1.13.1+cu113
# - 1.13.1.devxxx
torch_version = torch.__version__.split('+')[0]
torch_version = '.'.join(torch_version.split('.')[:3])
# get cuda version in pytorch build
try:
torch_cuda_major = torch.version.cuda.split(".")[0]
torch_cuda_minor = torch.version.cuda.split(".")[1]
torch_cuda_version = f'{torch_cuda_major}.{torch_cuda_minor}'
except:
torch_cuda_version = None
return torch_version, torch_cuda_version
def _check_cuda_version():
"""
Get the CUDA version information.
Returns:
cuda_version: CUDA version found on the system.
"""
# get cuda version
if CUDA_HOME is None:
cuda_version = CUDA_HOME
else:
try:
raw_output = subprocess.check_output([CUDA_HOME + "/bin/nvcc", "-V"], universal_newlines=True)
output = raw_output.split()
release_idx = output.index("release") + 1
release = output[release_idx].split(".")
bare_metal_major = release[0]
bare_metal_minor = release[1][0]
cuda_version = f'{bare_metal_major}.{bare_metal_minor}'
except:
cuda_version = None
return cuda_version
import click
from .benchmark import benchmark
from .check import check
from .launcher import run
class Arguments():
    """
    Simple namespace: every key/value pair of `arg_dict` becomes an instance attribute.
    """

    def __init__(self, arg_dict):
        # write straight into the instance dict, one entry per item
        vars(self).update(arg_dict.items())
# Root command group of the command line interface. The function body is
# intentionally empty: the group only dispatches to the sub-commands registered below.
# (Documented with comments rather than a docstring, since click would otherwise
# use a docstring as the group's help text.)
@click.group()
def cli():
    pass


# register the sub-commands on the root group
cli.add_command(run)
cli.add_command(check)
cli.add_command(benchmark)

# allow running this module directly as a script
if __name__ == '__main__':
    cli()
import click
from colossalai.context import Config
from .run import launch_multi_processes
@click.command(help="Launch distributed training on a single node or multiple nodes",
               context_settings=dict(ignore_unknown_options=True))
@click.option("-H",
              "-host",
              "--host",
              type=str,
              default=None,
              help="the list of hostnames to launch in the format <host1>,<host2>")
@click.option(
    "--hostfile",
    type=str,
    default=None,
    help="Hostfile path that defines the device pool available to the job, each line in the file is a hostname")
@click.option("--include",
              type=str,
              default=None,
              help="Specify computing devices to use during execution. String format is <host1>,<host2>,"
              " only effective when used with --hostfile.")
@click.option(
    "--exclude",
    type=str,
    default=None,
    help=
    "Specify computing devices to NOT use during execution. Mutually exclusive with --include. Formatting is the same as --includ,"
    " only effective when used with --hostfile.")
@click.option("--num_nodes",
              type=int,
              default=-1,
              help="Total number of worker nodes to use, only effective when used with --hostfile.")
@click.option("--nproc_per_node", type=int, default=None, help="Number of GPUs to use on each node.")
@click.option("--master_port",
              type=int,
              default=29500,
              help="(optional) Port used by PyTorch distributed for communication during distributed training.")
@click.option("--master_addr",
              type=str,
              default="127.0.0.1",
              help="(optional) IP address of node 0, will be inferred via 'hostname -I' if not specified.")
@click.option(
    "--extra_launch_args",
    type=str,
    default=None,
    help=
    "Set additional torch distributed launcher arguments such as --standalone. The format is --extra_launch_args arg1=1,arg2=2. "
    "This will be converted to --arg1=1 --arg2=2 during execution")
@click.option("--ssh-port", type=int, default=None, help="(optional) the port used for ssh connection")
@click.argument("user_script", type=str)
@click.argument('user_args', nargs=-1)
def run(host: str, hostfile: str, num_nodes: int, nproc_per_node: int, include: str, exclude: str, master_addr: str,
        master_port: int, extra_launch_args: str, ssh_port: int, user_script: str, user_args: str) -> None:
    """
    To launch multiple processes on a single node or multiple nodes via command line.

    The options are collected into a `Config` and handed to `launch_multi_processes`.
    `user_script` must end with `.py`; otherwise an error is printed and the program exits.

    Usage::
        # run with 4 GPUs on the current node use default port 29500
        colossalai run --nproc_per_node 4 train.py

        # run with 4 GPUs on the current node at port 29550
        colossalai run --nproc_per_node 4 --master_port 29550 train.py

        # run on two nodes
        colossalai run --host <host1>,<host2> --master_addr host1 --nproc_per_node 4 train.py

        # run with hostfile
        colossalai run --hostfile <file_path> --master_addr <host> --nproc_per_node 4 train.py

        # run with hostfile with only included hosts
        colossalai run --hostfile <file_path> --master_addr host1 --include host1,host2 --nproc_per_node 4 train.py

        # run with hostfile excluding the hosts selected
        colossalai run --hostfile <file_path> --master_addr host1 --exclude host2 --nproc_per_node 4 train.py
    """
    if not user_script.endswith('.py'):
        click.echo(f'Error: invalid Python file {user_script}. Did you use a wrong option? Try colossalai run --help')
        exit()

    # `locals()` is evaluated before any other local name is bound, so the
    # Config holds exactly the options and arguments declared above.
    launch_config = Config(locals())

    # click passes the variadic argument as a tuple; the launcher gets a list
    launch_config.user_args = list(launch_config.user_args)
    launch_multi_processes(launch_config)
import socket
from typing import List
class HostInfo:
    """
    A data class to store host connection-related data.

    Args:
        hostname (str): name or IP address of the host
        port (str): the port for ssh connection

    Attributes:
        hostname (str): the hostname as given
        port (str): the port as given (may be None)
        is_local_host (bool): whether ``hostname`` resolves to the local machine
    """

    def __init__(
        self,
        hostname: str,
        port: str = None,
    ):
        self.hostname = hostname
        self.port = port
        self.is_local_host = HostInfo.is_host_localhost(hostname, port)

    @staticmethod
    def is_host_localhost(hostname: str, port: str = None) -> bool:
        """
        Check if the host refers to the local machine.

        Args:
            hostname (str): name or IP address of the host
            port (str): the port for ssh connection

        Returns:
            bool: True if it is local, False otherwise
        """
        if port is None:
            port = 22    # no port specified, lets just use the ssh port

        # socket.getfqdn("127.0.0.1") does not return localhost
        # on some users' machines
        # thus, we directly return True if hostname is localhost, 127.0.0.1 or 0.0.0.0
        if hostname in ("localhost", "127.0.0.1", "0.0.0.0"):
            return True

        hostname = socket.getfqdn(hostname)
        localhost = socket.gethostname()
        localaddrs = socket.getaddrinfo(localhost, port)
        targetaddrs = socket.getaddrinfo(hostname, port)

        # each getaddrinfo entry is (family, type, proto, canonname, sockaddr);
        # the host is local when any target IP is also one of the local IPs
        local_ips = {sockaddr[0] for (_, _, _, _, sockaddr) in localaddrs}
        return any(rsockaddr[0] in local_ips for (_, _, _, _, rsockaddr) in targetaddrs)

    def __str__(self):
        return f'hostname: {self.hostname}, port: {self.port}'

    def __repr__(self):
        return self.__str__()
class HostInfoList:
    """
    A data class to store a list of HostInfo objects.

    The container supports iteration and ``len()``; hosts are kept in insertion order.
    """

    def __init__(self):
        self.hostinfo_list = []

    def append(self, hostinfo: "HostInfo") -> None:
        """
        Add an HostInfo object to the list.

        Args:
            hostinfo (HostInfo): host information
        """
        self.hostinfo_list.append(hostinfo)

    def remove(self, hostname: str) -> None:
        """
        Remove the HostInfo object which matches with the hostname from the list.

        Args:
            hostname (str): the name of the host

        Raises:
            Exception: if no stored host has this hostname
        """
        hostinfo = self.get_hostinfo(hostname)
        self.hostinfo_list.remove(hostinfo)

    def get_hostinfo(self, hostname: str) -> "HostInfo":
        """
        Return the HostInfo object which matches with the hostname.

        Args:
            hostname (str): the name of the host

        Returns:
            hostinfo (HostInfo): the HostInfo object which matches with the hostname

        Raises:
            Exception: if no stored host has this hostname
        """
        for hostinfo in self.hostinfo_list:
            if hostinfo.hostname == hostname:
                return hostinfo
        raise Exception(f"Hostname {hostname} is not found")

    def has(self, hostname: str) -> bool:
        """
        Check if the hostname has been added.

        Args:
            hostname (str): the name of the host

        Returns:
            bool: True if added, False otherwise
        """
        return any(hostinfo.hostname == hostname for hostinfo in self.hostinfo_list)

    def __iter__(self):
        return iter(self.hostinfo_list)

    def __len__(self):
        return len(self.hostinfo_list)
from multiprocessing import Pipe, Process
from multiprocessing import connection as mp_connection
import click
import fabric
from .hostinfo import HostInfo, HostInfoList
def run_on_host(hostinfo: HostInfo, workdir: str, recv_conn: mp_connection.Connection,
                send_conn: mp_connection.Connection, env: dict) -> None:
    """
    Use fabric connection to execute command on local or remote hosts.

    The function loops on ``recv_conn``: every received message is executed as a
    command and answered with ``'success'`` or ``'failure'`` on ``send_conn``.
    The message ``'exit'`` ends the loop, after which ``'finish'`` is sent.

    Args:
        hostinfo (HostInfo): host information
        workdir (str): the directory to execute the command
        recv_conn (multiprocessing.connection.Connection): receive messages from the master sender
        send_conn (multiprocessing.connection.Connection): send messages to the master receiver
        env (dict): a dictionary for environment variables
    """
    fab_conn = fabric.Connection(hostinfo.hostname, port=hostinfo.port)

    # NOTE(review): values are placed between double quotes without escaping, so a value
    # containing '"', '$' or a backtick will be re-interpreted by the shell.
    env_msg = ' '.join([f'{k}="{v}"' for k, v in env.items()])

    try:
        # keep listening until exit
        while True:
            # receive cmd
            cmds = recv_conn.recv()

            if cmds == 'exit':
                # exit from the loop
                break

            # execute the commands
            try:
                # cd to execute directory and propagate the runtime environment
                with fab_conn.cd(workdir), fab_conn.prefix(f"export {env_msg}"):
                    if hostinfo.is_local_host:
                        # execute on the local machine
                        fab_conn.local(cmds, hide=False)
                    else:
                        # execute on the remote machine
                        fab_conn.run(cmds, hide=False)
                send_conn.send('success')
            except Exception as e:
                click.echo(
                    f"Error: failed to run {cmds} on {hostinfo.hostname}, is localhost: {hostinfo.is_local_host}, exception: {e}"
                )
                send_conn.send('failure')

        # shutdown
        send_conn.send("finish")
    finally:
        # release the connection even when the pipe breaks or the loop raises
        fab_conn.close()
class MultiNodeRunner:
    """
    Executes commands on a set of machines, one worker process per host.
    This runner is inspired by Nezha (https://github.com/zhuzilin/NeZha).

    Every host gets two pipes: one carrying commands from the master to the
    worker and one carrying status messages back. All three registries are
    keyed by hostname.
    """

    def __init__(self):
        self.processes = {}
        self.master_send_conns = {}
        self.master_recv_conns = {}

    def connect(self, host_info_list: HostInfoList, workdir: str, env: dict) -> None:
        """
        Spawn a worker process for every host and remember its pipes.

        Args:
            host_info_list (HostInfoList): a list of HostInfo objects
            workdir (str): the directory where command is executed
            env (dict): environment variables to propagate to hosts
        """
        for hostinfo in host_info_list:
            # master -> worker channel (commands)
            to_worker, worker_inbox = Pipe()
            # worker -> master channel (status messages)
            from_worker, worker_outbox = Pipe()

            worker = Process(target=run_on_host, args=(hostinfo, workdir, worker_inbox, worker_outbox, env))
            worker.start()

            key = hostinfo.hostname
            self.processes[key] = worker
            self.master_recv_conns[key] = from_worker
            self.master_send_conns[key] = to_worker

    def send(self, hostinfo: HostInfo, cmd: str) -> None:
        """
        Forward a command to the worker attached to a local/remote host.

        Args:
            hostinfo (HostInfo): host information
            cmd (str): the command to execute
        """
        assert hostinfo.hostname in self.master_send_conns, \
            f'{hostinfo} is not found in the current connections'
        self.master_send_conns[hostinfo.hostname].send(cmd)

    def stop_all(self) -> None:
        """
        Ask every worker to leave its command loop by sending ``'exit'``.
        """
        for channel in self.master_send_conns.values():
            channel.send('exit')

    def recv_from_all(self) -> dict:
        """
        Wait for one message from every host.

        Returns:
            msg_from_node (dict): a dictionary mapping each hostname to the message it sent
        """
        return {hostname: channel.recv() for hostname, channel in self.master_recv_conns.items()}
import os
import sys
from typing import List
import click
import torch
from packaging import version
from colossalai.context import Config
from .hostinfo import HostInfo, HostInfoList
from .multinode_runner import MultiNodeRunner
# Constants that define our syntax
NODE_SEP = ','
def fetch_hostfile(hostfile_path: str, ssh_port: int) -> HostInfoList:
    """
    Parse the hostfile to obtain a list of hosts.

    A hostfile should look like:
    worker-0
    worker-1
    worker-2
    ...

    Empty lines are skipped. The process exits with status 1 when the hostfile
    does not exist or contains the same hostname twice.

    Args:
        hostfile_path (str): the path to the hostfile
        ssh_port (int): the port to connect to the host

    Returns:
        HostInfoList: the hosts listed in the hostfile, in file order
    """
    if not os.path.isfile(hostfile_path):
        click.echo(f"Error: Unable to find the hostfile, no such file: {hostfile_path}")
        sys.exit(1)

    device_pool = HostInfoList()

    with open(hostfile_path, 'r') as fd:
        for line in fd:
            hostname = line.strip()

            if hostname == '':
                # skip empty lines
                continue

            # reject duplicates before building the HostInfo object
            if device_pool.has(hostname):
                click.echo(f"Error: found duplicate host {hostname} in the hostfile")
                sys.exit(1)

            device_pool.append(HostInfo(hostname=hostname, port=ssh_port))

    return device_pool
def parse_device_filter(device_pool: HostInfoList, include_str=None, exclude_str=None) -> HostInfoList:
    '''Parse an inclusion or exclusion string and filter a hostfile dictionary.

    Examples:
        include_str="worker-0,worker-1" will execute jobs only on worker-0 and worker-1.
        exclude_str="worker-1" will use all available devices except worker-1.

    The process exits with status 1 when both filters are given or when a
    listed hostname is not part of ``device_pool``.

    Args:
        device_pool (HostInfoList): a list of HostInfo objects
        include_str (str): --include option passed by user, default None
        exclude_str (str): --exclude option passed by user, default None

    Returns:
        filtered_hosts (HostInfoList): filtered hosts after inclusion/exclusion;
            ``device_pool`` itself when no filter is given
    '''
    # Ensure include/exclude are mutually exclusive
    if include_str and exclude_str:
        click.echo("--include and --exclude are mutually exclusive, only one can be used")
        sys.exit(1)

    # no-op: an empty string is treated like a missing filter
    if not include_str and not exclude_str:
        return device_pool

    # Either build from scratch or remove items from a copy,
    # so that the caller's pool is never modified
    filtered_hosts = HostInfoList()
    if include_str:
        parse_str = include_str
    else:
        parse_str = exclude_str
        for hostinfo in device_pool:
            filtered_hosts.append(hostinfo)

    # foreach node in the list
    for node_config in parse_str.split(NODE_SEP):
        hostname = node_config.strip()

        # sanity check hostname before looking it up, otherwise the lookup
        # raises and this message is never shown
        if not device_pool.has(hostname):
            click.echo(f"Error: Hostname '{hostname}' not found in hostfile")
            sys.exit(1)

        if include_str:
            # a hostname listed twice is only included once
            if not filtered_hosts.has(hostname):
                filtered_hosts.append(device_pool.get_hostinfo(hostname))
        elif filtered_hosts.has(hostname):
            # a hostname listed twice is only removed once
            filtered_hosts.remove(hostname)

    return filtered_hosts
def get_launch_command(
    master_addr: str,
    master_port: int,
    nproc_per_node: int,
    user_script: str,
    user_args: List[str],
    node_rank: int,
    num_nodes: int,
    extra_launch_args: str = None,
) -> str:
    """
    Generate a command for distributed training.

    The launcher depends on the installed torch version:
    ``torch.distributed.launch`` before 1.9, ``torch.distributed.run`` for 1.9
    and ``torchrun`` from 1.10 on (including 2.x).

    Args:
        master_addr (str): the host of the master node
        master_port (str): the port of the master node
        nproc_per_node (str): the number of processes to launch on each node
        user_script (str): the user Python file
        user_args (str): the arguments for the user script
        node_rank (int): the unique ID for the node
        num_nodes (int): the number of nodes to execute jobs
        extra_launch_args (str): extra launcher arguments in the form ``arg1=1,arg2=2``;
            an entry without ``=`` becomes a bare flag

    Returns:
        cmd (str): the command the start distributed training
    """

    def _arg_dict_to_list(arg_dict):
        # {'a': '1', 'b': None} -> ['--a=1', '--b']
        ret = []

        for k, v in arg_dict.items():
            if v:
                ret.append(f'--{k}={v}')
            else:
                ret.append(f'--{k}')
        return ret

    extra_launch_args_dict = dict()
    if extra_launch_args:
        for arg in extra_launch_args.split(','):
            if '=' in arg:
                # split on the first '=' only, the value may itself contain '='
                k, v = arg.split('=', 1)
                extra_launch_args_dict[k] = v
            else:
                extra_launch_args_dict[arg] = None

    torch_version = version.parse(torch.__version__)
    # compare (major, minor) as a pair so that torch 2.x is handled like torch >= 1.10
    release = (torch_version.major, torch_version.minor)

    if release < (1, 9):
        # torch distributed launch cmd with torch < 1.9
        cmd = [
            sys.executable, "-m", "torch.distributed.launch", f"--nproc_per_node={nproc_per_node}",
            f"--master_addr={master_addr}", f"--master_port={master_port}", f"--nnodes={num_nodes}",
            f"--node_rank={node_rank}"
        ]
    else:
        # extra launch args for torch distributed launcher with torch >= 1.9
        default_torchrun_rdzv_args = dict(rdzv_backend="c10d",
                                          rdzv_endpoint=f"{master_addr}:{master_port}",
                                          rdzv_id="colossalai-default-job")

        # update rdzv arguments: user-provided values override the defaults
        for key in default_torchrun_rdzv_args.keys():
            if key in extra_launch_args_dict:
                value = extra_launch_args_dict.pop(key)
                default_torchrun_rdzv_args[key] = value

        if release < (1, 10):
            # torch distributed launch cmd with torch == 1.9
            cmd = [
                sys.executable, "-m", "torch.distributed.run", f"--nproc_per_node={nproc_per_node}",
                f"--nnodes={num_nodes}", f"--node_rank={node_rank}"
            ]
        else:
            # torch distributed launch cmd with torch >= 1.10
            cmd = [
                "torchrun", f"--nproc_per_node={nproc_per_node}", f"--nnodes={num_nodes}", f"--node_rank={node_rank}"
            ]
        cmd += _arg_dict_to_list(default_torchrun_rdzv_args)

    cmd += _arg_dict_to_list(extra_launch_args_dict) + [user_script] + list(user_args)
    return ' '.join(cmd)
def launch_multi_processes(args: Config) -> None:
    """
    Launch multiple processes on a single node or multiple nodes.

    The overall logic can be summarized as the pseudo code below:

        if hostfile given:
            hostinfo = parse_hostfile(hostfile)
            hostinfo = include_or_exclude_hosts(hostinfo)
            launch_on_multi_nodes(hostinfo)
        elif hosts given:
            hostinfo = parse_hosts(hosts)
            launch_on_multi_nodes(hostinfo)
        else:
            launch_on_current_node()

    This function always ends the process: with status 0 when every node
    reported success, with status 1 when a node failed or the arguments are invalid.

    Args:
        args (Config): the arguments taken from command line
    """
    assert isinstance(args, Config)

    if args.nproc_per_node is None:
        click.echo("--nproc_per_node did not receive any value")
        sys.exit(1)

    # cannot accept hosts and hostfile at the same time
    if args.host and args.hostfile:
        click.echo("Error: hostfile and hosts are mutually exclusive, only one is required")
        sys.exit(1)

    # check if hostfile is given
    if args.hostfile:
        device_pool = fetch_hostfile(args.hostfile, ssh_port=args.ssh_port)
        active_device_pool = parse_device_filter(device_pool, args.include, args.exclude)

        if args.num_nodes > 0:
            # only keep the first num_nodes to execute jobs
            updated_active_device_pool = HostInfoList()
            for count, hostinfo in enumerate(active_device_pool):
                if args.num_nodes == count:
                    break
                updated_active_device_pool.append(hostinfo)
            active_device_pool = updated_active_device_pool
    else:
        active_device_pool = None

    # use hosts if hostfile is not given
    if args.host and active_device_pool is None:
        active_device_pool = HostInfoList()
        host_list = args.host.strip().split(NODE_SEP)
        for hostname in host_list:
            hostinfo = HostInfo(hostname=hostname, port=args.ssh_port)
            active_device_pool.append(hostinfo)

    if not active_device_pool:
        # run on local node if not hosts or hostfile is given
        # add local node to host info list
        active_device_pool = HostInfoList()
        localhost_info = HostInfo(hostname='127.0.0.1', port=args.ssh_port)
        active_device_pool.append(localhost_info)

    # launch distributed processes
    runner = MultiNodeRunner()
    curr_path = os.path.abspath('.')

    # collect current path env
    # do not support multi-line env var, empty values are dropped as well
    env = {k: v for k, v in os.environ.items() if v and '\n' not in v}

    # establish remote connection
    runner.connect(host_info_list=active_device_pool, workdir=curr_path, env=env)

    # execute distributed launching command
    for node_id, hostinfo in enumerate(active_device_pool):
        cmd = get_launch_command(master_addr=args.master_addr,
                                 master_port=args.master_port,
                                 nproc_per_node=args.nproc_per_node,
                                 user_script=args.user_script,
                                 user_args=args.user_args,
                                 node_rank=node_id,
                                 num_nodes=len(active_device_pool),
                                 extra_launch_args=args.extra_launch_args)
        runner.send(hostinfo=hostinfo, cmd=cmd)

    # start training
    msg_from_node = runner.recv_from_all()
    has_error = False

    # print node status
    click.echo("\n====== Training on All Nodes =====")
    for hostname, msg in msg_from_node.items():
        click.echo(f"{hostname}: {msg}")

        # check if a process failed
        if msg == "failure":
            has_error = True

    # stop all nodes
    runner.stop_all()

    # receive the stop status
    msg_from_node = runner.recv_from_all()

    # print node status
    click.echo("\n====== Stopping All Nodes =====")
    for hostname, msg in msg_from_node.items():
        click.echo(f"{hostname}: {msg}")

    # give the process an exit code
    # so that it behaves like a normal process
    if has_error:
        sys.exit(1)
    else:
        sys.exit(0)
from .device_mesh_manager import DeviceMeshManager
from .dist_coordinator import DistCoordinator
from .process_group_manager import ProcessGroupManager
__all__ = ['DistCoordinator', 'ProcessGroupManager', 'DeviceMeshManager']
from dataclasses import dataclass
from typing import Dict, List, Tuple, Union
import torch
import torch.distributed as dist
from colossalai.device.alpha_beta_profiler import AlphaBetaProfiler
from colossalai.device.device_mesh import DeviceMesh
@dataclass
class DeviceMeshInfo:
    '''
    This class is used to store the information used to initialize the device mesh.

    Args:
        physical_ids (List[int]): The physical ids of the current booster. For example, if we have the last 4 GPUs on a 8-devices cluster, then the physical ids should be [4, 5, 6, 7].
        mesh_shape (Union[torch.Size, List[int], Tuple[int, ...]], optional): The shape of the mesh. For example, if we have 4 GPUs and we want to use 2D mesh with mesh shape [2, 2], then the mesh shape should be [2, 2]. Defaults to None, in which case no shape check is performed here.

    Raises:
        AssertionError: if ``mesh_shape`` is given and its number of elements differs from ``len(physical_ids)``.
    '''

    physical_ids: List[int]
    mesh_shape: Union[torch.Size, List[int], Tuple[int, ...], None] = None

    def __post_init__(self):
        if self.mesh_shape is not None:
            # the mesh must contain exactly one slot per physical device
            world_size = len(self.physical_ids)
            mesh_shape_numel = torch.Size(self.mesh_shape).numel()
            assert world_size == mesh_shape_numel, f'the numel of mesh_shape should be equal to world size, but got {world_size} != {mesh_shape_numel}'
def initialize_device_mesh(device_mesh_info: DeviceMeshInfo):
    '''
    Build a ``DeviceMesh`` (with process groups initialized) from a ``DeviceMeshInfo``.

    When ``mesh_shape`` is given, the physical ids are reshaped to it. Otherwise
    an ``AlphaBetaProfiler`` is asked to search for the best logical mesh.

    Args:
        device_mesh_info (DeviceMeshInfo): The information used to initialize device mesh.

    Returns:
        DeviceMesh: the device mesh created with ``init_process_group=True``.
    '''
    device_ids = device_mesh_info.physical_ids
    physical_mesh = torch.tensor(device_ids)
    requested_shape = device_mesh_info.mesh_shape

    if requested_shape is not None:
        # the user fixed the layout: just fold the flat id list into that shape
        logical_mesh_id = physical_mesh.reshape(requested_shape)
    else:
        # no layout given: let the profiler search for the best logical mesh
        profiler = AlphaBetaProfiler(device_ids)
        best_mesh = profiler.search_best_logical_mesh()
        logical_mesh_id = torch.Tensor(best_mesh).to(torch.int)

    return DeviceMesh(physical_mesh_id=physical_mesh, logical_mesh_id=logical_mesh_id, init_process_group=True)
class DeviceMeshManager:
    """
    Registry that creates device meshes and keeps them under a unique name.
    """

    def __init__(self):
        self.device_mesh_store: Dict[str, DeviceMesh] = dict()

    def create_device_mesh(self, name, device_mesh_info: DeviceMeshInfo) -> DeviceMesh:
        """
        Initialize a new device mesh and register it under ``name``.

        Args:
            name (str): name of the device mesh
            device_mesh_info (DeviceMeshInfo): the information used to initialize the device mesh

        Returns:
            DeviceMesh: the newly created device mesh

        Raises:
            ValueError: if a device mesh with this name is already registered
        """
        if name in self.device_mesh_store:
            raise ValueError(f'Device mesh {name} already exists.')

        mesh = initialize_device_mesh(device_mesh_info)
        self.device_mesh_store[name] = mesh
        return mesh

    def get(self, name: str) -> DeviceMesh:
        """
        Look up a registered device mesh.

        Args:
            name (str): name of the device mesh

        Returns:
            DeviceMesh: the device mesh

        Raises:
            ValueError: if no device mesh is registered under this name
        """
        if name not in self.device_mesh_store:
            raise ValueError(f'Device mesh {name} does not exist.')
        return self.device_mesh_store[name]

    def destroy(self, name: str) -> None:
        """
        Destroy every process group of a device mesh and drop it from the registry.

        Args:
            name (str): name of the device mesh

        Raises:
            ValueError: if no device mesh is registered under this name
        """
        if name not in self.device_mesh_store:
            raise ValueError(f'Device mesh {name} does not exist.')

        for group_list in self.device_mesh_store[name].process_groups_dict.values():
            for group in group_list:
                dist.destroy_process_group(group)
        del self.device_mesh_store[name]

    def destroy_all(self):
        """
        Destroy the process groups of every registered device mesh, then empty the registry.
        """
        for mesh in self.device_mesh_store.values():
            for group_list in mesh.process_groups_dict.values():
                for group in group_list:
                    dist.destroy_process_group(group)
        self.device_mesh_store.clear()
import functools
import os
from contextlib import contextmanager
import torch.distributed as dist
from torch.distributed import ProcessGroup
from colossalai.context.singleton_meta import SingletonMeta
class DistCoordinator(metaclass=SingletonMeta):
    """
    This class is used to coordinate distributed training. It is a singleton class, which means that there is only one instance of this
    class in the whole program.

    There are some terms that are used in this class:
        - rank: the rank of the current process
        - world size: the total number of processes
        - local rank: the rank of the current process on the current node
        - master: the process with rank 0
        - node master: the process with local rank 0 on the current node

    Example:
        >>> from colossalai.cluster.dist_coordinator import DistCoordinator
        >>> coordinator = DistCoordinator()
        >>>
        >>> if coordinator.is_master():
        >>>     do_something()
        >>>
        >>> coordinator.print_on_master('hello world')

    Attributes:
        rank (int): the rank of the current process
        world_size (int): the total number of processes
        local_rank (int): the rank of the current process on the current node, -1 if LOCAL_RANK is not set
    """

    def __init__(self):
        assert dist.is_initialized(
        ), 'Distributed is not initialized. Please call `torch.distributed.init_process_group` or `colossalai.launch` first.'
        self._rank = dist.get_rank()
        self._world_size = dist.get_world_size()
        # this is often passed by launchers such as torchrun
        # environment values are strings, convert so that the integer comparisons below work
        self._local_rank = int(os.environ.get('LOCAL_RANK', -1))

    @property
    def rank(self) -> int:
        return self._rank

    @property
    def world_size(self) -> int:
        return self._world_size

    @property
    def local_rank(self) -> int:
        return self._local_rank

    def _assert_local_rank_set(self):
        """
        Assert that the local rank is set. This is often passed by launchers such as torchrun.
        """
        assert self.local_rank >= 0, 'The environment variable LOCAL_RANK is not set, thus the coordinator is not aware of the local rank of the current process.'

    def is_master(self, process_group: ProcessGroup = None) -> bool:
        """
        Check if the current process is the master process (rank is 0). It can accept a sub process group to check the rank 0 with respect to the process.

        Args:
            process_group (ProcessGroup, optional): process group to use for the rank 0 check. Defaults to None, which refers to the default process group.

        Returns:
            bool: True if the current process is the master process, False otherwise
        """
        rank = dist.get_rank(group=process_group)
        return rank == 0

    def is_node_master(self) -> bool:
        """
        Check if the current process is the master process on the current node (local rank is 0).

        Returns:
            bool: True if the current process is the master process on the current node, False otherwise
        """
        self._assert_local_rank_set()
        return self.local_rank == 0

    def is_last_process(self, process_group: ProcessGroup = None) -> bool:
        """
        Check if the current process is the last process (rank is world size - 1). It can accept a sub process group to check the last rank with respect to the process.

        Args:
            process_group (ProcessGroup, optional): process group to use for the last rank check. Defaults to None, which refers to the default process group.

        Returns:
            bool: True if the current process is the last process, False otherwise
        """
        rank = dist.get_rank(group=process_group)
        world_size = dist.get_world_size(group=process_group)
        return rank == world_size - 1

    def print_on_master(self, msg: str, process_group: ProcessGroup = None):
        """
        Print message only from rank 0.

        Args:
            msg (str): message to print
            process_group (ProcessGroup, optional): process group to use for the rank 0 check. Defaults to None, which refers to the default process group.
        """
        rank = dist.get_rank(group=process_group)
        if rank == 0:
            print(msg)

    def print_on_node_master(self, msg: str):
        """
        Print message only from local rank 0. Local rank 0 refers to the 0th process running the current node.

        Args:
            msg (str): message to print
        """
        self._assert_local_rank_set()
        if self.local_rank == 0:
            print(msg)

    @contextmanager
    def priority_execution(self, executor_rank: int = 0, process_group: ProcessGroup = None):
        """
        This context manager is used to allow one process to execute while blocking all
        other processes in the same process group. This is often useful when downloading is required
        as we only want to download in one process to prevent file corruption.

        Example:
            >>> from colossalai.cluster import DistCoordinator
            >>> dist_coordinator = DistCoordinator()
            >>> with dist_coordinator.priority_execution():
            >>>     dataset = CIFAR10(root='./data', download=True)

        Args:
            executor_rank (int): the process rank to execute without blocking, all other processes will be blocked
            process_group (ProcessGroup, optional): process group to use for the executor rank check. Defaults to None, which refers to the default process group.
        """
        rank = dist.get_rank(group=process_group)
        should_block = rank != executor_rank

        # non-executor ranks wait at the barrier before running the body
        if should_block:
            self.block_all(process_group)

        yield

        # the executor reaches the same barrier after its body, releasing the others
        if not should_block:
            self.block_all(process_group)

    def destroy(self, process_group: ProcessGroup = None):
        """
        Destroy the distributed process group.

        Args:
            process_group (ProcessGroup, optional): process group to destroy. Defaults to None, which refers to the default process group.
        """
        dist.destroy_process_group(process_group)

    def block_all(self, process_group: ProcessGroup = None):
        """
        Block all processes in the process group.

        Args:
            process_group (ProcessGroup, optional): process group to block. Defaults to None, which refers to the default process group.
        """
        dist.barrier(group=process_group)

    def on_master_only(self, process_group: ProcessGroup = None):
        """
        A function wrapper that only executes the wrapped function on the master process (rank 0).
        On other processes the wrapped function is not called and the wrapper returns None.

        Example:
            >>> from colossalai.cluster import DistCoordinator
            >>> dist_coordinator = DistCoordinator()
            >>>
            >>> @dist_coordinator.on_master_only()
            >>> def print_on_master(msg):
            >>>     print(msg)

        Args:
            process_group (ProcessGroup, optional): process group to use for the rank 0 check. Defaults to None, which refers to the default process group.
        """
        # the master check is evaluated once, when the decorator is created
        is_master = self.is_master(process_group)

        # define an inner function
        def decorator(func):

            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                if is_master:
                    return func(*args, **kwargs)

            return wrapper

        return decorator
from typing import List
import torch.distributed as dist
from torch.distributed import ProcessGroup
class ProcessGroupManager:
    """
    ProcessGroupManager is used to manage the process groups in the cluster.

    There are some terms used in this class:
        - pg: the short name for process group
        - pg_name: the name of the process group
        - pg_size: the world size of the process group
        - rank: the rank of the current process in the process group
        - world_size: the total number of processes in the process group
    """

    def __init__(self):
        self.pg_store = dict()

    def create_process_group(self, name: str, ranks: List[int], backend: str = 'nccl') -> ProcessGroup:
        """
        Create a new process group and store it under the given name.

        Args:
            name (str): name of the process group
            ranks (List[int]): ranks of the process group
            backend (str, optional): backend of the process group. Defaults to 'nccl'.

        Returns:
            ProcessGroup: the process group

        Raises:
            ValueError: if a process group with this name already exists
        """
        if name in self.pg_store:
            raise ValueError(f'Process group {name} already exists.')

        pg = dist.new_group(ranks=ranks, backend=backend)
        self.pg_store[name] = pg
        return pg

    def get(self, name: str) -> ProcessGroup:
        """
        Get a process group by name.

        Args:
            name (str): name of the process group

        Returns:
            ProcessGroup: the process group

        Raises:
            ValueError: if no process group is stored under this name
        """
        if name not in self.pg_store:
            raise ValueError(f'Process group {name} does not exist.')
        return self.pg_store[name]

    def destroy(self, name: str) -> None:
        """
        Destroy a process group by name.

        Args:
            name (str): name of the process group

        Raises:
            ValueError: if no process group is stored under this name
        """
        if name not in self.pg_store:
            raise ValueError(f'Process group {name} does not exist.')

        dist.destroy_process_group(self.pg_store[name])
        del self.pg_store[name]

    def destroy_all(self) -> None:
        """
        Destroy all process groups.
        """
        for pg in self.pg_store.values():
            dist.destroy_process_group(pg)
        self.pg_store.clear()
from .collective import all_gather, reduce_scatter, all_reduce, broadcast, reduce
from .p2p import (send_forward, send_forward_recv_forward, send_backward_recv_forward, send_backward,
send_backward_recv_backward, send_forward_recv_backward, send_forward_backward_recv_forward_backward,
recv_forward, recv_backward)
from .ring import ring_forward
from .utils import send_obj_meta, recv_obj_meta
__all__ = [
'all_gather',
'reduce_scatter',
'all_reduce',
'broadcast',
'reduce',
'send_forward',
'send_forward_recv_forward',
'send_forward_backward_recv_forward_backward',
'send_backward',
'send_backward_recv_backward',
'send_backward_recv_forward',
'send_forward_recv_backward',
'recv_backward',
'recv_forward',
'ring_forward',
'send_obj_meta',
'recv_obj_meta',
]
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import torch
import torch.distributed as dist
from torch import Tensor
from torch.distributed import ReduceOp
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
# Select the flat-tensor all-gather entry point once at import time: use
# `dist.all_gather_into_tensor` when `torch.distributed` exposes that name,
# otherwise fall back to the private `dist._all_gather_base`.
_all_gather_func = dist._all_gather_base \
if "all_gather_into_tensor" not in dir(dist) else dist.all_gather_into_tensor
# Same selection for reduce-scatter: `dist.reduce_scatter_tensor` when it is
# available, otherwise the private `dist._reduce_scatter_base`.
_reduce_scatter_func = dist._reduce_scatter_base \
if "reduce_scatter_tensor" not in dir(dist) else dist.reduce_scatter_tensor
def all_gather(tensor: Tensor, dim: int, parallel_mode: ParallelMode, async_op: bool = False) -> Tensor:
    r"""Collects the tensors of every rank in the parallel group and joins them
    along one dimension.

    When the group holds a single rank the input is returned untouched. Otherwise
    ``dim`` is swapped to the front, the gather writes into a new buffer whose first
    dimension is ``depth`` times larger, and the result is swapped back.

    Note:
        The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.

    Args:
        tensor (:class:`torch.Tensor`): Tensor to be gathered.
        dim (int): The dimension concatenating in.
        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
        async_op (bool, optional): Whether operations are asynchronous.

    Returns:
        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The gathered tensor only,
        if async_op is set to False. A tuple of the gathered tensor and the async work handle, if async_op is set to True
        (the handle is None when the group holds a single rank).
    """
    depth = gpc.get_world_size(parallel_mode)

    if depth == 1:
        # nothing to communicate
        out, work = tensor, None
    else:
        along_first_dim = dim == 0
        tensor_in = tensor.contiguous() if along_first_dim else tensor.transpose(0, dim).contiguous()

        gathered_shape = (tensor_in.shape[0] * depth,) + tensor_in.shape[1:]
        tensor_out = torch.empty(gathered_shape, dtype=tensor.dtype, device=tensor.device)

        # CPU tensors go through the CPU group of the parallel mode
        if tensor.device.type == "cpu":
            group = gpc.get_cpu_group(parallel_mode)
        else:
            group = gpc.get_group(parallel_mode)

        work = _all_gather_func(tensor_out, tensor_in, group=group, async_op=async_op)
        out = tensor_out if along_first_dim else tensor_out.transpose(0, dim)

    return (out, work) if async_op else out
def reduce_scatter(tensor: Tensor,
                   dim: int,
                   parallel_mode: ParallelMode,
                   op: ReduceOp = ReduceOp.SUM,
                   async_op: bool = False) -> Tensor:
    r"""Reduces the tensors of every rank in the parallel group, then hands each
    rank its slice of the result along one dimension.

    When the group holds a single rank the input is returned untouched. Otherwise
    ``dim`` is swapped to the front, the output buffer has a first dimension of
    ``shape[0] // depth``, and the result is swapped back.

    Note:
        The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.

    Args:
        tensor (:class:`torch.Tensor`): Tensor to be reduce_scattered.
        dim (int): The dimension concatenating in.
        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
        op (torch.distributed.ReduceOp, optional): The type of reduce operation,
            should be included in [SUM, AVG, PRODUCT, MIN, MAX, BAND, BOR, BXOR].
            More details about ReduceOp please refer to
            `ReduceOp <https://pytorch.org/docs/stable/distributed.html#torch.distributed.ReduceOp>`_.
        async_op (bool, optional): Whether operations are asynchronous.

    Returns:
        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of reduce_scatter only,
        if async_op is set to False. A tuple of the reduce_scatter output and the async work handle, if async_op is set to True
        (the handle is None when the group holds a single rank).
    """
    depth = gpc.get_world_size(parallel_mode)

    if depth == 1:
        # nothing to communicate
        out, work = tensor, None
    else:
        along_first_dim = dim == 0
        tensor_in = tensor.contiguous() if along_first_dim else tensor.transpose(0, dim).contiguous()

        scattered_shape = (tensor_in.shape[0] // depth,) + tensor_in.shape[1:]
        tensor_out = torch.empty(scattered_shape, dtype=tensor.dtype, device=tensor.device)

        # CPU tensors go through the CPU group of the parallel mode
        if tensor.device.type == "cpu":
            group = gpc.get_cpu_group(parallel_mode)
        else:
            group = gpc.get_group(parallel_mode)

        work = _reduce_scatter_func(tensor_out, tensor_in, op=op, group=group, async_op=async_op)
        out = tensor_out if along_first_dim else tensor_out.transpose(0, dim)

    return (out, work) if async_op else out
def all_reduce(tensor: Tensor,
               parallel_mode: ParallelMode,
               op: ReduceOp = ReduceOp.SUM,
               async_op: bool = False) -> Tensor:
    r"""Reduces the tensor data across the whole parallel group so that every member ends up with the final result.

    Note:
        The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.

    Args:
        tensor (:class:`torch.Tensor`): Tensor to be all-reduced.
        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
        op (torch.distributed.ReduceOp, optional): The type of reduce operation,
            should be included in [SUM, AVG, PRODUCT, MIN, MAX, BAND, BOR, BXOR].
            More details about ReduceOp please refer to
            `ReduceOp <https://pytorch.org/docs/stable/distributed.html#torch.distributed.ReduceOp>`_.
        async_op (bool, optional): Whether operations are asynchronous.

    Returns:
        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of all-reduce only,
        if async_op is set to False. A tuple of the all-reduce output and the async work handle,
        if async_op is set to True (the handle is ``None`` when the group has a single member).
    """
    if gpc.get_world_size(parallel_mode) == 1:
        # Single-member group: the input already is the reduced result.
        result, handle = tensor, None
    else:
        # The collective runs on a contiguous tensor; this is the input itself when
        # it is already contiguous, otherwise a contiguous copy.
        result = tensor.contiguous()
        if tensor.device.type == "cpu":
            group = gpc.get_cpu_group(parallel_mode)
        else:
            group = gpc.get_group(parallel_mode)
        handle = dist.all_reduce(result, op=op, group=group, async_op=async_op)
    if async_op:
        return result, handle
    return result
def broadcast(tensor: Tensor, src: int, parallel_mode: ParallelMode, async_op: bool = False):
    r"""Broadcasts a tensor to the whole parallel group. The tensor must have the same
    number of elements in all processes participating in the collective.

    Note:
        The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.

    Args:
        tensor (:class:`torch.Tensor`): Tensor to be broadcast.
        src (int): Source rank.
        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
        async_op (bool, optional): Whether operations are asynchronous.

    Returns:
        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The broadcast tensor only,
        if async_op is set to False. A tuple of the broadcast tensor and the async work handle,
        if async_op is set to True (the handle is ``None`` when the group has a single member).
    """
    if gpc.get_world_size(parallel_mode) == 1:
        # Single-member group: there is nobody to broadcast to.
        result, handle = tensor, None
    else:
        # The collective runs on a contiguous tensor (the input itself if already contiguous).
        result = tensor.contiguous()
        if tensor.device.type == "cpu":
            group = gpc.get_cpu_group(parallel_mode)
        else:
            group = gpc.get_group(parallel_mode)
        handle = dist.broadcast(result, src=src, group=group, async_op=async_op)
    if async_op:
        return result, handle
    return result
def reduce(tensor: Tensor, dst: int, parallel_mode: ParallelMode, op: ReduceOp = ReduceOp.SUM, async_op: bool = False):
    r"""Reduces a tensor across the whole parallel group. Only the process with
    rank ``dst`` is going to receive the final result.

    Note:
        The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.

    Args:
        tensor (:class:`torch.Tensor`): Tensor to be reduced.
        dst (int): Destination rank.
        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
        op (torch.distributed.ReduceOp, optional): The type of reduce operation, defaults to ``ReduceOp.SUM``.
        async_op (bool, optional): Whether operations are asynchronous.

    Returns:
        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of reduce only,
        if async_op is set to False. A tuple of the reduce output and the async work handle,
        if async_op is set to True (the handle is ``None`` when the group has a single member).
    """
    if gpc.get_world_size(parallel_mode) == 1:
        # Single-member group: the input already is the reduced result.
        result, handle = tensor, None
    else:
        # The collective runs on a contiguous tensor (the input itself if already contiguous).
        result = tensor.contiguous()
        if tensor.device.type == "cpu":
            group = gpc.get_cpu_group(parallel_mode)
        else:
            group = gpc.get_group(parallel_mode)
        handle = dist.reduce(result, dst=dst, op=op, group=group, async_op=async_op)
    if async_op:
        return result, handle
    return result
def scatter_object_list(scatter_object_output_list, scatter_object_input_list, src=0, group=None) -> None:
    r"""Modified from `torch.distributed.scatter_object_list
    <https://pytorch.org/docs/stable/_modules/torch/distributed/distributed_c10d.html#scatter_object_list>` to fix issues

    Serializes each object of ``scatter_object_input_list`` on the ``src`` rank, scatters one
    serialized object to every rank of ``group`` and stores the deserialized object in
    ``scatter_object_output_list[0]``.

    Args:
        scatter_object_output_list (list): Non-empty list whose first element is overwritten with the received object.
        scatter_object_input_list (list): Objects to scatter; only read on the ``src`` rank.
        src (int, optional): Source rank, compared against the global rank. Defaults to 0.
        group (optional): Process group to work on. Defaults to None.

    Raises:
        RuntimeError: If ``scatter_object_output_list`` is not a list with at least one element.
    """
    c10d = dist.distributed_c10d
    if c10d._rank_not_in_group(group):
        return

    has_output_slot = isinstance(scatter_object_output_list, list) and len(scatter_object_output_list) >= 1
    if not has_output_slot:
        raise RuntimeError("Expected argument scatter_object_output_list to be a list of size at least 1.")

    # set tensor device to cuda if backend is nccl
    if dist.get_backend(group) == 'nccl':
        device = torch.cuda.current_device()
    else:
        device = torch.device("cpu")

    is_src = dist.get_rank() == src    # use global rank

    if is_src:
        payloads, payload_sizes = zip(*(c10d._object_to_tensor(item) for item in scatter_object_input_list))
        payloads = [payload.to(device) for payload in payloads]
        payload_sizes = [size.to(device) for size in payload_sizes]
        # Src rank broadcasts the maximum tensor size. This is because all ranks are
        # expected to call into scatter() with equal-sized tensors.
        max_size = max(payload_sizes)
        for payload in payloads:
            payload.resize_(max_size)
    else:
        payloads = None
        payload_sizes = None
        max_size = torch.tensor([0], dtype=torch.long).to(device)
    dist.broadcast(max_size, src=src, group=group)

    # Scatter actual serialized objects
    received = torch.empty(max_size.item(), dtype=torch.uint8).to(device)
    dist.scatter(received, scatter_list=payloads if is_src else None, src=src, group=group)

    # Scatter per-object sizes to trim tensors when deserializing back to object
    received_size = torch.tensor([0], dtype=torch.long).to(device)
    dist.scatter(received_size, scatter_list=payload_sizes if is_src else None, src=src, group=group)

    # Deserialize back to object
    scatter_object_output_list[0] = c10d._tensor_to_object(received.cpu(), received_size.cpu())
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from typing import List, Tuple, Union
import torch
import torch.distributed as dist
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.utils import get_current_device
from functools import reduce
import operator
from .utils import split_tensor_into_1d_equal_chunks, gather_split_1d_tensor
# Shape descriptor used by the p2p helpers below: a torch.Size or a plain list/tuple of ints.
TensorShape = Union[torch.Size, List[int], Tuple[int]]
def _get_tensor_shape(tensor_shape: TensorShape, chunk_tensor: bool = False) -> Tuple[TensorShape, bool]:
    """Work out the shape actually used on the wire and whether the tensor is sent as a chunk.

    When chunking is requested and the element count is divisible by the tensor-parallel
    world size, the returned "shape" is the per-rank element count (an int). Otherwise the
    original shape is returned and chunking is switched off.

    Args:
        tensor_shape (:class:`torch.Size`): shape of tensor
        chunk_tensor (bool, optional): whether to chunk tensor, defaults to False

    Returns:
        Tuple[Union[:class:`torch.Size`, List[int], Tuple[int]], bool]: exact tensor shape, whether to chunk tensor
    """
    if not chunk_tensor:
        return tensor_shape, chunk_tensor

    numel = reduce(operator.mul, tensor_shape, 1)
    tp_world_size = gpc.get_world_size(ParallelMode.TENSOR)
    if numel % tp_world_size != 0:
        # Cannot be cut into equal chunks: fall back to sending the whole tensor.
        return tensor_shape, False
    return numel // tp_world_size, chunk_tensor
def create_recv_buffer_with_shapes(recv_shapes, dtype, scatter_gather_tensors):
    """Allocate the receive buffer(s) for a p2p communication.

    Args:
        recv_shapes (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): a single shape, or an
            iterable of shapes when several tensors are received.
        dtype (torch.dtype): data type of the allocated buffers.
        scatter_gather_tensors (bool): whether tensors are chunked across the tensor-parallel group
            for communication.

    Returns:
        tuple: ``(buffer, recv_split)``. For a single :class:`torch.Size`, ``buffer`` is one tensor and
        ``recv_split`` tells whether it holds a chunk. Otherwise ``buffer`` is a list of tensors and
        ``recv_split`` is the flag computed for the LAST shape only (``False`` for an empty input).
    """
    if isinstance(recv_shapes, torch.Size):
        recv_chunk_shape, recv_split = _get_tensor_shape(recv_shapes, scatter_gather_tensors)
        buffer_recv = torch.empty(recv_chunk_shape, requires_grad=True, device=get_current_device(), dtype=dtype)
        return buffer_recv, recv_split

    buffer_recv = []
    # Defined up-front so that an empty collection of shapes yields ([], False)
    # instead of failing with UnboundLocalError on the return statement.
    recv_split = False
    for recv_shape in recv_shapes:
        recv_chunk_shape, recv_split = _get_tensor_shape(recv_shape, scatter_gather_tensors)
        tensor_recv = torch.empty(recv_chunk_shape, requires_grad=True, device=get_current_device(), dtype=dtype)
        buffer_recv.append(tensor_recv)
    return buffer_recv, recv_split
def process_object_to_send(object_send, scatter_gather_tensors):
    """Prepare a tensor, or an iterable of tensors, for sending.

    Every tensor that ``_get_tensor_shape`` marks as chunkable is replaced by the result of
    ``split_tensor_into_1d_equal_chunks``; the others are passed through unchanged.

    Args:
        object_send (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): tensor(s) to send.
        scatter_gather_tensors (bool): whether chunking is requested.

    Returns:
        A single tensor if a single tensor was given, otherwise a tuple of tensors.
    """

    def _prepare(single_tensor):
        # The second element of the result tells whether this tensor is chunked.
        if _get_tensor_shape(single_tensor.shape, scatter_gather_tensors)[1]:
            return split_tensor_into_1d_equal_chunks(single_tensor)
        return single_tensor

    if isinstance(object_send, torch.Tensor):
        return _prepare(object_send)
    return tuple([_prepare(item) for item in object_send])
def filling_ops_queue(obj, comm_op, comm_rank, ops_queue):
    """Append one ``dist.P2POp`` per tensor of ``obj`` to ``ops_queue``.

    Args:
        obj (Union[:class:`torch.Tensor`, Iterable[:class:`torch.Tensor`]]): tensor(s) to communicate.
        comm_op (Callable): the p2p operation passed to ``dist.P2POp``.
        comm_rank (int): the peer rank passed to ``dist.P2POp``.
        ops_queue (list): list extended in place with the created operations.
    """
    # Treat a lone tensor as a one-element batch so both cases share one loop.
    tensors = (obj,) if isinstance(obj, torch.Tensor) else obj
    for item in tensors:
        ops_queue.append(dist.P2POp(comm_op, item, comm_rank))
def _gather_recv_chunks(tensor_recv, recv_shape, scatter_gather_tensors):
    """Gather received 1D chunks back into full tensors, deciding per tensor whether it was chunked."""
    if isinstance(tensor_recv, torch.Tensor):
        if _get_tensor_shape(recv_shape, scatter_gather_tensors)[1]:
            tensor_recv = gather_split_1d_tensor(tensor_recv).view(recv_shape).requires_grad_()
        return tensor_recv
    for index in range(len(tensor_recv)):
        # Each buffer was allocated from its own shape, so the chunk decision must be
        # evaluated per shape as well (the sender decides per tensor, too).
        if _get_tensor_shape(recv_shape[index], scatter_gather_tensors)[1]:
            tensor_recv[index] = gather_split_1d_tensor(tensor_recv[index]).view(
                recv_shape[index]).requires_grad_()
    return tensor_recv


def _communicate(object_send_next: Union[torch.Tensor, List[torch.Tensor]] = None,
                 object_send_prev: Union[torch.Tensor, List[torch.Tensor]] = None,
                 recv_prev: bool = False,
                 recv_next: bool = False,
                 recv_prev_shape: Union[torch.Size, List[torch.Size]] = None,
                 recv_next_shape: Union[torch.Size, List[torch.Size]] = None,
                 prev_rank: int = None,
                 next_rank: int = None,
                 dtype: torch.dtype = None,
                 scatter_gather_tensors: bool = False) -> Tuple[Union[torch.Tensor, List[torch.Tensor]]]:
    """
    Adapted from megatron.p2p_communication.
    Communicate tensors between stages. Used as helper method in other
    communication methods that are used in pipeline schedule.

    Args:
        object_send_next (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): tensor to send to next rank
            (no tensor sent if set to None).
        object_send_prev (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): tensor to send to prev rank
            (no tensor sent if set to None).
        recv_prev (bool): boolean for whether tensor should be received from previous rank.
        recv_next (bool): boolean for whether tensor should be received from next rank.
        recv_prev_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): shape of the tensor to be received
            from the previous stage, defaults to None.
        recv_next_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): shape of the tensor to be received
            from the next stage, defaults to None.
        prev_rank (int): the rank of the previous pipeline stage, defaults to None
            (then looked up from the pipeline parallel context when needed).
        next_rank (int): the rank of the next pipeline stage, defaults to None
            (then looked up from the pipeline parallel context when needed).
        dtype (torch.dtype): data type of intermediate buffers, defaults to None
        scatter_gather_tensors (bool): whether to scatter and gather tensor between pipeline stages, defaults to False

    Returns:
        Tuple[Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]]: returns tensor_recv_prev, tensor_recv_next
        (each is None when nothing was received from that side).
    """
    # Create placeholder tensors for receive in forward and backward directions
    # if needed.
    tensor_recv_prev = None
    tensor_recv_next = None

    if recv_prev:
        assert recv_prev_shape is not None
        # The returned split flag only describes the last shape of a list, so it is not
        # used here; the gather step below re-evaluates the decision per tensor.
        tensor_recv_prev, _ = create_recv_buffer_with_shapes(recv_prev_shape, dtype, scatter_gather_tensors)

    if recv_next:
        assert recv_next_shape is not None
        tensor_recv_next, _ = create_recv_buffer_with_shapes(recv_next_shape, dtype, scatter_gather_tensors)

    if object_send_prev is not None or recv_prev:
        if prev_rank is None:
            prev_rank = gpc.get_prev_global_rank(ParallelMode.PIPELINE)

    if object_send_next is not None or recv_next:
        if next_rank is None:
            next_rank = gpc.get_next_global_rank(ParallelMode.PIPELINE)

    if object_send_prev is not None:
        object_send_prev = process_object_to_send(object_send_prev, scatter_gather_tensors)

    if object_send_next is not None:
        object_send_next = process_object_to_send(object_send_next, scatter_gather_tensors)

    # Queue order is kept as: send-prev, recv-prev, recv-next, send-next.
    ops = []
    if object_send_prev is not None:
        filling_ops_queue(object_send_prev, dist.isend, prev_rank, ops)

    if tensor_recv_prev is not None:
        filling_ops_queue(tensor_recv_prev, dist.irecv, prev_rank, ops)

    if tensor_recv_next is not None:
        filling_ops_queue(tensor_recv_next, dist.irecv, next_rank, ops)

    if object_send_next is not None:
        filling_ops_queue(object_send_next, dist.isend, next_rank, ops)

    if len(ops) > 0:
        reqs = dist.batch_isend_irecv(ops)
        for req in reqs:
            req.wait()
    # To protect against race condition when using batch_isend_irecv().
    torch.cuda.synchronize()

    # Rebuild full tensors from received chunks. The decision is taken for every tensor
    # individually: with a list of shapes, some may be divisible by the tensor-parallel
    # world size (sent as chunks) while others are not (sent whole).
    if recv_prev:
        tensor_recv_prev = _gather_recv_chunks(tensor_recv_prev, recv_prev_shape, scatter_gather_tensors)

    if recv_next:
        tensor_recv_next = _gather_recv_chunks(tensor_recv_next, recv_next_shape, scatter_gather_tensors)

    return tensor_recv_prev, tensor_recv_next
def recv_forward(input_tensor_shape,
                 prev_rank=None,
                 dtype=torch.float,
                 scatter_gather_tensors=False) -> Union[torch.Tensor, List[torch.Tensor]]:
    """Receive the forward output of the previous pipeline stage as the input tensor of this stage.

    Args:
        input_tensor_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): The shape of the tensor to be received.
        prev_rank (int, optional): The rank of the source of the tensor.
        dtype (torch.dtype, optional): Data type of the receive buffer, defaults to ``torch.float``.
        scatter_gather_tensors (bool, optional): Forwarded to ``_communicate``, defaults to False.

    Returns:
        Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: The input tensor or input tensor list;
        ``None`` on the first pipeline stage.
    """
    # The first stage has no predecessor to receive from.
    if gpc.is_pipeline_first_stage():
        return None
    received = _communicate(recv_prev=True,
                            recv_prev_shape=input_tensor_shape,
                            prev_rank=prev_rank,
                            dtype=dtype,
                            scatter_gather_tensors=scatter_gather_tensors)
    return received[0]
def recv_backward(output_grad_shape,
                  next_rank=None,
                  dtype=torch.float,
                  scatter_gather_tensors=False) -> Union[torch.Tensor, List[torch.Tensor]]:
    """Receive the gradient tensor from the next pipeline stage as the input gradient of this stage.

    Args:
        output_grad_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): The shape of the tensor to be received.
        next_rank (int, optional): The rank of the source of the tensor.
        dtype (torch.dtype, optional): Data type of the receive buffer, defaults to ``torch.float``.
        scatter_gather_tensors (bool, optional): Forwarded to ``_communicate``, defaults to False.

    Returns:
        Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: The input gradient tensor or gradient tensor list;
        ``None`` on the last pipeline stage.
    """
    # The last stage has no successor to receive gradients from.
    if gpc.is_pipeline_last_stage():
        return None
    received = _communicate(recv_next=True,
                            recv_next_shape=output_grad_shape,
                            next_rank=next_rank,
                            dtype=dtype,
                            scatter_gather_tensors=scatter_gather_tensors)
    return received[1]
def send_forward(output_tensor, next_rank=None, scatter_gather_tensors=False) -> None:
    """Send the output tensor of this stage to the next stage in pipeline.

    Nothing is sent on the last pipeline stage.

    Args:
        output_tensor (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Tensor to be sent.
        next_rank (int, optional): The rank of the recipient of the tensor.
        scatter_gather_tensors (bool, optional): Forwarded to ``_communicate``, defaults to False.
    """
    if gpc.is_pipeline_last_stage():
        return
    _communicate(object_send_next=output_tensor,
                 next_rank=next_rank,
                 scatter_gather_tensors=scatter_gather_tensors)
def send_backward(input_tensor_grad, prev_rank=None, scatter_gather_tensors=False) -> None:
    """Send the gradient tensor to the previous stage in pipeline.

    Nothing is sent on the first pipeline stage.

    Args:
        input_tensor_grad (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Tensor to be sent
        prev_rank (int, optional): The rank of the recipient of the tensor
        scatter_gather_tensors (bool, optional): Forwarded to ``_communicate``, defaults to False.
    """
    if gpc.is_pipeline_first_stage():
        return
    _communicate(object_send_prev=input_tensor_grad,
                 prev_rank=prev_rank,
                 scatter_gather_tensors=scatter_gather_tensors)
def send_forward_recv_backward(output_tensor,
                               output_grad_shape,
                               recv_next=True,
                               next_rank=None,
                               dtype=torch.float,
                               scatter_gather_tensors=False) -> Union[torch.Tensor, List[torch.Tensor]]:
    """Batched communication operation. Sends the output tensor of this stage to the
    next stage in pipeline, while receiving the gradient tensor from the
    next stage in pipeline as the input gradient tensor of this stage.

    Args:
        output_tensor (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Tensor to be sent.
        output_grad_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): The shape of the tensor to be received.
        recv_next (bool, optional): Whether to receive from the next stage, defaults to True.
        next_rank (int, optional): The rank of the next pipeline stage.
        dtype (torch.dtype, optional): Data type of the receive buffer, defaults to ``torch.float``.
        scatter_gather_tensors (bool, optional): Forwarded to ``_communicate``, defaults to False.

    Returns:
        Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: The input gradient tensor;
        ``None`` on the last pipeline stage.
    """
    # The last stage has no successor: nothing to send or receive.
    if gpc.is_pipeline_last_stage():
        return None
    received = _communicate(object_send_next=output_tensor,
                            recv_next=recv_next,
                            recv_next_shape=output_grad_shape,
                            next_rank=next_rank,
                            dtype=dtype,
                            scatter_gather_tensors=scatter_gather_tensors)
    return received[1]
def send_backward_recv_forward(input_tensor_grad,
                               input_tensor_shape,
                               recv_prev=True,
                               prev_rank=None,
                               dtype=torch.float,
                               scatter_gather_tensors=False) -> Union[torch.Tensor, List[torch.Tensor]]:
    """Batched communication operation. Sends the gradient tensor to the
    previous stage in pipeline, while receiving the output tensor from the
    previous stage in pipeline as the input of this stage.

    Args:
        input_tensor_grad (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Tensor to be sent.
        input_tensor_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): The shape of the tensor to be received.
        recv_prev (bool, optional): Whether to receive from the previous stage, defaults to True.
        prev_rank (int, optional): The rank of the previous pipeline stage.
        dtype (torch.dtype, optional): Data type of the receive buffer, defaults to ``torch.float``.
        scatter_gather_tensors (bool, optional): Forwarded to ``_communicate``, defaults to False.

    Returns:
        Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: The input tensor;
        ``None`` on the first pipeline stage.
    """
    # The first stage has no predecessor: nothing to send or receive.
    if gpc.is_pipeline_first_stage():
        return None
    received = _communicate(object_send_prev=input_tensor_grad,
                            recv_prev=recv_prev,
                            recv_prev_shape=input_tensor_shape,
                            prev_rank=prev_rank,
                            dtype=dtype,
                            scatter_gather_tensors=scatter_gather_tensors)
    return received[0]
def send_forward_recv_forward(output_tensor,
                              input_tensor_shape,
                              recv_prev=True,
                              prev_rank=None,
                              next_rank=None,
                              dtype=torch.float,
                              scatter_gather_tensors=False) -> Union[torch.Tensor, List[torch.Tensor]]:
    """Batched communication operation. Sends the output tensor of this stage to the
    next stage in pipeline, while receiving the output tensor from the
    previous stage in pipeline as the input of this stage.

    Args:
        output_tensor (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Tensor to be sent.
        input_tensor_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): The shape of the tensor to be received.
        recv_prev (bool, optional): Whether to receive from the previous stage, defaults to True.
        prev_rank (int, optional): The rank of the previous pipeline stage.
        next_rank (int, optional): The rank of the next pipeline stage.
        dtype (torch.dtype, optional): Data type of the receive buffer, defaults to ``torch.float``.
        scatter_gather_tensors (bool, optional): Forwarded to ``_communicate``, defaults to False.

    Returns:
        Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: The input tensor.
    """
    received_prev, _unused_next = _communicate(object_send_next=output_tensor,
                                               recv_prev=recv_prev,
                                               recv_prev_shape=input_tensor_shape,
                                               prev_rank=prev_rank,
                                               next_rank=next_rank,
                                               dtype=dtype,
                                               scatter_gather_tensors=scatter_gather_tensors)
    return received_prev
def send_backward_recv_backward(input_tensor_grad,
                                output_grad_shape,
                                recv_next=True,
                                prev_rank=None,
                                next_rank=None,
                                dtype=torch.float,
                                scatter_gather_tensors=False) -> Union[torch.Tensor, List[torch.Tensor]]:
    """Batched communication operation. Sends the gradient tensor to the
    previous stage in pipeline, while receiving the gradient tensor from the
    next member in pipeline as the input of this stage.

    Args:
        input_tensor_grad (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Tensor to be sent.
        output_grad_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): The shape of the tensor to be received.
        recv_next (bool, optional): Whether to receive from the next stage, defaults to True.
        prev_rank (int, optional): The rank of the previous pipeline stage.
        next_rank (int, optional): The rank of the next pipeline stage.
        dtype (torch.dtype, optional): Data type of the receive buffer, defaults to ``torch.float``.
        scatter_gather_tensors (bool, optional): Forwarded to ``_communicate``, defaults to False.

    Returns:
        Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: The input gradient tensor.
    """
    _unused_prev, received_next = _communicate(object_send_prev=input_tensor_grad,
                                               recv_next=recv_next,
                                               recv_next_shape=output_grad_shape,
                                               prev_rank=prev_rank,
                                               next_rank=next_rank,
                                               dtype=dtype,
                                               scatter_gather_tensors=scatter_gather_tensors)
    return received_next
def send_forward_backward_recv_forward_backward(
        output_tensor,
        input_tensor_grad,
        input_tensor_shape,
        output_grad_shape,
        recv_prev=True,
        recv_next=True,
        prev_rank=None,
        next_rank=None,
        dtype=torch.float,
        scatter_gather_tensors=False) -> Tuple[Union[torch.Tensor, List[torch.Tensor]]]:
    """Batched communication operation. Sends the output tensor to the next stage in pipeline and
    the gradient tensor to the previous stage, while receiving the input gradient tensor from the
    next stage and the input tensor from the previous stage.

    Args:
        output_tensor (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Tensor sent to the next.
        input_tensor_grad (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Tensor sent to the previous.
        input_tensor_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): The shape of the tensor received from the previous.
        output_grad_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): The shape of the tensor received from the next.
        recv_prev (bool, optional): Whether to receive from the previous stage, defaults to True.
        recv_next (bool, optional): Whether to receive from the next stage, defaults to True.
        prev_rank (int, optional): The rank of the previous pipeline stage.
        next_rank (int, optional): The rank of the next pipeline stage.
        dtype (torch.dtype, optional): Data type of the receive buffers, defaults to ``torch.float``.
        scatter_gather_tensors (bool, optional): Forwarded to ``_communicate``, defaults to False.

    Returns:
        Tuple(Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]], Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): (the input tensor, the input gradient tensor)
    """
    received_prev, received_next = _communicate(object_send_next=output_tensor,
                                                object_send_prev=input_tensor_grad,
                                                recv_prev=recv_prev,
                                                recv_next=recv_next,
                                                recv_prev_shape=input_tensor_shape,
                                                recv_next_shape=output_grad_shape,
                                                prev_rank=prev_rank,
                                                next_rank=next_rank,
                                                dtype=dtype,
                                                scatter_gather_tensors=scatter_gather_tensors)
    return received_prev, received_next
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment