Unverified Commit b9a8dff7 authored by digger-yu's avatar digger-yu Committed by GitHub
Browse files

[doc] Fix typos under colossalai and doc (#3618)

* Fixed several spelling errors under colossalai

* Fix the spelling errors in the colossalai and docs directories

* Cautiously changed the spelling errors under the example folder

* Update runtime_preparation_pass.py

revert autograft to autograd

* Update search_chunk.py

utill to until

* Update check_installation.py

change mistach to mismatch in line 91

* Update 1D_tensor_parallel.md

revert to perceptron

* Update 2D_tensor_parallel.md

revert to perceptron in line 73

* Update 2p5D_tensor_parallel.md

revert to perceptron in line 71

* Update 3D_tensor_parallel.md

revert to perceptron in line 80

* Update README.md

revert to resnet in line 42

* Update reorder_graph.py

revert to indice in line 7

* Update p2p.py

revert to megatron in line 94

* Update initialize.py

revert to torchrun in line 198

* Update routers.py

change to detailed in line 63

* Update routers.py

change to detailed in line 146

* Update README.md

revert the random number in line 402
parent e1b0a78a
......@@ -138,7 +138,7 @@ def emit_ckpt_func(body,
delete_unused_value_func,
ckpt_level=0,
in_ckpt=False):
"""Emit ckpt fuction in nested way
"""Emit ckpt function in nested way
Args:
body: forward code - in recursive calls, this part will be checkpoint
......
......@@ -111,7 +111,7 @@ class Region:
Copy data slice to the memory space indexed by the input tensor in the region.
Args:
param (torch.nn.Parameter): the param used to retrive meta information
param (torch.nn.Parameter): the param used to retrieve meta information
data_slice (torch.Tensor): the tensor to be copied to the region
"""
......
......@@ -22,7 +22,7 @@ class TrainingSimulator(ABC):
Args:
region_list (List[Region]): represents the linearized DNN computing graph.
comp_power (float): the NVIDIA GPU FP16 compuing power.
comp_power (float): the NVIDIA GPU FP16 computing power.
link_to_bw (Dict[str, Dict[float, float]]): communication links and the corresponding bandwidth.
"""
......
......@@ -149,7 +149,7 @@ def size_value_converting_pass(gm: torch.fx.GraphModule, device_mesh: DeviceMesh
def _extract_target_dim(node):
'''
A helper function to etract the target dimension from size node.
A helper function to extract the target dimension from size node.
There are two usages of torch.Tensor.size:
1. tensor.size()
2. tensor.size(dim)
......@@ -427,7 +427,7 @@ def module_params_sharding_pass(gm: torch.fx.GraphModule, device_mesh: DeviceMes
if target_sharding_spec.dim_partition_dict != {}:
origin_sharding_spec = ShardingSpec(device_mesh, param.shape, {})
setattr(param, 'sharding_spec', origin_sharding_spec)
# TODO: build a ColoParamter class to manager the distributed parameters
# TODO: build a ColoParameter class to manager the distributed parameters
# we could use .data here, because all the operations just happen before the real training
# loop, so we don't need to track these operations in the autograd graph.
param = torch.nn.Parameter(
......
......@@ -287,7 +287,7 @@ def emit_code_with_chunk(body: List[str],
body = _replace_new_tensor_like_shape(search_chunk, chunk_infos, region_idx, node_idx, node, body)
# new tensor
body = _replace_new_tensor_shape(search_chunk, chunk_infos, region_idx, node_idx, node, body)
# reassgin reshape size
# reassign reshape size
body[-1] = _replace_reshape_size(body[-1], node.name, chunk_infos[region_idx]["reshape_size"])
body[-1] = " " + body[-1]
delete_unused_value_func(node, body, chunk_inputs_names)
......
......@@ -153,7 +153,7 @@ class EstimateMemory(object):
Returns:
act_memory_peak_log (List): peak memory of every node
act_memory_after_node_log (List): memory after excuting every node
act_memory_after_node_log (List): memory after executing every node
active_node_list_log (List): active nodes of every node. active nodes refer to
nodes generated but not deleted.
"""
......
......@@ -16,7 +16,7 @@ class SearchChunk(object):
This is the core class for AutoChunk.
It defines the framework of the strategy of AutoChunk.
Chunks will be selected one by one utill search stops.
Chunks will be selected one by one until search stops.
The chunk search is as follows:
1. find the peak memory node
......@@ -73,7 +73,7 @@ class SearchChunk(object):
def _find_peak_region(self, mem_peak: List) -> int:
"""
find peak node, along with its neighbour nodes exceeds max mem
find peak node, along with its neighbor nodes exceeds max mem
"""
max_value = max(mem_peak)
max_idx = mem_peak.index(max_value)
......@@ -118,7 +118,7 @@ class SearchChunk(object):
chunk_region_start (int)
chunk_region_end (int)
"""
# check if peak node already in chunkinfo
# check if peak node already in chunk info
if chunk_regions is not None:
for i in chunk_regions:
if i["region"][0] < peak_region[0] <= i["region"][1] or \
......
......@@ -479,7 +479,7 @@ class TraceFlow(object):
# check index source align
if not self.check_index_source(start_dim, start_node, start_idx, end_dim, end_node):
return False
# check index copmute
# check index compute
if not self.check_index_compute(start_idx, end_dim, end_node, end_idx):
return False
return True
......@@ -8,7 +8,7 @@ from .utils import NodeMgr, find_first_tensor_arg, flat_list, get_module_node_na
class TraceIndice(object):
"""
Trace all indice infomation for every node.
Trace all indice information for every node.
Indice is a logical concept. Equal dims can been treated as one indice.
eg. dim(x1) = [a, b, c]
......@@ -153,7 +153,7 @@ class TraceIndice(object):
def _inherit_more_indice_from_node_with_exclude(self, node_from: Node, node_to: Node, exclude: List = None) -> None:
"""
inheirt indice from node without init
inherit indice from node without init
"""
if exclude == None:
exclude = []
......@@ -301,7 +301,7 @@ class TraceIndice(object):
def _assign_linear_indice(self, node: Node, node_idx: int) -> None:
"""
Assign indice for linear op.
1. copy trace from input node and change last indice accroding to weight
1. copy trace from input node and change last indice according to weight
2. mark equal for input node last indice, weight first dim and bias dim.
3. inherit input's computation, mark computation for last dim.
......@@ -360,7 +360,7 @@ class TraceIndice(object):
def _assign_matmul_indice(self, node: Node, node_idx: int) -> None:
"""
Assign indice for matmul op.
1. copy trace from matmul_left and change last indice accroding to matmul_right. (assert they have same length)
1. copy trace from matmul_left and change last indice according to matmul_right. (assert they have same length)
2. mark equal for input matmul_left -1 indice and matmul_right -2 dim.
3. inherit matmul_left and matmul_right computation, mark computation for last dim.
......@@ -720,11 +720,11 @@ class TraceIndice(object):
Assign indice for view and reshape op.
1. get origin shape and target shape by meta info.
2. compute the real value of -1 in target shape.
3. determine changed dim, and assgin indice for generated dim.
3. determine changed dim, and assign indice for generated dim.
4. log changed dim and generated dim for restore
5. inherit computation.
6. look into view list to see whether the view is associated with other,
if so assgin equal dim according to previous view.
if so assign equal dim according to previous view.
Args:
node (node)
......
......@@ -20,7 +20,7 @@ __all__ = ['Booster']
class Booster:
"""
Booster is a high-level API for training neural networks. It provides a unified interface for
training with different precisio, accelerator, and plugin.
training with different precision, accelerator, and plugin.
Examples:
>>> colossalai.launch(...)
......
......@@ -71,7 +71,7 @@ class CheckpointIO(ABC):
Args:
model (nn.Module): model to be loaded.
checkpoint (str): checkpoint path. This value is made compatiblity with the model checkpoints in the
checkpoint (str): checkpoint path. This value is made compatibility with the model checkpoints in the
mainstream model zoos such as Hugging Face and TIMM. The checkpoint path can be:
1. a file path, e.g. 'model.pt'
2. a path to a json file which defines the index to the sharded checkpoint
......@@ -127,7 +127,7 @@ class CheckpointIO(ABC):
1. a file path, e.g. 'model.pt'
2. a directory path to save the sharded checkpoint, e.g. './checkpoints/' when shard = True.
shard (bool): whether to shard the checkpoint. Default: False. If set to True, the checkpoint will be sharded into
multiple files. The model shards will be specificed by a `model.index.json` file. When shard = True, please ensure
multiple files. The model shards will be specified by a `model.index.json` file. When shard = True, please ensure
that the checkpoint path is a directory path instead of a file path.
gather_dtensor (bool): whether to gather the distributed tensor to the first device. Default: True.
variant (str): If specified, weights are saved in the format pytorch_model.<variant>.bin. Default: None.
......@@ -149,7 +149,7 @@ class CheckpointIO(ABC):
Args:
optimizer (Optimizer): optimizer to be loaded.
checkpoint (str): checkpoint path. This value is made compatiblity with the model checkpoints in the
checkpoint (str): checkpoint path. This value is made compatibility with the model checkpoints in the
"""
index_file_exists, index_file_path = has_index_file(checkpoint)
......@@ -180,7 +180,7 @@ class CheckpointIO(ABC):
2. a path to a json file which defines the index to the sharded checkpoint for the optimizer
3. a path to a folder containing a unique .index.json file for sharded checkpoint
shard (bool): whether to shard the checkpoint. Default: False. If set to True, the checkpoint will be sharded into
multiple files. The optimizer shards will be specificed by a `optimizer.index.json` file.
multiple files. The optimizer shards will be specified by a `optimizer.index.json` file.
gather_dtensor (bool): whether to gather the distributed tensor to the first device. Default: True.
prefix (str): prefix for the optimizer checkpoint when shard = True. Default: None.
size_per_shard (int): size per shard in MB. Default: 1024. This value is only used when shard is set to True.
......
......@@ -76,7 +76,7 @@ def check_installation():
click.echo("")
click.echo(f"Note:")
click.echo(
f"1. AOT (ahead-of-time) compilation of the CUDA kernels occurs during installation when the environment varialbe CUDA_EXT=1 is set"
f"1. AOT (ahead-of-time) compilation of the CUDA kernels occurs during installation when the environment variable CUDA_EXT=1 is set"
)
click.echo(f"2. If AOT compilation is not enabled, stay calm as the CUDA kernels can still be built during runtime")
......@@ -88,7 +88,7 @@ def check_installation():
click.echo(f"Note:")
click.echo(f"1. The table above checks the version compatibility of the libraries/tools in the current environment")
click.echo(
f" - PyTorch version mistach: whether the PyTorch version in the current environment is compatible with the PyTorch version used for AOT compilation"
f" - PyTorch version mismatch: whether the PyTorch version in the current environment is compatible with the PyTorch version used for AOT compilation"
)
click.echo(
f" - System and PyTorch CUDA version match: whether the CUDA version in the current environment is compatible with the CUDA version required by PyTorch"
......
......@@ -103,10 +103,10 @@ def _communicate(object_send_next: Union[torch.Tensor, List[torch.Tensor]] = Non
previous rank.
recv_next (bool): boolean for whether tensor should be received from
next rank.
recv_prev_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): shape of the tensor to be received from the previous stage, defualts to None.
recv_next_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): shape of the tensor to be received from the next stage, defualts to None.
prev_rank (int): the rank of the previous pipeline stage, defualts to None,
next_rank (int): the rank of the next pipeline stage, defualts to None,
recv_prev_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): shape of the tensor to be received from the previous stage, defaults to None.
recv_next_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): shape of the tensor to be received from the next stage, defaults to None.
prev_rank (int): the rank of the previous pipeline stage, defaults to None,
next_rank (int): the rank of the next pipeline stage, defaults to None,
dtype (torch.dtype): data type of intermediate buffers, defaults to None
scatter_gather_tensors (bool): whether to scatter and gather tensor between pipeline stages, defaults to False
......
......@@ -230,7 +230,7 @@ def recv_backward(next_rank: int = None) -> Any:
next_rank (int, optional): The rank of the source of the tensor.
Returns:
Any: The input gradient tensor or gradident tensor list.
Any: The input gradient tensor or gradient tensor list.
"""
if gpc.is_pipeline_last_stage():
output_tensor_grad = None
......
......@@ -64,7 +64,7 @@ class MoeContext(metaclass=SingletonMeta):
from colossalai.core import global_context as gpc
self.max_ep_size = gpc.config.get('max_ep_size', self.world_size)
assert self.world_size % self.max_ep_size == 0, \
"Maximum epxert parallel size must be a factor of the number of GPUs"
"Maximum expert parallel size must be a factor of the number of GPUs"
self.min_dp_size = self.world_size // self.max_ep_size
# Enabling kernel optimization may raise error in some cases
......
......@@ -44,7 +44,7 @@ class ParallelContext(metaclass=SingletonMeta):
# load config from file
self._config = None
# default 3D parallel args, will be overwritten during process group intialization
# default 3D parallel args, will be overwritten during process group initialization
self.world_size = 1
self.data_parallel_size = 1
self.pipeline_parallel_size = 1
......@@ -264,7 +264,7 @@ class ParallelContext(metaclass=SingletonMeta):
"""Adds world size for `parallel_mode`.
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode correponding to the process group
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode corresponding to the process group
world_size (int): The world size to be added
Raises:
......
......@@ -59,23 +59,23 @@ class SeedManager:
self._current_mode = parallel_mode
torch.cuda.set_rng_state(self._seed_states[parallel_mode])
def add_seed(self, parallel_mode: ParallelMode, seed: int, overwrtie: bool = False):
def add_seed(self, parallel_mode: ParallelMode, seed: int, overwrite: bool = False):
"""Adds a seed to the seed manager for `parallel_mode`.
Args:
parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
seed (int): The seed to be added.
overwrtie (bool, optional): Whether allows to overwrite the seed that has been set already
overwrite (bool, optional): Whether allows to overwrite the seed that has been set already
Raises:
AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of :class:`colossalai.context.ParallelMode`
or the seed for `parallel_mode` has been added.
"""
assert isinstance(parallel_mode, ParallelMode), 'A valid ParallelMode must be provided'
if overwrtie is False:
if overwrite is False:
assert parallel_mode not in self._seed_states, f'The seed for {parallel_mode} has been added'
elif parallel_mode in self._seed_states:
print(f"Warnning: {parallel_mode} seed has been overwritten.", flush=True)
print(f"Warning: {parallel_mode} seed has been overwritten.", flush=True)
current_state = torch.cuda.get_rng_state()
torch.cuda.manual_seed(seed)
......
......@@ -305,7 +305,7 @@ def emit_ckpt_func(body,
delete_unused_value_func,
level=0,
in_ckpt=False):
"""Emit ckpt fuction in nested way
"""Emit ckpt function in nested way
Args:
body: forward code, in recursive calls, this part will be checkpoint
functions code
......
......@@ -155,7 +155,7 @@ def split_module(
use_partition = partitions[use_partition_name]
use_partition.outputs.setdefault(def_node.name)
# split nodes into parititons
# split nodes into partitions
for node in m.graph.nodes:
orig_nodes[node.name] = node
......@@ -198,7 +198,7 @@ def split_module(
if len(sorted_partitions) != len(partitions):
raise RuntimeError("cycle exists between partitions!")
# add placeholders to parititons
# add placeholders to partitions
for partition_name in sorted_partitions:
partition = partitions[partition_name]
for input in partition.inputs:
......
......@@ -111,7 +111,7 @@ class MultiHeadAttention(nn.Module):
Arguments:
hidden_size: Total dimension of hidden_size.
nhead: Number of parallel attention heads.
batch_size: Batch Size for one foward
batch_size: Batch Size for one forward
max_seq_len: Max length of input sequence
dropout: Dropout probability
norm_first: perform LayerNorms before attention
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment