"...git@developer.sourcefind.cn:OpenDAS/colossalai.git" did not exist on "80eba05b0abc0ce24f02254cbe2c7b8f9ff5d688"
Unverified commit b9a8dff7, authored by digger-yu and committed by GitHub

[doc] Fix typo under colossalai and doc (#3618)

* Fixed several spelling errors under colossalai

* Fix the spelling error in colossalai and docs directory

* Cautiously changed the spelling errors under the example folder

* Update runtime_preparation_pass.py

revert autograft to autograd

* Update search_chunk.py

utile to until

* Update check_installation.py

change misteach to mismatch in line 91

* Update 1D_tensor_parallel.md

revert to perceptron

* Update 2D_tensor_parallel.md

revert to perceptron in line 73

* Update 2p5D_tensor_parallel.md

revert to perceptron in line 71

* Update 3D_tensor_parallel.md

revert to perceptron in line 80

* Update README.md

revert to resnet in line 42

* Update reorder_graph.py

revert to indice in line 7

* Update p2p.py

revert to megatron in line 94

* Update initialize.py

revert to torchrun in line 198

* Update routers.py

change to detailed in line 63

* Update routers.py

change to detailed in line 146

* Update README.md

revert random number in line 402
parent e1b0a78a
@@ -88,7 +88,7 @@ def colo_embedding_bag(input_tensor: GeneralTensor,
     assert isinstance(weight, ColoTensor)
     input_tensor = convert_to_colo_tensor(input_tensor, weight.get_process_group())
-    # Handle differen parallel actions.
+    # Handle different parallel actions.
     if not weight.has_compute_spec():    # No Model Parallel Applied
         assert weight.is_replicate(), 'Invalid weight spec for native embedding op'
...
@@ -13,7 +13,7 @@ from colossalai.zero.legacy.init_ctx import no_shard_zero_decrator
 class MoeExperts(nn.Module):
-    """Basic class for experts in MoE. It stores what kind of communication expersts use
+    """Basic class for experts in MoE. It stores what kind of communication experts use
     to exchange tokens, how many experts in a single GPU and parallel information such as
     expert parallel size, data parallel size and their distributed communication groups.
     """
@@ -24,7 +24,7 @@ class MoeExperts(nn.Module):
             "This kind of communication has not been implemented yet.\n Please use Experts build function."
         self.comm_name = comm_name
         self.num_total_experts = num_experts
-        # Get the configuration of experts' deployment and parallel information from moe contex
+        # Get the configuration of experts' deployment and parallel information from moe context
         self.num_local_experts, self.dist_info = MOE_CONTEXT.get_info(num_experts)
@@ -32,7 +32,7 @@ class MoeExperts(nn.Module):
 class Experts(MoeExperts):
     """A wrapper class to create experts. It will create E experts across the
     moe model parallel group, where E is the number of experts. Every expert
-    is a instence of the class, 'expert' in initialization parameters.
+    is a instance of the class, 'expert' in initialization parameters.
     Args:
         expert_cls (:class:`torch.nn.Module`): The class of all experts
@@ -146,15 +146,15 @@ class FFNExperts(MoeExperts):
 class TPExperts(MoeExperts):
     """Use tensor parallelism to split each expert evenly, which can deploy experts in
-    case that the number of experts can't be divied by maximum expert parallel size or
+    case that the number of experts can't be divide by maximum expert parallel size or
-    maximum expert parallel size can't be divied by the number of experts.
+    maximum expert parallel size can't be divide by the number of experts.
     """
     def __init__(self, num_experts: int, d_model: int, d_ff: int, activation=None, drop_rate: float = 0):
         super().__init__("all_gather", MOE_CONTEXT.max_ep_size)
         assert d_ff % MOE_CONTEXT.max_ep_size == 0, \
-            "d_ff should be divied by maximum expert parallel size"
+            "d_ff should be divide by maximum expert parallel size"
         p_ff = d_ff // MOE_CONTEXT.max_ep_size
...
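Editor's note: to make the split that the TPExperts docstring describes concrete, here is a minimal sketch of dividing an expert's FFN hidden size across the expert-parallel ranks. The helper name is illustrative, not part of the Colossal-AI API; `max_ep_size` is assumed to come from the MoE context as in the diff above.

```python
# Illustrative sketch: each rank keeps d_ff / max_ep_size columns of the expert FFN.
def split_ffn_hidden(d_ff: int, max_ep_size: int) -> int:
    # Mirrors the assert shown in the diff: d_ff must be divisible by the
    # maximum expert parallel size.
    assert d_ff % max_ep_size == 0, "d_ff should be divisible by maximum expert parallel size"
    return d_ff // max_ep_size    # per-rank slice of the FFN hidden dimension

p_ff = split_ffn_hidden(d_ff=3072, max_ep_size=4)    # -> 768 columns per rank
```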
@@ -25,7 +25,7 @@ from colossalai.zero.legacy.init_ctx import no_shard_zero_context, no_shard_zero
 class MoeLayer(nn.Module):
     """A MoE layer, that puts its input tensor to its gate and uses the output logits
     to router all tokens, is mainly used to exchange all tokens for every expert across
-    the moe tensor group by all to all comunication. Then it will get the output of all
+    the moe tensor group by all to all communication. Then it will get the output of all
     experts and exchange the output. At last returns the output of the moe system.
     Args:
@@ -122,7 +122,7 @@ class MoeModule(nn.Module):
         drop_tks (bool, optional): Whether drops tokens in evaluation
         use_residual (bool, optional): Makes this MoE layer a Residual MoE.
             More information can be found in `Microsoft paper`_.
-        residual_instance (nn.Module, optional): The instance of residual module in Resiual MoE
+        residual_instance (nn.Module, optional): The instance of residual module in Residual MoE
         expert_instance (MoeExperts, optional): The instance of experts module in MoeLayer
         expert_cls (Type[nn.Module], optional): The class of each expert when no instance is given
         expert_args (optional): The args of expert when no instance is given
...
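Editor's note: the MoeLayer docstring above describes routing tokens from gate logits and swapping them across the MoE group with all-to-all. The following is a hypothetical, heavily simplified sketch of that exchange, not the Colossal-AI implementation; it assumes an initialized torch.distributed process group and an equal number of tokens per rank.

```python
# Hypothetical illustration of MoE token dispatch via all-to-all; names are illustrative.
import torch
import torch.distributed as dist

def dispatch_tokens(tokens: torch.Tensor, gate_logits: torch.Tensor, ep_group=None) -> torch.Tensor:
    # Top-1 routing: each token goes to the expert with the largest gate logit.
    expert_idx = gate_logits.argmax(dim=-1)
    # Group tokens by target expert so every rank receives a contiguous slice.
    order = expert_idx.argsort()
    send_buf = tokens[order].contiguous()
    recv_buf = torch.empty_like(send_buf)
    # Exchange tokens across the expert-parallel group (equal splits assumed).
    dist.all_to_all_single(recv_buf, send_buf, group=ep_group)
    return recv_buf
```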
@@ -60,7 +60,7 @@ class MoeRouter(nn.Module, ABC):
 class Top1Router(MoeRouter):
     """Top1 router that returns the dispatch mask [s, e, c] and combine weight [s, e, c]
-    for routing usage. More deailted function can be found in the paper about Switch Transformer
+    for routing usage. More detailed function can be found in the paper about Switch Transformer
     of Google.
     Args:
         capacity_factor_train (float, optional): Capacity factor in routing of training.
@@ -143,7 +143,7 @@ class Top1Router(MoeRouter):
 class Top2Router(MoeRouter):
     """Top2 router that returns the dispatch mask [s, e, c] and combine weight [s, e, c]
-    for routing usage. More deailted function can be found in the paper about ViT-MoE.
+    for routing usage. More detailed function can be found in the paper about ViT-MoE.
     Args:
         capacity_factor_train (float, optional): Capacity factor in routing of training.
         capacity_factor_eval (float, optional): Capacity factor in routing of evaluation.
...
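Editor's note: to give the capacity_factor arguments above some intuition, here is a hedged sketch of top-1 routing with a capacity limit. It is illustrative only (the real routers return an [s, e, c] mask; this simplification returns [s, e]).

```python
# Sketch of top-1 routing with expert capacity; not the library code.
import torch

def top1_dispatch(logits: torch.Tensor, capacity_factor: float) -> torch.Tensor:
    # logits: [num_tokens, num_experts]
    num_tokens, num_experts = logits.shape
    capacity = max(1, int(capacity_factor * num_tokens / num_experts))
    expert_idx = logits.argmax(dim=-1)                            # chosen expert per token
    mask = torch.nn.functional.one_hot(expert_idx, num_experts)   # [s, e]
    # 1-based position of each token inside its expert's queue.
    position_in_expert = torch.cumsum(mask, dim=0) * mask
    # Drop tokens that exceed the expert's capacity.
    return mask * (position_in_expert <= capacity).long()
```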
@@ -12,7 +12,7 @@ class ForceFP32Parameter(torch.nn.Parameter):
 class NormalNoiseGenerator:
-    """Generates a random noisy mask for logtis tensor.
+    """Generates a random noisy mask for logits tensor.
     All noise is generated from a normal distribution :math:`(0, 1 / E^2)`, where
     `E = the number of experts`.
@@ -32,7 +32,7 @@ class NormalNoiseGenerator:
 class UniformNoiseGenerator:
-    """Generates a random noisy mask for logtis tensor.
+    """Generates a random noisy mask for logits tensor.
     copied from mesh tensorflow:
     Multiply values by a random number between :math:`1-epsilon` and :math:`1+epsilon`.
     Makes models more resilient to rounding errors introduced by bfloat16.
...
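Editor's note: the two noise schemes described in these docstrings are simple to write down. A minimal sketch with illustrative names, not the library classes:

```python
# Illustrative sketch of the two kinds of gate noise described above.
import torch

def normal_noise(logits: torch.Tensor, num_experts: int) -> torch.Tensor:
    # Additive noise drawn from N(0, 1/E^2), i.e. std = 1/E, E = number of experts.
    return logits + torch.randn_like(logits) / num_experts

def uniform_noise(logits: torch.Tensor, epsilon: float = 1e-2) -> torch.Tensor:
    # Multiplicative noise in [1 - epsilon, 1 + epsilon], as in Mesh TensorFlow.
    scale = torch.empty_like(logits).uniform_(1.0 - epsilon, 1.0 + epsilon)
    return logits * scale
```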
@@ -439,7 +439,7 @@ class Linear1D_Col(ParallelLayer):
             to all GPUs, otherwise, every GPU will have its output
             which is :math:`Y_i = XA_i`, defaults to False
         skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
-            which is preserved for kernel fusion, defaults to Fals
+            which is preserved for kernel fusion, defaults to False
         weight_initializer (:class:`typing.Callable`, optional):
             The initializer of weight, defaults to kaiming uniform initializer.
         bias_initializer (:class:`typing.Callable`, optional):
@@ -578,7 +578,7 @@ class Linear1D_Row(ParallelLayer):
         dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
         parallel_input (bool, optional): If set to ``True``, it's assumed that the input is split, defaults to False.
         skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
-            which is preserved for kernel fusion, defaults to Fals
+            which is preserved for kernel fusion, defaults to False
         weight_initializer (:class:`typing.Callable`, optional):
             The initializer of weight, defaults to kaiming uniform initializer.
         bias_initializer (:class:`typing.Callable`, optional):
@@ -994,11 +994,11 @@ class PatchEmbedding1D(ColossalaiModule):
     :type dtype: torch.dtype, optional
     :param flatten: whether to flatten output tensor, defaults to True
     :type flatten: bool, optional
-    :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
+    :param weight_initializer: The initializer of weight, defaults to kaiming uniform initializer
     :type weight_initializer: typing.Callable, optional
-    :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
+    :param bias_initializer: The initializer of bias, defaults to xavier uniform initializer
     :type bias_initializer: typing.Callable, optional
-    :param position_embed_initializer: The intializer of position embedding, defaults to zero
+    :param position_embed_initializer: The initializer of position embedding, defaults to zero
     :type position_embed_initializer: typing.Callable, optional
     """
...
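Editor's note: for context on the gather_output flag documented in the Linear1D_Col hunk, in column parallelism each rank holds a weight slice A_i and computes Y_i = X A_i; gathering concatenates the per-rank outputs. A hedged sketch, not the library implementation, assuming an initialized tensor-parallel process group:

```python
# Sketch of a column-parallel linear output; illustrative only.
import torch
import torch.distributed as dist

def column_parallel_forward(x: torch.Tensor, a_i: torch.Tensor,
                            gather_output: bool, tp_group=None) -> torch.Tensor:
    y_i = x @ a_i                      # local result Y_i = X A_i
    if not gather_output:
        return y_i                     # each rank keeps only its own slice
    world_size = dist.get_world_size(tp_group)
    parts = [torch.empty_like(y_i) for _ in range(world_size)]
    dist.all_gather(parts, y_i, group=tp_group)
    return torch.cat(parts, dim=-1)    # full output Y = [Y_0, ..., Y_{p-1}]
```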
@@ -184,7 +184,7 @@ class ColoTensor(torch.Tensor):
         # we have to capture the `backward` function
         # and make sure that it does not in `torch._C.DisableTorchFunction()` context
         if func is torch.Tensor.backward:
-            assert len(args) == 1    # only has 1 paramter
+            assert len(args) == 1    # only has 1 parameter
             backward_tensor = torch.Tensor(args[0])
             tensor_kwargs = {k: torch.Tensor(v) if torch.is_tensor(v) else v for k, v in kwargs.items()}
             return backward_tensor.backward(**tensor_kwargs)
@@ -228,7 +228,7 @@ class ColoTensor(torch.Tensor):
         2. If the pg is not not None and not equal to the current process group.
         First, convert the tensor as replicated among the TP process group.
         Second, reset the process group to the new pg.
-        Third, conver the tensor (new replicated both among the tp process group) to the new dist_spec.
+        Third, convert the tensor (new replicated both among the tp process group) to the new dist_spec.
         Args:
             dist_spec (_DistSpec): the new dist spec.
@@ -297,7 +297,7 @@ class ColoTensor(torch.Tensor):
     def size_global(self, *args) -> torch.Size:
         """size_global
-        override the torch buildin size()
+        override the torch building size()
         the shape passed in must be in a replicate placement.
         Returns:
...
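Editor's note: the three-step conversion spelled out in the docstring above can be read as the following hypothetical pseudocode. Apart from `get_process_group` (visible earlier in the diff), the method names here are assumptions used for illustration, not the ColoTensor API.

```python
# Hypothetical pseudocode for the documented steps; `to_replicate_in`,
# `set_process_group` and `convert_dist_spec` are illustrative names only.
def set_dist_spec_across_groups(tensor, new_pg, new_dist_spec):
    if new_pg is not None and new_pg != tensor.get_process_group():
        tensor = tensor.to_replicate_in(tensor.get_process_group())  # 1. replicate within the old TP group
        tensor.set_process_group(new_pg)                             # 2. switch to the new process group
    return tensor.convert_dist_spec(new_dist_spec)                   # 3. convert to the new dist_spec
```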
@@ -391,7 +391,7 @@ class CommSpec:
     to determine the buffer shape, and logical_process_axis
     Argument:
-        comm_pattern(CollectiveCommPattern): decribe the communication method used in this spec.
+        comm_pattern(CollectiveCommPattern): describe the communication method used in this spec.
         sharding_spec(ShardingSpec): This is sharding spec of the tensor which will join the communication action.
         gather_dim(int, Optional): The gather_dim of the tensor will be gathered.
         shard_dim(int, Optional): The shard_dim of the tensor will be sharded.
...
@@ -10,7 +10,7 @@ class ComputePattern(Enum):
 class ComputeSpec(object):
     """ComputeSpec
-    The Specification for compuattion pattern
+    The Specification for computation pattern
     Args:
         compute_pattern (ComputePattern): an Enum instance for compute pattern.
...
@@ -14,7 +14,7 @@ class Layout:
     """Layout of a tensor.
     Attributes:
-        device_mesh: the device mesh to store the tensor distributedly.
+        device_mesh: the device mesh to store the tensor distributed.
         device_type: the type of the device mesh, e.g. 'cpu' or 'cuda'.
         sharding_spec: the sharding specification to describe how the tensor is sharded.
         entire_shape: the entire shape of the global tensor.
...
@@ -14,7 +14,7 @@ NAN = 'nan'
 class DimSpec:
     '''
-    Sharding spec for single dimension of the sharded tensor decribe the sharding dimension of
+    Sharding spec for single dimension of the sharded tensor describe the sharding dimension of
     logical device mesh and give a method to compute the difference between them.
     This class is used internally in ShardingSpec.
@@ -143,7 +143,7 @@ class ShardingSpec:
     Argument:
         dim_partition_dict(Dict[int, List[int]], optional): The key is the dimension of tensor to be sharded,
-            and the value of the key decribe which logical axis will be sharded in that dimension.
+            and the value of the key describe which logical axis will be sharded in that dimension.
         sharding_sequence(List[DimSpec], optional): A straight view of ShardingSpec looks like [R, R, S0, S1].
     '''
...
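Editor's note: to illustrate the dim_partition_dict format documented above, the key is the tensor dimension being sharded and the value lists the logical mesh axes used to shard it. A sharding_sequence of [R, R, S0, S1] could therefore be written as the dict below; this is a hedged example, not taken from the codebase.

```python
# A 4-D tensor sharded as [R, R, S0, S1]: dims 0 and 1 are replicated,
# dim 2 is sharded on logical mesh axis 0, dim 3 on logical mesh axis 1.
dim_partition_dict = {2: [0], 3: [1]}
```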
@@ -61,7 +61,7 @@ class DistSpecManager:
         Args:
             tensor (torch.Tensor): a global (replicated) tensor before shard
             dist_spec (_DistSpec): the distributed spec. to be sharded as.
-            pg (ProcessGrouo): the process group of the corresponding colotensor
+            pg (ProcessGroup): the process group of the corresponding colotensor
         Returns:
             torch.Tensor: a torch tensor after sharded.
         """
...
@@ -15,7 +15,7 @@ class _DistSpec:
     A class indicates Distributed Specification.
     The DistSpec is only works for the tensor parallel process groups.
     Because the dist spec of data parallel process group can be automatically deduced.
-    This is an internal data structrue.
+    This is an internal data structure.
     The API for users should be `ShardSpec` and `ReplicaSpec`.
     Args:
...
@@ -73,7 +73,7 @@ class ShapeConsistencyManager(metaclass=SingletonMeta):
                               orig_cost_dict: Dict[str, float]) -> Dict[ShardingSpec, float]:
         '''
         Get all valid sharding specs from source_spec with single all-gather operation, and
-        accumulate commucation cost on origin cost which will finally be used in auto sharding solver.
+        accumulate communication cost on origin cost which will finally be used in auto sharding solver.
         For the all-gather operation, we just care about the S dimension.
         Argument:
@@ -145,7 +145,7 @@ class ShapeConsistencyManager(metaclass=SingletonMeta):
                               orig_cost_dict: Dict[str, float]) -> Dict[ShardingSpec, float]:
         '''
         Get all valid sharding specs from source_spec with single all-to-all operation, and
-        accumulate commucation cost on origin cost which will finally be used in auto sharding solver.
+        accumulate communication cost on origin cost which will finally be used in auto sharding solver.
         For the all-to-all operation, we just care about the pairs containing S dimension.
         Argument:
...
@@ -18,7 +18,7 @@ NAN = 'nan'
 class _DimSpec:
     '''
-    Sharding spec for single dimension of the sharded tensor decribe the sharding dimension of
+    Sharding spec for single dimension of the sharded tensor describe the sharding dimension of
     logical device mesh and give a method to compute the difference between them.
     This class is used internally in ShardingSpec.
...
@@ -18,7 +18,7 @@ def all_gather_simulator(target_pair):
     Argument:
         target_pair(Tuple[int, List[int]]): The first element is the dimension of tensor to be sharded,
-        and the second element decribes which logical axis will be sharded in that dimension.
+        and the second element describes which logical axis will be sharded in that dimension.
     '''
     _, shard_list = target_pair
     new_shard_list = shard_list[:-1]
@@ -36,7 +36,7 @@ def all_to_all_simulator(f_target_pair, b_target_pair):
     Therefore, if the behind shard_list is not None, we just extend it to the front shard_list.
     Argument:
         target_pair(Tuple[int, List[int]]): The first element is the dimension of tensor to be sharded,
-        and the second element decribes which logical axis will be sharded in that dimension.
+        and the second element describes which logical axis will be sharded in that dimension.
     e.g.:
         all-to-all(S0, S1) -> [S01, R]
         all-to-all(S0, R) -> [R, S0]
@@ -46,7 +46,7 @@ def all_to_all_simulator(f_target_pair, b_target_pair):
     Argument:
         target_pair(Tuple[int, List[int]]): The first element is the dimension of tensor to be sharded,
-        and the second element decribes which logical axis will be sharded in that dimension.
+        and the second element describes which logical axis will be sharded in that dimension.
     '''
     _, f_shard_list = f_target_pair
     _, b_shard_list = b_target_pair
...
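Editor's note: the simulators above operate purely on shard lists. A small hedged sketch of the all-gather case, grounded in the `shard_list[:-1]` line visible in the diff (the real functions also handle the all-to-all pairs shown in the docstring):

```python
# Sketch of all_gather_simulator's behaviour: gathering along the last sharded
# logical axis drops it from the shard list, e.g. S01 -> S0 and S0 -> R.
from typing import List, Tuple

def simulate_all_gather(target_pair: Tuple[int, List[int]]) -> List[int]:
    _, shard_list = target_pair
    return shard_list[:-1]    # remove the last logical axis from the shard list

print(simulate_all_gather((0, [0, 1])))   # [0]  (S01 -> S0)
print(simulate_all_gather((0, [0])))      # []   (S0  -> R)
```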
@@ -17,10 +17,10 @@ def parameterize(argument: str, values: List[Any]) -> Callable:
     we want to avoid the number of distributed network initialization, we need to have
     this extra decorator on the function launched by torch.multiprocessing.
-    If a function is wrapped with this wrapper, non-paramterized arguments must be keyword arguments,
+    If a function is wrapped with this wrapper, non-parametrized arguments must be keyword arguments,
-    positioanl arguments are not allowed.
+    positional arguments are not allowed.
-    Usgae::
+    Usage::
         # Example 1:
         @parameterize('person', ['xavier', 'davis'])
@@ -33,7 +33,7 @@ def parameterize(argument: str, values: List[Any]) -> Callable:
         # > xavier: hello
         # > davis: hello
-        # Exampel 2:
+        # Example 2:
         @parameterize('person', ['xavier', 'davis'])
         @parameterize('msg', ['hello', 'bye', 'stop'])
         def say_something(person, msg):
@@ -110,7 +110,7 @@ def rerun_on_exception(exception_type: Exception = Exception, pattern: str = Non
         If the pattern is not None and matches the exception message,
         the exception will be detected for rerun
         max_try (int, Optional): Maximum reruns for this function. The default value is 5.
-        If max_try is None, it will rerun foreven if exception keeps occurings
+        If max_try is None, it will rerun forever if exception keeps occurring
     """
     def _match_lines(lines, pattern):
@@ -144,7 +144,7 @@ def rerun_on_exception(exception_type: Exception = Exception, pattern: str = Non
         # Override signature
        # otherwise pytest.mark.parameterize will raise the following error:
-        # function does not use argumetn xxx
+        # function does not use argument xxx
         sig = signature(func)
         _run_until_success.__signature__ = sig
@@ -231,7 +231,7 @@ def spawn(func, nprocs=1, **kwargs):
     This function is used to spawn processes for testing.
     Usage:
-        # must contians arguments rank, world_size, port
+        # must contains arguments rank, world_size, port
         def do_something(rank, world_size, port):
            ...
...
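Editor's note: putting the two docstring examples above together, here is a usage sketch of the decorator. The import path is an assumption (the decorator is defined in Colossal-AI's testing utilities), and the printed output mirrors the docstring, not a verified run.

```python
# Usage sketch based on the docstring examples above; import path is assumed.
from colossalai.testing import parameterize

@parameterize('person', ['xavier', 'davis'])
@parameterize('msg', ['hello', 'bye', 'stop'])
def say_something(person, msg):
    print(f'{person}: {msg}')

# Runs once per (person, msg) combination; as the docstring notes,
# non-parameterized arguments must be passed as keywords.
say_something()
```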
@@ -89,7 +89,7 @@ def load_checkpoint(path: str,
         torch_load_kwargs: (dict, optional): The kwargs of torch.load inside the function
         load_state_dict_kwargs (dict, optional): The kwargs of load_state_dict inside the function
     """
-    # initialize the default paramters
+    # initialize the default parameters
     if not torch_load_kwargs:
         torch_load_kwargs = dict()
     if not load_state_dict_kwargs:
...
@@ -34,7 +34,7 @@ def gather_tensor(colo_tensor: ColoTensor) -> None:
     dist.barrier()
     if dist.get_rank() == 0:
-        setattr(colo_tensor, 'save_ready', True)    # set saving signitrue
+        setattr(colo_tensor, 'save_ready', True)    # set saving signature
 def scatter_tensor(colo_tensor: ColoTensor, dist_spec: _DistSpec) -> None:
...
@@ -38,7 +38,7 @@ def sync_moe_model_param(model: nn.Module):
     param_dict = get_moe_epsize_param_dict(model)
-    # synchrosize the parameters whose dp_group is the whole world
+    # synchronize the parameters whose dp_group is the whole world
     if 1 in param_dict:
         src_rank = gpc.get_ranks_in_group(ParallelMode.DATA)[0]
         for param in param_dict[1]:
...