Unverified commit bce9499e, authored by digger yu, committed by GitHub
Browse files

fix some typo (#5307)

parent ec912b1b
...@@ -69,7 +69,7 @@ class MoEManager(metaclass=SingletonMeta): ...@@ -69,7 +69,7 @@ class MoEManager(metaclass=SingletonMeta):
fixed_dp_size (int, optional): Fixed dp size in fixed mode. Defaults to 0. fixed_dp_size (int, optional): Fixed dp size in fixed mode. Defaults to 0.
fixed_ep_size (int, optional): Fixed ep size in fixed mode. Defaults to 0. fixed_ep_size (int, optional): Fixed ep size in fixed mode. Defaults to 0.
fixed_pp_size (int, optional): Fixed pp size in fixed mode. Defaults to 0. fixed_pp_size (int, optional): Fixed pp size in fixed mode. Defaults to 0.
use_ep_inside (bool, optional): Use ep inside dp if True, dp inside ep if Fasle. Defaults to True. use_ep_inside (bool, optional): Use ep inside dp if True, dp inside ep if False. Defaults to True.
""" """
assert not self.is_initialized, "MoE distributed context shouldn't be set up again" assert not self.is_initialized, "MoE distributed context shouldn't be set up again"
assert torch.cuda.is_available(), "MoE requires to enable CUDA first" assert torch.cuda.is_available(), "MoE requires to enable CUDA first"
......
...@@ -451,7 +451,7 @@ class CommSpec: ...@@ -451,7 +451,7 @@ class CommSpec:
elif self.comm_pattern == CollectiveCommPattern.MIXGATHER_FWD_SPLIT_BWD: elif self.comm_pattern == CollectiveCommPattern.MIXGATHER_FWD_SPLIT_BWD:
res_list.append(f"comm_pattern:MIXGATHER_FWD_SPLIT_BWD, ") res_list.append(f"comm_pattern:MIXGATHER_FWD_SPLIT_BWD, ")
res_list.append(f"gather_dim:{self.gather_dim}, ") res_list.append(f"gather_dim:{self.gather_dim}, ")
res_list.append(f"logical_process_asex:{self.logical_process_axes})") res_list.append(f"logical_process_axes:{self.logical_process_axes})")
return "".join(res_list) return "".join(res_list)
......
...@@ -96,9 +96,9 @@ def _apply_layout(tensor, layout): ...@@ -96,9 +96,9 @@ def _apply_layout(tensor, layout):
""" """
Apply the layout to the local tensor during initializing process. Apply the layout to the local tensor during initializing process.
""" """
# layout converter requires a source and target laytout # layout converter requires a source and target layout
# we construct the source layer for an unsharded tensor # we construct the source layer for an unsharded tensor
# and use self.dist_layer as the targer layout for the sharded tensor # and use self.dist_layer as the target layout for the sharded tensor
source_spec = _construct_default_sharding_spec(tensor) source_spec = _construct_default_sharding_spec(tensor)
source_layout = Layout(device_mesh=layout.device_mesh, sharding_spec=source_spec, global_shape=tensor.shape) source_layout = Layout(device_mesh=layout.device_mesh, sharding_spec=source_spec, global_shape=tensor.shape)
sharded_tensor = layout_converter.apply(tensor=tensor, source_layout=source_layout, target_layout=layout) sharded_tensor = layout_converter.apply(tensor=tensor, source_layout=source_layout, target_layout=layout)
......
...@@ -40,7 +40,7 @@ def get_moe_info(ep_size: int, dp_size: int, pp_size: int, ep_inside: bool) -> M ...@@ -40,7 +40,7 @@ def get_moe_info(ep_size: int, dp_size: int, pp_size: int, ep_inside: bool) -> M
ep_size (int): The expert parallel size. ep_size (int): The expert parallel size.
dp_size (int): The data parallel size. dp_size (int): The data parallel size.
pp_size (int): The pipeline parallel size. pp_size (int): The pipeline parallel size.
ep_inside (bool, optional): Use ep inside dp if True, dp inside ep if Fasle. ep_inside (bool, optional): Use ep inside dp if True, dp inside ep if False.
Returns: Returns:
dict: The moe info of the given tensor. dict: The moe info of the given tensor.
......
...@@ -12,7 +12,7 @@ class MoeParallelInfo: ...@@ -12,7 +12,7 @@ class MoeParallelInfo:
ep_size (int): expert parallel size ep_size (int): expert parallel size
dp_size (int): data parallel (zero) size dp_size (int): data parallel (zero) size
pp_size (int, optional): pipeline parallel size. Defaults to 1. pp_size (int, optional): pipeline parallel size. Defaults to 1.
ep_inside (bool, optional): Use ep inside dp if True, dp inside ep if Fasle. Defaults to True. ep_inside (bool, optional): Use ep inside dp if True, dp inside ep if False. Defaults to True.
""" """
self.pp_size, self.dp_size, self.ep_size = pp_size, dp_size, ep_size self.pp_size, self.dp_size, self.ep_size = pp_size, dp_size, ep_size
if ep_inside: if ep_inside:
......
...@@ -123,7 +123,7 @@ class MultiTimer: ...@@ -123,7 +123,7 @@ class MultiTimer:
return None return None
def get_timer(self, name): def get_timer(self, name):
"""Get timer by its name (from multitimer) """Get timer by its name (from multimer)
Args: Args:
name (str): Timer's key. name (str): Timer's key.
......
...@@ -413,7 +413,7 @@ class GeminiOptimizer(OptimizerWrapper): ...@@ -413,7 +413,7 @@ class GeminiOptimizer(OptimizerWrapper):
only_rank_0(bool): if True, states will be collected only on master rank, otherwise collected on every rank. only_rank_0(bool): if True, states will be collected only on master rank, otherwise collected on every rank.
Returns: Returns:
collected_states(dict): the gathered optimzier state of parameter with given id collected_states(dict): the gathered optimizer state of parameter with given id
if this method is called by master rank, otherwise an empty dict. if this method is called by master rank, otherwise an empty dict.
This method can work only when called by all processes simultaneously. This method can work only when called by all processes simultaneously.
...@@ -461,7 +461,7 @@ class GeminiOptimizer(OptimizerWrapper): ...@@ -461,7 +461,7 @@ class GeminiOptimizer(OptimizerWrapper):
global_shape = self.optimizer_params_info["id2shape"][param_id] global_shape = self.optimizer_params_info["id2shape"][param_id]
# If the chunk is kept gathered, # If the chunk is kept gathered,
# the parameteres are treated the same as that of those in strict DDP during training. # the parameters are treated the same as that of those in strict DDP during training.
# So states can be directly fetched from current device. # So states can be directly fetched from current device.
if chunk.keep_gathered: if chunk.keep_gathered:
assert param_id in self.id_to_fake_params assert param_id in self.id_to_fake_params
...@@ -644,7 +644,7 @@ class GeminiOptimizer(OptimizerWrapper): ...@@ -644,7 +644,7 @@ class GeminiOptimizer(OptimizerWrapper):
""" """
Args: Args:
only_rank_0 (bool): a boolean value indicating whether the state_dict is collected only_rank_0 (bool): a boolean value indicating whether the state_dict is collected
only on rank 0, dafault to True. only on rank 0, default to True.
Returns: Returns:
The complete state of the optimizer as a :class:`dict`. The complete state of the optimizer as a :class:`dict`.
...@@ -783,7 +783,7 @@ class GeminiOptimizer(OptimizerWrapper): ...@@ -783,7 +783,7 @@ class GeminiOptimizer(OptimizerWrapper):
prefix (str, optional): the prefix for states. Default to ''. prefix (str, optional): the prefix for states. Default to ''.
max_shard_size (int, optional): max size of state dict shard (in MB). Defaults to 1024. max_shard_size (int, optional): max size of state dict shard (in MB). Defaults to 1024.
only_rank_0 (bool, optional): a boolean value indicating whether the state_dict is collected only_rank_0 (bool, optional): a boolean value indicating whether the state_dict is collected
only on rank 0, dafault to True. only on rank 0, default to True.
Yields: Yields:
Iterator[OrderedDict]: A generator of state dict shard of optimizer states. Iterator[OrderedDict]: A generator of state dict shard of optimizer states.
......
...@@ -15,7 +15,7 @@ class BucketStore(BaseStore): ...@@ -15,7 +15,7 @@ class BucketStore(BaseStore):
# init # init
self.current_group_id = 0 self.current_group_id = 0
self._num_elements_in_bucket = 0 self._num_elements_in_bucket = 0
# mapping gardient slices and parameter # mapping gradient slices and parameter
self.grad_to_param_mapping = dict() self.grad_to_param_mapping = dict()
self._grad_in_bucket = dict() self._grad_in_bucket = dict()
...@@ -59,7 +59,7 @@ class BucketStore(BaseStore): ...@@ -59,7 +59,7 @@ class BucketStore(BaseStore):
self.offset_list[-1] += 1 self.offset_list[-1] += 1
def build_grad_in_bucket(self): def build_grad_in_bucket(self):
"""Orgnize parameters' gradient(padding and split), follows the paramters' splitting method """Organize parameters' gradient(padding and split), follows the parameters' splitting method
Data structure of self._grad_in_bucket: Data structure of self._grad_in_bucket:
{ {
...@@ -91,7 +91,7 @@ class BucketStore(BaseStore): ...@@ -91,7 +91,7 @@ class BucketStore(BaseStore):
return self._grad_in_bucket return self._grad_in_bucket
def get_flatten_grad(self) -> Tensor: def get_flatten_grad(self) -> Tensor:
"""Return the flattened gradients slices in the bucket, the data orginization of the flattened tensor: """Return the flattened gradients slices in the bucket, the data organization of the flattened tensor:
[grad0_rank0, grad1_rank0, ..., grad_0_rank1, grad1_rank1, ....] [grad0_rank0, grad1_rank0, ..., grad_0_rank1, grad1_rank1, ....]
Returns: Returns:
......
...@@ -9,7 +9,7 @@ class GradientStore(BaseStore): ...@@ -9,7 +9,7 @@ class GradientStore(BaseStore):
def __init__(self, *args, partition_grad: bool = False): def __init__(self, *args, partition_grad: bool = False):
super().__init__(*args) super().__init__(*args)
""" """
self._grads_of_params mapping the paramater and its gradient slices self._grads_of_params mapping the parameter and its gradient slices
data structure: data structure:
{ {
group_id:{ group_id:{
......
...@@ -171,7 +171,7 @@ class LowLevelZeroOptimizer(OptimizerWrapper): ...@@ -171,7 +171,7 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
# managed by this data parallel rank # managed by this data parallel rank
param_group["params"] = master_param_current_rank param_group["params"] = master_param_current_rank
# if there are moe params, store in addtional group in optim # if there are moe params, store in additional group in optim
if len(moe_params) > 0: if len(moe_params) > 0:
param_group = dict() param_group = dict()
for key, value in self.optim.param_groups[0].items(): for key, value in self.optim.param_groups[0].items():
...@@ -180,8 +180,8 @@ class LowLevelZeroOptimizer(OptimizerWrapper): ...@@ -180,8 +180,8 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
param_group["params"] = moe_params param_group["params"] = moe_params
self.optim.param_groups.append(param_group) self.optim.param_groups.append(param_group)
# intialize communication stream for # initialize communication stream for
# communication-compuation overlapping # communication-computation overlapping
if self._overlap_communication: if self._overlap_communication:
self._comm_stream = device_utils.Stream() self._comm_stream = device_utils.Stream()
......
...@@ -32,7 +32,7 @@ Plugin is an important component that manages parallel configuration (eg: The ge ...@@ -32,7 +32,7 @@ Plugin is an important component that manages parallel configuration (eg: The ge
More details about usages of each plugin can be found in chapter [Booster Plugins](./booster_plugins.md). More details about usages of each plugin can be found in chapter [Booster Plugins](./booster_plugins.md).
Some plugins support lazy initialization, which can be used to save memory when initializating large models. For more details, please see [Lazy Initialization](../features/lazy_init.md). Some plugins support lazy initialization, which can be used to save memory when initializing large models. For more details, please see [Lazy Initialization](../features/lazy_init.md).
### API of booster ### API of booster
......
...@@ -49,7 +49,7 @@ You should expect to the log like this. This log shows the edge cost on the comp ...@@ -49,7 +49,7 @@ You should expect to the log like this. This log shows the edge cost on the comp
### Auto-Checkpoint Tutorial ### Auto-Checkpoint Tutorial
We prepare two bechmarks for you to test the performance of auto checkpoint We prepare two benchmarks for you to test the performance of auto checkpoint
The first test `auto_ckpt_solver_test.py` will show you the ability of solver to search checkpoint strategy that could fit in the given budget (test on GPT2 Medium and ResNet 50). It will output the benchmark summary and data visualization of peak memory vs. budget memory and relative step time vs. peak memory. The first test `auto_ckpt_solver_test.py` will show you the ability of solver to search checkpoint strategy that could fit in the given budget (test on GPT2 Medium and ResNet 50). It will output the benchmark summary and data visualization of peak memory vs. budget memory and relative step time vs. peak memory.
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment