"""If int, rank where encoder and decoder should be split in cases where the model has both an
encoder and decoder (e.g., T5). Ignored if None.
"""
###################
# CPU Offloading
###################
cpu_offloading: bool = False
"""When set to True, all the activations are offloaded to the CPU asynchronously."""

cpu_offloading_num_layers: int = 0
"""Tells the number of transformer layers for which activations have to be offloaded."""

# TODO: Need to move to the 'right' place when possible.
_cpu_offloading_context: ContextManager = None
"""For internal use only, do not set."""

cpu_offloading_activations: bool = True
"""If True, offloads the activations to CPU."""

cpu_offloading_weights: bool = True
"""If True, offloads the weights to CPU."""
###################
# Timing
###################
barrier_with_L1_time: bool = True
"""If true, use barrier with level 1 time measurements. It is up to the user to make sure
calling barrier with their timers will not result in hangs. This can happen if for example
the user adds a level 1 timer that is not called by all ranks.
"""
def__post_init__(self):
""" Python dataclass method that is used to modify attributes after initialization.
See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more details.
"""
ifself.sequence_parallel:
ifself.tensor_model_parallel_size<=1:
raiseValueError("Can not use sequence paralllelism without tensor parallelism")
ifself.async_tensor_model_parallel_allreduce:
# sequence_parallelism already does this async
self.async_tensor_model_parallel_allreduce=False
ifself.pipeline_model_parallel_size>1:
ifself.pipeline_dtypeisNone:
raiseValueError(
"When using pipeline parallelism, pipeline_dtype must be specified"