Commit 01bcbb1e authored by lim

Initial commit

parent 187361d1
Pipeline #3395 canceled
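
# ==== __init__.py ==== (file paths in the headers below are inferred from the package imports; the diff's original per-file headers were lost)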
from .pipeline_parallel.pipeline_feature import PipelineFeature
from .tensor_parallel.parallel_linear_feature import ParallelLinearFeature
from .optimizer.optimizer_feature import OptimizerFeature
from .communication.gradient_compress_feature import GradientCompressFeature
from .communication.quantize_comm_feature import QuantizeCommFeature
from .memory.swap_attention_feature import SwapAttentionFeature
from .memory.cpu_offload_feature import CPUOffloadFeature
from .recompute.activation_function import RecomputeActivationFeature
ADAPTOR_FEATURES = [
PipelineFeature(),
OptimizerFeature(),
ParallelLinearFeature(),
GradientCompressFeature(),
QuantizeCommFeature(),
SwapAttentionFeature(),
CPUOffloadFeature(),
RecomputeActivationFeature(),
]
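
# Illustrative sketch (not part of this commit): how ADAPTOR_FEATURES is presumably
# consumed. `patch_manager` is assumed to expose the register_patch()/register_cls_funcs()
# interface used by the features below; `apply_adaptor_features` itself is a hypothetical
# helper, not a dcu_megatron API.
def apply_adaptor_features(parser, patch_manager, argv=None):
    for feature in ADAPTOR_FEATURES:
        feature.register_args(parser)           # each feature adds its own CLI group
    args = parser.parse_args(argv)
    for feature in ADAPTOR_FEATURES:
        args = feature.pre_validate_args(args)  # may derive args (e.g. num_layers)
        feature.validate_args(args)             # raises on invalid flag combinations
        feature.post_validate_args(args)
    for feature in ADAPTOR_FEATURES:
        feature.register_patches(patch_manager, args)
    return args

# ==== communication/gradient_compress_feature.py ====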
from argparse import ArgumentParser
from ..feature import AbstractFeature
class GradientCompressFeature(AbstractFeature):
def __init__(self):
super().__init__('enable-dynamic-grad-comp')
def register_args(self, parser: ArgumentParser):
        group = parser.add_argument_group(title=self.feature_name)
group.add_argument('--enable-dynamic-grad-comp',
dest='enable_dynamic_grad_comp',
action='store_true',
help='Enable dynamic gradient compression (e.g., adaptive rank/sparsity based on training phase or gradient statistics).')
        group.add_argument('--grad-comp',
                           dest='grad_comp', action='store_true',
                           help='Use the gradient compression algorithm for data-parallel communication.')
        group.add_argument('--grad-comp-warm-up', type=float, default=0.1,
                           help='PowerSGD warm-up period for accuracy gain.')
        group.add_argument('--rank-adjust-window-size',
                           type=int, default=1000,
                           help='window size (in iterations) used when adjusting the compression rank.')
        group.add_argument('--iteration-sample-ratio',
                           type=float, default=0.01,
                           help='sampling ratio over training iterations.')
        group.add_argument('--gradient-sample-ratio',
                           type=float, default=1.0,
                           help='sampling ratio over gradients.')
        group.add_argument('--collect-log-path', type=str, default='./logs',
                           help='If set, collect per-iteration data such as iteration time and loss.')
def register_patches(self, patch_manager, args):
from dcu_megatron.core.distributed.finalize_model_grads import finalize_model_grads
from dcu_megatron.core.distributed.param_and_grad_buffer import _ParamAndGradBucketGroup, _ParamAndGradBuffer, \
_ParamAndGradBucket
from dcu_megatron.training.training import save_checkpoint_and_time_wrapper
from dcu_megatron.training.training import pretrain
        # Replace the functions involved in EDGC (dynamic gradient compression).
if args.enable_dynamic_grad_comp:
patch_manager.register_patch('megatron.core.distributed.finalize_model_grads.finalize_model_grads',
finalize_model_grads)
patch_manager.register_patch('megatron.core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup',
_ParamAndGradBucketGroup)
patch_manager.register_patch('megatron.core.distributed.param_and_grad_buffer._ParamAndGradBuffer._new_bucket',
_ParamAndGradBuffer._new_bucket)
patch_manager.register_patch('megatron.core.distributed.param_and_grad_buffer._ParamAndGradBucket',
_ParamAndGradBucket)
patch_manager.register_patch('megatron.training.training.save_checkpoint_and_time',
save_checkpoint_and_time_wrapper,
apply_wrapper=True)
patch_manager.register_patch('megatron.training.training.pretrain',
pretrain)
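
# Example invocation (illustration only; flags defined above):
#   ... --enable-dynamic-grad-comp --grad-comp --grad-comp-warm-up 0.1 \
#       --rank-adjust-window-size 1000 --collect-log-path ./logs

# ==== communication/quantize_comm_feature.py ====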
from argparse import ArgumentParser
from ..feature import AbstractFeature
QUANT_BIT_DEFAULT_GROUP_SIZE_MAP = {
4: 32,
8: 128,
}
QUANT_BIT_GROUP_SIZE_CHOICES_MAP = {
4: {16, 32},
8: {64, 128},
}
class QuantizeCommFeature(AbstractFeature):
def __init__(self):
super().__init__('use-quantize-comm', 2)
def register_args(self, parser: ArgumentParser):
group = parser.add_argument_group(title=self.feature_name)
group.add_argument('--use-quantize-comm',
default=False,
action="store_true",
help='use quantized communication')
group.add_argument('--quant-comm-bits', type=int,
default=8,
choices=[4, 8],
help='the number of bits to quantize to, supported numbers are (4, 8)')
group.add_argument('--quant-group-size', type=int,
default=None,
help='the group size to use for quantization. If not specified, uses per-column quantization')
group.add_argument('--quant-scale-dtype', type=str,
default="bf16",
choices=["bf16", "fp16", "fp32"],
help='the dtype of quantization scale')
def validate_args(self, args):
assert args.quant_comm_bits in {4, 8}, f"quant_comm_bits {args.quant_comm_bits} only accepts values from [4, 8]"
if (
args.quant_group_size is not None
and args.quant_group_size not in QUANT_BIT_GROUP_SIZE_CHOICES_MAP[args.quant_comm_bits]
):
raise ValueError(f"quant_group_size {args.quant_group_size} only accepts values from {QUANT_BIT_GROUP_SIZE_CHOICES_MAP[args.quant_comm_bits]}")
def register_patches(self, patch_manager, args):
from dcu_megatron.core.tensor_parallel.mappings import all_to_all
if args.use_quantize_comm:
patch_manager.register_patch('megatron.core.tensor_parallel.mappings.all_to_all',
all_to_all)
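
# Example invocation (illustration only): 4-bit quantized communication must use a
# group size from {16, 32} per QUANT_BIT_GROUP_SIZE_CHOICES_MAP above.
#   ... --use-quantize-comm --quant-comm-bits 4 --quant-group-size 32 --quant-scale-dtype fp32

# ==== feature.py ====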
import argparse
class AbstractFeature:
def __init__(self, feature_name: str, optimization_level: int = 2):
self.feature_name = feature_name.strip().replace('-', '_')
self.optimization_level = optimization_level
self.default_patches = self.optimization_level == 0
def register_args(self, parser):
pass
def pre_validate_args(self, args):
return args
def validate_args(self, args):
pass
def post_validate_args(self, args):
pass
def register_patches(self, patch_manager, args):
...
def incompatible_check(self, global_args, check_args):
if getattr(global_args, self.feature_name, None) and getattr(global_args, check_args, None):
raise AssertionError('{} and {} are incompatible.'.format(self.feature_name, check_args))
def dependency_check(self, global_args, check_args):
if getattr(global_args, self.feature_name, None) and not getattr(global_args, check_args, None):
raise AssertionError('{} requires {}.'.format(self.feature_name, check_args))
@staticmethod
def add_parser_argument_choices_value(parser, argument_name, new_choice):
for action in parser._actions:
exist_arg = isinstance(action, argparse.Action) and argument_name in action.option_strings
if exist_arg and action.choices is not None and new_choice not in action.choices:
action.choices.append(new_choice)
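
# Example (illustration only): extending an existing argparse choice list in place so a
# feature can accept a new value without redefining the argument. The flag name and
# choices here are made up; note the helper assumes `choices` was given as a list.
#
#   parser = argparse.ArgumentParser()
#   parser.add_argument('--impl', choices=['local', 'transformer_engine'])
#   AbstractFeature.add_parser_argument_choices_value(parser, '--impl', 'custom')
#   parser.parse_args(['--impl', 'custom'])  # now accepted

# ==== memory/cpu_offload_feature.py ====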
import os
from argparse import ArgumentParser
from ..feature import AbstractFeature
class CPUOffloadFeature(AbstractFeature):
def __init__(self):
super().__init__('fine-grained-activation-offloading', 2)
def register_args(self, parser: ArgumentParser):
group = parser.add_argument_group(title=self.feature_name)
group.add_argument('--fine-grained-activation-offloading', action='store_true',
help='Offload the activation to CPU')
group.add_argument('--offload-modules', nargs='*', type=str, default=None,
help='The submodules to offload. '
'choices: "attn_norm", "qkv_linear", "core_attn", "attn_proj", "mlp_norm", "expert_fc1", "expert_fc2", '
' "shared_fc1", "shared_fc2", "moe_act".'
'default: ["core_attn"].'
'"attn_norm": offload the input of the normalization in the attention part. '
'"qkv_linear": offload the qkv_linear part of the transformer layer. '
'"core_attn": offload the core attention part of the transformer layer. '
'"attn_proj": offload the input of the attn linear projection part. '
'"mlp_norm": offload the input of the normalization in the mlp part. '
'"expert_fc1": offload the input of the expert fc1 part. '
'"expert_fc2": offload the input of the expert fc2 part. '
'"shared_fc1": offload the shared_fc1 part of the transformer layer. '
'"shared_fc2": offload the shared_fc2 part of the transformer layer. '
'"moe_act": offload the activation function part of the moe layer.')
group.add_argument('--min-offloaded-tensor-size', type=int, default=1024*1024,
help='The minimum size of the tensor to be offloaded.')
def register_patches(self, patch_manager, args):
from dcu_megatron.core.models.gpt.gpt_model import GPTModel
from dcu_megatron.core.transformer.attention import Attention
from dcu_megatron.core.transformer.multi_latent_attention import MultiLatentAttention
from dcu_megatron.core.transformer.moe.experts import TEGroupedMLP
from dcu_megatron.core.transformer.mlp import MLP
from dcu_megatron.core.transformer.transformer_layer import TransformerLayer
from dcu_megatron.core.transformer.transformer_block import TransformerBlock
from dcu_megatron.core.extensions.transformer_engine import te_module_init_wrapper
from dcu_megatron.core.pipeline_parallel.schedules import forward_backward_pipelining_wrapper
from dcu_megatron.core.transformer.multi_token_prediction import MultiTokenPredictionBlock
from dcu_megatron.core.tensor_parallel.random import CheckpointWithoutOutput
from dcu_megatron.core.models.gpt.fine_grained_callables import build_layer_callables_without_split_attn
patch_manager.register_patch('megatron.core.models.gpt.gpt_model.GPTModel.preprocess_for_fine_grained_offloading',
GPTModel.preprocess_for_fine_grained_offloading,
create_dummy=True)
patch_manager.register_patch('megatron.core.models.gpt.gpt_model.GPTModel.__init__',
GPTModel.__init__)
patch_manager.register_patch('megatron.core.models.gpt.gpt_model.GPTModel.build_schedule_plan',
GPTModel.build_schedule_plan)
patch_manager.register_patch('megatron.core.transformer.attention.Attention.forward',
Attention.forward)
patch_manager.register_patch('megatron.core.transformer.multi_latent_attention.MultiLatentAttention.forward',
MultiLatentAttention.forward)
patch_manager.register_patch('megatron.core.transformer.moe.experts.TEGroupedMLP.forward',
TEGroupedMLP.forward)
patch_manager.register_patch('megatron.core.pipeline_parallel.schedules.forward_backward_pipelining_with_interleaving',
forward_backward_pipelining_wrapper,
apply_wrapper=True)
patch_manager.register_patch('megatron.core.transformer.mlp.MLP.forward',
MLP.forward)
patch_manager.register_patch('megatron.core.pipeline_parallel.schedules.forward_backward_pipelining_without_interleaving',
forward_backward_pipelining_wrapper,
apply_wrapper=True)
patch_manager.register_patch('megatron.core.transformer.transformer_layer.TransformerLayer._forward_attention',
TransformerLayer._forward_attention)
patch_manager.register_patch('megatron.core.transformer.transformer_block.TransformerBlock.forward',
TransformerBlock.forward)
patch_manager.register_patch('megatron.core.transformer.multi_token_prediction.MultiTokenPredictionBlock.forward',
MultiTokenPredictionBlock.forward)
patch_manager.register_cls_funcs('megatron.core.tensor_parallel.random.CheckpointWithoutOutput',
[CheckpointWithoutOutput.checkpoint,
CheckpointWithoutOutput._recompute],
create_dummy=True)
patch_manager.register_patch('megatron.core.models.gpt.fine_grained_callables.build_layer_callables',
build_layer_callables_without_split_attn)
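
# Example invocation (illustration only; module names from the --offload-modules help above):
#   ... --fine-grained-activation-offloading --offload-modules core_attn expert_fc1 \
#       --min-offloaded-tensor-size 1048576

# ==== memory/swap_attention_feature.py ====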
from argparse import ArgumentParser
from ..feature import AbstractFeature
from megatron.core.utils import is_te_min_version
class SwapAttentionFeature(AbstractFeature):
def __init__(self):
super().__init__('swap-attention', 2)
def register_args(self, parser: ArgumentParser):
group = parser.add_argument_group(title=self.feature_name)
group.add_argument('--swap-attention', action='store_true', default=False,
                           help='Enable the swap-attention feature. Defaults to False.')
        # possible values: input_layernorm, self_attention, post_attention_norm
group.add_argument('--swap-modules', type=str, default="self_attention",
help='Swap modules for model. Can be used together with "--swap-attention."')
group.add_argument('--specify-layers', type=str, default=None,
                           help='Comma-separated indices of the layers to swap, e.g. "0, 2, 4, 6". '
                                'Can be used together with "--swap-attention".')
group.add_argument('--reduce-recompute-for-last-chunk', action='store_true', default=False,
                           help='Reduce recomputation for the last pipeline chunk when used with recompute. Defaults to False.')
def validate_args(self, args):
        # argparse stores these dests with underscores, so hyphenated names would never match
        adaptive_recompute_device_size = getattr(args, 'adaptive_recompute_device_size', -1)
        adaptive_recompute_device_swap = getattr(args, 'adaptive_recompute_device_swap', False)
if (adaptive_recompute_device_size > 0 or adaptive_recompute_device_swap) and args.swap_attention:
raise AssertionError('adaptive selective recompute is not compatible with swap_attention feature')
self.incompatible_check(args, 'adaptive_memory_optimization')
is_enable_lora = hasattr(args, "lora_target_modules") and args.lora_target_modules
if is_enable_lora:
raise AssertionError('swap attention is not compatible with LoRA')
def register_patches(self, patch_manager, args):
if getattr(args, self.feature_name, None):
if hasattr(args, "use_mcore_models") and args.use_mcore_models:
if not is_te_min_version("2.5.0") and hasattr(args, "overlap_grad_reduce") and args.overlap_grad_reduce:
raise AssertionError("With overlap_grad_reduce must have at least transformer-engine version of 2.5.0")
from dcu_megatron.core.memory.swap_attention.adaptor_swap_atten import allowed_recomputing_swap_module_wrapper
from megatron.legacy.model.transformer import ParallelTransformerLayer
from megatron.core.transformer.transformer_layer import TransformerLayer
if hasattr(args, "use_legacy_models") and not args.use_legacy_models:
allowed_recomputing_swap_module_wrapper(TransformerLayer)
else:
allowed_recomputing_swap_module_wrapper(ParallelTransformerLayer)
from dcu_megatron.core.memory.swap_attention.adaptor_swap_atten import setup_model_and_optimizer_wrapper
patch_manager.register_patch('megatron.training.training.setup_model_and_optimizer', setup_model_and_optimizer_wrapper)
from dcu_megatron.core.memory.common import linear_forward_main_grad_wrapper, linear_backward_main_grad_wrapper
patch_manager.register_patch('megatron.core.tensor_parallel.layers.LinearWithGradAccumulationAndAsyncCommunication.forward',
linear_forward_main_grad_wrapper)
patch_manager.register_patch('megatron.core.tensor_parallel.layers.LinearWithGradAccumulationAndAsyncCommunication.backward',
linear_backward_main_grad_wrapper)
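
# Example invocation (illustration only):
#   ... --swap-attention --swap-modules self_attention --specify-layers "0, 2, 4, 6"

# ==== optimizer/optimizer_feature.py ====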
from argparse import ArgumentParser
from megatron.core.utils import is_te_min_version
from ..feature import AbstractFeature
class OptimizerFeature(AbstractFeature):
def __init__(self):
super().__init__('use-optimizer-feature')
def register_args(self, parser: ArgumentParser):
group = parser.add_argument_group(title=self.feature_name)
group.add_argument('--use-optimizer-feature', action='store_true',
help='whether to use optimizer related feature.')
group.add_argument('--reuse-fp32-param', action='store_true',
help='The distributed training optimizer frees up '
'param copies of FP32 to save memory.')
def validate_args(self, args):
if args.reuse_fp32_param and not args.bf16:
            raise AssertionError('--reuse-fp32-param is only supported with `--bf16`')
def register_patches(self, patch_manager, args):
if args.reuse_fp32_param:
from dcu_megatron.core.memory.reuse_param.adaptor import (
step_with_ready_grads,
prepare_grads,
reuse_fp32_param_init_wrapper,
optimizer_config_init_wrapper
)
from dcu_megatron.core.memory.reuse_param.adaptor import reuse_fp32_param_distrib_optimizer_init_wrapper
from dcu_megatron.core.memory.reuse_param.adaptor import reuse_fp32_param_param_and_grad_buffer_init_wrapper
patch_manager.register_patch('megatron.core.optimizer.optimizer.MixedPrecisionOptimizer.prepare_grads',
prepare_grads)
patch_manager.register_patch('megatron.core.optimizer.optimizer.MixedPrecisionOptimizer.step_with_ready_grads',
step_with_ready_grads)
patch_manager.register_patch('megatron.core.optimizer.optimizer.Float16OptimizerWithFloat16Params.__init__',
reuse_fp32_param_init_wrapper)
patch_manager.register_patch('megatron.core.optimizer.optimizer_config.OptimizerConfig.__init__',
optimizer_config_init_wrapper)
patch_manager.register_patch('megatron.core.optimizer.distrib_optimizer.DistributedOptimizer.__init__',
reuse_fp32_param_distrib_optimizer_init_wrapper)
patch_manager.register_patch('megatron.core.distributed.param_and_grad_buffer._ParamAndGradBuffer.__init__',
reuse_fp32_param_param_and_grad_buffer_init_wrapper)
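
# Example invocation (illustration only; validate_args above requires --bf16):
#   ... --bf16 --use-optimizer-feature --reuse-fp32-param

# ==== pipeline_parallel/pipeline_feature.py ====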
import os
import re
from argparse import ArgumentParser
from megatron.core import parallel_state
from megatron.core.utils import is_te_min_version, is_torch_min_version
from ..feature import AbstractFeature
def _eval_pattern(pattern):
""" Validate and evaluate a string containing a Python list expression """
assert isinstance(pattern, str)
# validate input, only allow comma, digits, [, ], (, ), +, and *
if bool(re.compile(r'[^,\d\[\]\(\)\+\*]').search(pattern)):
raise ValueError(f"Invalid pattern: {pattern}")
return eval(pattern)
def num_layers_build_type(x):
"""number of layers to build.
Accepts either:
    - An integer N: build N layers for each model block
- A string "N": Same as above, but provided as a string
- A string containing a Python list expression that defines a custom pattern, e.g.:
"([1]*3+[2]*1)*3" evaluates to [1,1,1,2,1,1,1,2,1,1,1,2]
The pattern length must match the total number of transformer blocks.
"""
if isinstance(x, int):
return x
assert isinstance(x, str)
if '[' in x:
# it's a custom pattern
return _eval_pattern(x)
else:
# it's a single int but in str
return int(x)
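
# Illustration (not part of this commit): expected behaviour of the parser above.
#
#   num_layers_build_type(4)                    == 4
#   num_layers_build_type("4")                  == 4
#   num_layers_build_type("([1]*3+[2]*1)*3")    == [1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2]
#   num_layers_build_type("[__import__('os')]") -> ValueError (fails the character whitelist in _eval_pattern)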
class PipelineFeature(AbstractFeature):
def __init__(self):
super().__init__('schedule-method')
def register_args(self, parser: ArgumentParser):
group = parser.add_argument_group(title=self.feature_name)
group.add_argument('--schedule-method', type=str,
default='vanilla',
choices=['vanilla', 'dualpipev', 'seq1f1b', 'interleaved_seq1f1b'],
                           help='Pipeline schedule to use; "vanilla" uses the schedule provided by Megatron.')
# MoE communication overlap arguments
group.add_argument('--overlap-ep-comm-with-split-attn', action="store_true",
default=False,
                           help='Overlap expert-parallel (EP) communication with attention by splitting the attention computation.')
group.add_argument('--num-layers-to-build',
type=num_layers_build_type,
default=None,
                           help='number of layers to build: either an integer N, meaning N layers '
                                'per model chunk, or a string containing a Python list expression '
                                'that defines a custom pattern')
# Vocabulary parallelism.
group.add_argument('--enable-vocab-parallel', action='store_true',
help='Enables vocabulary parallelism at the vocabulary layers. '
'Must be enabled together with pipeline model parallelism.')
        group.add_argument('--disable-backward-fusion', action='store_true',
                           help='Disables the forward-backward fusion for the output layer. '
                                'Requires two communication barriers instead of one.')
group.add_argument('--schedule-timer-start', type=int, default=10,
help='Start iteration of the vocabulary parallelism schedule timer')
group.add_argument('--schedule-timer-end', type=int, default=20,
help='End iteration of the vocabulary parallelism schedule timer')
    def pre_validate_args(self, args):
        if args.schedule_method != "dualpipev":
            return args
        if args.num_layers is None and args.num_layers_to_build is not None:
            # dualpipev splits each pipeline rank into two model chunks,
            # so there are 2 * pipeline_model_parallel_size chunks in total.
            num_chunks = args.pipeline_model_parallel_size * 2
            if isinstance(args.num_layers_to_build, int):
                args.num_layers = args.num_layers_to_build * num_chunks
            else:
                assert len(args.num_layers_to_build) == num_chunks, \
                    "The pattern length must match the total number of transformer blocks"
                args.num_layers = sum(args.num_layers_to_build)
        return args
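
    # Worked example (illustration only): with --pipeline-model-parallel-size 4 and
    # --num-layers-to-build 3, dualpipev builds 2 * 4 = 8 model chunks of 3 layers
    # each, so num_layers is derived as 24.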
    def validate_args(self, args):
        if args.schedule_method == "dualpipev":
            if args.delay_wgrad_compute and args.overlap_grad_reduce:
                assert bool(int(os.getenv("NVTE_OVERLAP_GRAD_REDUCE", "0"))), \
                    "NVTE_OVERLAP_GRAD_REDUCE should be set to 1 when --delay-wgrad-compute and --overlap-grad-reduce are set"
            if args.num_layers_per_virtual_pipeline_stage is not None or args.num_virtual_stages_per_pipeline_rank is not None:
                raise AssertionError("dualpipev and virtual pipeline are incompatible.")
layers_to_distribute = args.num_layers
pipeline_stages_left = args.pipeline_model_parallel_size * 2
if args.num_layers_to_build is not None:
assert args.decoder_first_pipeline_num_layers is None and args.decoder_last_pipeline_num_layers is None, \
"--decoder-first-pipeline-num-layers and --decoder-last-pipeline-num-layers should NOT be set when using --num-layers-to-build"
if isinstance(args.num_layers_to_build, int):
assert args.num_layers_to_build * pipeline_stages_left == layers_to_distribute, "num-layers-to-build mismatch with num-layers"
else:
assert len(args.num_layers_to_build) == pipeline_stages_left, "The pattern length must match the total number of transformer blocks"
assert sum(args.num_layers_to_build) == args.num_layers
            # either override reduces the layers left to distribute evenly
            if args.decoder_first_pipeline_num_layers is not None or args.decoder_last_pipeline_num_layers is not None:
if args.decoder_first_pipeline_num_layers is not None:
layers_to_distribute -= args.decoder_first_pipeline_num_layers
pipeline_stages_left -= 1
if args.decoder_last_pipeline_num_layers is not None:
layers_to_distribute -= args.decoder_last_pipeline_num_layers
pipeline_stages_left -= 1
if layers_to_distribute < pipeline_stages_left:
                raise AssertionError(
                    'number of layers must be at least 2 * pipeline_model_parallel_size in dualpipev')
num_micro_batch = args.global_batch_size // args.micro_batch_size // args.data_parallel_size
if num_micro_batch < args.pipeline_model_parallel_size:
raise AssertionError(
"num_micro_batch should NOT be smaller than pipeline_model_parallel_size")
            if not args.delay_wgrad_compute:
                raise AssertionError("dualpipev requires --delay-wgrad-compute")
            if not is_te_min_version("2.4.0"):
                raise AssertionError("dualpipev requires transformer-engine >= 2.4.0")
if args.overlap_moe_expert_parallel_comm:
assert args.transformer_impl == "transformer_engine", \
"moe a2a overlap is only supported with transformer_engine implementation"
assert args.schedule_method == "dualpipev" or args.num_layers_per_virtual_pipeline_stage is not None or args.num_virtual_stages_per_pipeline_rank is not None, \
'moe a2a overlap is only supported with vpp or dualpipev'
# Vocabulary parallelism.
if args.enable_vocab_parallel:
assert args.pipeline_model_parallel_size > 1, 'pipeline parallel size '\
'must be > 1 when vocab parallel is enabled'
            assert args.virtual_pipeline_model_parallel_size is None, 'vocab parallel ' \
                'with interleaved schedule is not supported yet'
assert (
args.make_vocab_size_divisible_by %
(args.tensor_model_parallel_size * args.pipeline_model_parallel_size) == 0
), f'vocab size must be divisible by model parallel size ({args.tensor_model_parallel_size * args.pipeline_model_parallel_size}) for vocab parallel'
            assert args.untie_embeddings_and_output_weights, '--enable-vocab-parallel requires ' \
                '--untie-embeddings-and-output-weights'
else:
args.disable_backward_fusion = False
def register_patches(self, patch_manager, args):
from dcu_megatron.core.pipeline_parallel.schedules import get_forward_backward_func_wrapper
patch_manager.register_patch('megatron.core.pipeline_parallel.schedules.get_forward_backward_func',
get_forward_backward_func_wrapper,
apply_wrapper=True)
if args.schedule_method == "dualpipev":
from megatron.training.utils import print_rank_0
from dcu_megatron.core.pipeline_parallel.dualpipev.dualpipev_chunks import (
dualpipev_fp16forward,
get_num_layers_to_build,
_allreduce_embedding_grads_wrapper
)
from dcu_megatron.training.training import evaluate
from dcu_megatron.core.transformer.transformer_layer import get_transformer_layer_offset
from dcu_megatron.training.training import pretrain
from dcu_megatron.core.models.gpt.gpt_model import GPTModel
from dcu_megatron.training.global_vars import _set_tensorboard_writer, _set_wandb_writer, _set_one_logger
from dcu_megatron.core.models.common.language_module.language_module import LanguageModule
from dcu_megatron.core.transformer.multi_token_prediction import get_mtp_num_layers_to_build
from dcu_megatron.core.tensor_parallel.layers import VocabParallelEmbedding
from dcu_megatron.core.transformer.multi_token_prediction import tie_word_embeddings_state_dict_wrapper
from dcu_megatron.core.pipeline_parallel.schedules import forward_step_calc_loss
from dcu_megatron.core.distributed.distributed_data_parallel import DistributedDataParallel
patch_manager.register_patch(
'megatron.core.transformer.module.Float16Module.forward', dualpipev_fp16forward)
patch_manager.register_patch(
'megatron.core.transformer.transformer_block.get_num_layers_to_build', get_num_layers_to_build)
patch_manager.register_patch(
'megatron.training.utils.print_rank_last', print_rank_0)
patch_manager.register_patch(
'megatron.core.distributed.finalize_model_grads._allreduce_embedding_grads', _allreduce_embedding_grads_wrapper)
            # dualpipev reports from the first rank instead of the last
patch_manager.register_patch('megatron.training.training.evaluate', evaluate)
patch_manager.register_patch(
'megatron.core.transformer.transformer_layer.get_transformer_layer_offset', get_transformer_layer_offset)
# support dualpipev, two data iterators
patch_manager.register_patch('megatron.training.training.pretrain', pretrain)
# (1) introduce an attribute dualpipev_first_chunk. (2) remove embedding when using dualpipev
patch_manager.register_patch(
'megatron.core.models.gpt.gpt_model.GPTModel.__init__',
GPTModel.__init__)
patch_manager.register_patch(
'megatron.core.models.gpt.gpt_model.GPTModel.shared_embedding_or_output_weight',
GPTModel.shared_embedding_or_output_weight)
# set _GLOBAL_TENSORBOARD_WRITER, _GLOBAL_WANDB_WRITER, _GLOBAL_ONE_LOGGER
patch_manager.register_patch('megatron.training.global_vars._set_tensorboard_writer', _set_tensorboard_writer)
patch_manager.register_patch('megatron.training.global_vars._set_wandb_writer', _set_wandb_writer)
patch_manager.register_patch('megatron.training.global_vars._set_one_logger', _set_one_logger)
# support mtp
patch_manager.register_patch('megatron.core.models.common.language_module.language_module.LanguageModule.setup_embeddings_and_output_layer',
LanguageModule.setup_embeddings_and_output_layer)
patch_manager.register_patch('megatron.core.transformer.multi_token_prediction.get_mtp_num_layers_to_build',
get_mtp_num_layers_to_build)
patch_manager.register_patch('megatron.core.tensor_parallel.layers.VocabParallelEmbedding.__init__',
VocabParallelEmbedding.__init__)
patch_manager.register_patch('megatron.core.transformer.multi_token_prediction.tie_word_embeddings_state_dict',
tie_word_embeddings_state_dict_wrapper,
apply_wrapper=True)
patch_manager.register_patch('megatron.core.pipeline_parallel.schedules.forward_step_calc_loss',
forward_step_calc_loss)
patch_manager.register_patch('megatron.core.distributed.distributed_data_parallel.DistributedDataParallel._make_backward_post_hook',
DistributedDataParallel._make_backward_post_hook)
if args.enable_vocab_parallel:
from dcu_megatron.core.parallel_state import destroy_model_parallel_wrapper
from dcu_megatron.core.pipeline_parallel.p2p_communication import P2PCommunicator
from dcu_megatron.core.transformer.module import Float16Module
patch_manager.register_patch('megatron.core.parallel_state.destroy_model_parallel',
destroy_model_parallel_wrapper,
create_dummy=True)
patch_manager.register_cls_funcs('megatron.core.pipeline_parallel.p2p_communication.P2PCommunicator',
[P2PCommunicator._communicate,
P2PCommunicator.recv_forward,
P2PCommunicator.send_backward_recv_forward])
# embedding/output layer
patch_manager.register_cls_funcs('megatron.core.transformer.module.Float16Module',
[Float16Module.__init__,
Float16Module.forward])
from dcu_megatron.core.transformer.transformer_layer import TransformerLayer
from dcu_megatron.core.transformer.transformer_block import TransformerBlock
from dcu_megatron.core.models.gpt.gpt_model import GPTModel
from dcu_megatron.core.transformer.multi_latent_attention import MLASelfAttention
from dcu_megatron.core.transformer.attention import Attention
from dcu_megatron.core.transformer.moe.moe_layer import MoELayer
from dcu_megatron.core.distributed.data_parallel_base import _BaseDataParallel
from dcu_megatron.core.transformer.module import Float16Module
from dcu_megatron.core.transformer.multi_token_prediction import MultiTokenPredictionLayer, MultiTokenPredictionBlock
from dcu_megatron.core.pipeline_parallel.utils import ScheduleNode
patch_manager.register_patch('megatron.core.transformer.transformer_layer.TransformerLayer.backward_dw',
TransformerLayer.backward_dw,
create_dummy=True)
if args.schedule_method == "dualpipev" or args.overlap_ep_comm_with_split_attn:
patch_manager.register_patch('megatron.core.models.gpt.gpt_model.GPTModel.build_schedule_plan',
GPTModel.build_schedule_plan)
patch_manager.register_patch('megatron.core.models.gpt.gpt_model.GPTModel.backward_dw',
GPTModel.backward_dw,
create_dummy=True)
patch_manager.register_patch('megatron.core.distributed.data_parallel_base._BaseDataParallel.backward_dw',
_BaseDataParallel.backward_dw,
create_dummy=True)
patch_manager.register_patch('megatron.core.transformer.module.Float16Module.backward_dw',
Float16Module.backward_dw,
create_dummy=True)
patch_manager.register_cls_funcs('megatron.core.transformer.multi_latent_attention.MLASelfAttention',
[MLASelfAttention.compute_qkv,
MLASelfAttention.compute_attn,
MLASelfAttention.compute_proj,],
create_dummy=True)
patch_manager.register_cls_funcs('megatron.core.transformer.attention.Attention',
[Attention.compute_qkv,
Attention.compute_attn,
Attention.compute_proj,],
create_dummy=True)
patch_manager.register_patch('megatron.core.transformer.transformer_block.TransformerBlock.backward_dw',
TransformerBlock.backward_dw,
create_dummy=True)
patch_manager.register_cls_funcs('megatron.core.transformer.moe.moe_layer.MoELayer',
[MoELayer.backward_dw,
MoELayer.backward_shared_expert_dw,
MoELayer.backward_routed_expert_dw,],
create_dummy=True)
patch_manager.register_patch('megatron.core.transformer.multi_token_prediction.MultiTokenPredictionLayer.backward_dw',
MultiTokenPredictionLayer.backward_dw,
create_dummy=True)
patch_manager.register_patch('megatron.core.transformer.multi_token_prediction.MultiTokenPredictionBlock.backward_dw',
MultiTokenPredictionBlock.backward_dw,
create_dummy=True)
patch_manager.register_cls_funcs('megatron.core.pipeline_parallel.utils.ScheduleNode',
[ScheduleNode.forward,
ScheduleNode._forward,
ScheduleNode.backward,
ScheduleNode._backward,])
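
# Example invocation (illustration only; validate_args above requires --delay-wgrad-compute
# and rejects virtual pipeline stages with dualpipev):
#   ... --pipeline-model-parallel-size 6 --schedule-method dualpipev --delay-wgrad-compute \
#       --num-layers-to-build "([1]*3+[2]*1)*3"   # pattern length 12 == 2 * pp_size chunks

# ==== recompute/activation_function.py ====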
from ..feature import AbstractFeature
class RecomputeActivationFeature(AbstractFeature):
def __init__(self):
super().__init__('recompute-activation-function')
def register_args(self, parser):
group = parser.add_argument_group(title=self.feature_name)
group.add_argument('--recompute-activation-function', action='store_true',
help='Recompute the activation function in MLP layers.')
group.add_argument('--recompute-activation-function-num-layers', type=int, default=None,
                           help='Number of layers in which the activation function is recomputed. '
                                'Can be used together with "--recompute-method block" and "--recompute-num-layers".')
def validate_args(self, args):
if args.recompute_activation_function_num_layers is not None:
if not isinstance(args.recompute_activation_function_num_layers, int):
raise TypeError('--recompute-activation-function-num-layers must be an integer.')
if args.recompute_activation_function_num_layers < 0:
raise AssertionError('--recompute-activation-function-num-layers cannot be less than 0.')
if args.recompute_activation_function_num_layers > args.num_layers:
raise ValueError(f'--recompute-activation-function-num-layers ({args.recompute_activation_function_num_layers}) '
f'cannot be greater than --num-layers ({args.num_layers}).')
def register_patches(self, patch_manager, args):
from dcu_megatron.core.memory.recompute.activation.adaptor import dcu_activation_recompute_forward
from dcu_megatron.core.transformer.transformer import parallel_transformer_layer_init_wrapper
if getattr(args, self.feature_name, None):
patch_manager.register_patch('megatron.core.transformer.transformer_layer.TransformerLayer.__init__',
parallel_transformer_layer_init_wrapper)
patch_manager.register_patch('megatron.core.transformer.mlp.MLP.forward', dcu_activation_recompute_forward)
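
# Example invocation (illustration only):
#   ... --recompute-activation-function --recompute-activation-function-num-layers 4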