Commit 61968c79 authored by dongcl's avatar dongcl

Merge branch 'megatron_v0.11.0' into 'main'

Megatron v0.11.0

See merge request OpenDAS/dcu_megatron!2
parents be9a69d7 e45e6793
[submodule "Megatron-LM"]
path = Megatron-LM
url = https://github.com/NVIDIA/Megatron-LM.git
Subproject commit aa6207e2da6bca789375f591c9124aa408d2509a
......@@ -67,9 +67,29 @@ def unpermute(
```
### Usage
To use this project, install megatron, or place Megatron-LM at the same directory level as dcu_megatron:
project/
├── dcu_megatron
├── megatron
└── pretrain_gpt.py
When running, go to the examples directory, which contains launch scripts for the related models; please download the required datasets yourself from https://r0ddbu55vzx.feishu.cn/drive/folder/ZxHHfCoX4lg75td2hTqcmiAin3g (a minimal launch example follows the listing below):
```
examples
├── gpt3
│ ├── hostfile_gpt_567B
│ ├── README.md
│ ├── run_gpt_567B_1nodes.sh
│ ├── run_gpt_567B_multinodes.sh
│ ├── topo-input.xml
│ ├── train_gpt_567B_1nodes.sh
│ └── train_gpt_567B_multinodes.sh
└── mixtral
├── hostfile_mixtral_8x22B
├── hostfile_mixtral_8x7B
├── README.md
├── run_mixtral_8x22B_1nodes.sh
├── run_mixtral_8x22B_multinodes.sh
├── run_mixtral_8x7B_1nodes.sh
├── run_mixtral_8x7B_multinodes.sh
├── topo-input.xml
├── train_mixtral_8x22B_1nodes.sh
├── train_mixtral_8x22B_multinodes.sh
├── train_mixtral_8x7B_1nodes.sh
└── train_mixtral_8x7B_multinodes.sh
```
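As a minimal sketch of launching a single-node GPT run (assuming the dataset has been downloaded and the checkpoint/tokenizer/data paths inside the train script have been filled in), the run scripts accept an optional `--profiling` flag that is forwarded to the train script:

```bash
cd examples/gpt3
# Fill in CHECKPOINT_PATH / TOKENIZER_MODEL / DATA_PATH in train_gpt_567B_1nodes.sh first.
bash run_gpt_567B_1nodes.sh --profiling=torch   # or --profiling=hip, or omit the flag
# The run script redirects progress to output.log.
```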
......@@ -24,15 +24,26 @@ class MegatronAdaptation:
# MegatronAdaptation.post_execute()
@classmethod
def register(cls, orig_func_name, new_func=None, force_patch=False, create_dummy=False, apply_wrapper=False):
def register(cls, orig_func_name, new_func=None, force_patch=False, create_dummy=False, apply_wrapper=False, remove_origin_wrappers=False):
"""
Register adaptations into collection.
"""
if orig_func_name not in cls._patch_info_collection:
from .patch_utils import Patch
cls._patch_info_collection[orig_func_name] = Patch(orig_func_name, new_func, create_dummy, apply_wrapper=apply_wrapper)
cls._patch_info_collection[orig_func_name] = Patch(
orig_func_name,
new_func,
create_dummy,
apply_wrapper=apply_wrapper,
remove_origin_wrappers=remove_origin_wrappers
)
else:
cls._patch_info_collection.get(orig_func_name).set_patch_func(new_func, force_patch, apply_wrapper=apply_wrapper)
cls._patch_info_collection.get(orig_func_name).set_patch_func(
new_func,
force_patch,
apply_wrapper=apply_wrapper,
remove_origin_wrappers=remove_origin_wrappers
)
@classmethod
def apply(cls):
......@@ -167,9 +178,14 @@ class CoreAdaptation(MegatronAdaptationABC):
MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy.calculate_predicted_logits',
VocabParallelCrossEntropy.calculate_predicted_logits)
# _VocabParallelCrossEntropy
MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy._VocabParallelCrossEntropy.forward',
remove_origin_wrappers=True)
MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy._VocabParallelCrossEntropy.forward',
torch.compile(mode='max-autotune-no-cudagraphs'),
apply_wrapper=True)
MegatronAdaptation.register('megatron.core.tensor_parallel.cross_entropy._VocabParallelCrossEntropy.forward',
staticmethod,
apply_wrapper=True)
# flux
MegatronAdaptation.register("megatron.core.tensor_parallel.layers.ColumnParallelLinear.__init__",
......
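The registrations above show the intended calling pattern for the new `remove_origin_wrappers` and `apply_wrapper` options: first strip whatever wrappers upstream already applied to the target, then re-wrap the bare function in the desired order, and nothing is patched until `apply()` runs. A hedged sketch (the import path for `MegatronAdaptation` is assumed, as it is not shown in this diff):

```python
import torch
# Hypothetical import path; adjust to wherever MegatronAdaptation lives in dcu_megatron.
from dcu_megatron.adaptor import MegatronAdaptation

TARGET = 'megatron.core.tensor_parallel.cross_entropy._VocabParallelCrossEntropy.forward'

# 1) Drop the decorators already applied to the original function.
MegatronAdaptation.register(TARGET, remove_origin_wrappers=True)

# 2) Re-wrap the bare function: torch.compile first, then staticmethod.
MegatronAdaptation.register(TARGET,
                            torch.compile(mode='max-autotune-no-cudagraphs'),
                            apply_wrapper=True)
MegatronAdaptation.register(TARGET, staticmethod, apply_wrapper=True)

# 3) The collected Patch objects take effect only when applied.
MegatronAdaptation.apply()
```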
......@@ -17,7 +17,7 @@ def dummy_function_wrapper(func_name):
class Patch:
def __init__(self, orig_func_or_cls_name, new_func_or_cls, create_dummy, apply_wrapper=False):
def __init__(self, orig_func_or_cls_name, new_func_or_cls, create_dummy, apply_wrapper=False, remove_origin_wrappers=False):
split_name = orig_func_or_cls_name.rsplit('.', 1)
if len(split_name) == 1:
self.orig_module_name, self.orig_func_or_cls_name = orig_func_or_cls_name, None
......@@ -28,9 +28,14 @@ class Patch:
self.patch_func_or_cls = None
self.wrappers = []
if new_func_or_cls is None:
self.remove_origin_wrappers = False
if (
new_func_or_cls is None
and not remove_origin_wrappers
):
new_func_or_cls = dummy_function_wrapper(orig_func_or_cls_name)
self.set_patch_func(new_func_or_cls, apply_wrapper=apply_wrapper)
self.set_patch_func(new_func_or_cls, apply_wrapper=apply_wrapper, remove_origin_wrappers=remove_origin_wrappers)
self.is_applied = False
self.create_dummy = create_dummy
......@@ -42,7 +47,33 @@ class Patch:
def patch_func_id(self):
return id(self.patch_func_or_cls)
def set_patch_func(self, new_func_or_cls, force_patch=False, apply_wrapper=False):
@staticmethod
def remove_wrappers(module, func_name, func):
while True:
if (
module.__dict__
and func_name in module.__dict__
and isinstance(module.__dict__[func_name], (staticmethod, classmethod))
):
func = module.__dict__[func_name].__func__
if hasattr(func, '__wrapped__') and func.__wrapped__ is not None:
func = func.__wrapped__
elif hasattr(func, '__closure__') and func.__closure__ is not None:
func = func.__closure__[0].cell_contents
else:
return func
return func
def set_patch_func(self, new_func_or_cls=None, force_patch=False, apply_wrapper=False, remove_origin_wrappers=False):
if remove_origin_wrappers:
self.remove_origin_wrappers = True
else:
assert new_func_or_cls is not None
if new_func_or_cls is None:
return
if (
apply_wrapper
or (hasattr(new_func_or_cls, '__name__') and new_func_or_cls.__name__.endswith(('wrapper', 'decorator')))
......@@ -64,6 +95,11 @@ class Patch:
if self.patch_func_or_cls is not None:
final_patch_func_or_cls = self.patch_func_or_cls
# remove original wrappers
if self.remove_origin_wrappers:
final_patch_func_or_cls = self.remove_wrappers(self.orig_module, self.orig_func_or_cls_name, final_patch_func_or_cls)
# add new wrappers
for wrapper in self.wrappers:
final_patch_func_or_cls = wrapper(final_patch_func_or_cls)
......@@ -73,6 +109,7 @@ class Patch:
if self.orig_func_or_cls_name is not None and hasattr(value, self.orig_func_or_cls_name) \
and id(getattr(value, self.orig_func_or_cls_name)) == self.orig_func_or_cls_id:
setattr(value, self.orig_func_or_cls_name, final_patch_func_or_cls)
self.is_applied = True
@staticmethod
......
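`Patch.remove_wrappers` relies on two conventions for how Python decorators expose the function they wrap: `functools.wraps` stores it on `__wrapped__`, and a plain closure-based decorator keeps it in the first cell of `__closure__`. A small standalone sketch of that unwrapping idea (independent of Megatron, for illustration only):

```python
import functools

def logging_decorator(fn):
    @functools.wraps(fn)            # sets wrapper.__wrapped__ = fn
    def wrapper(*args, **kwargs):
        print(f"calling {fn.__name__}")
        return fn(*args, **kwargs)
    return wrapper

@logging_decorator
def add(a, b):
    return a + b

def unwrap(func):
    # Follow __wrapped__ (functools.wraps) or the first closure cell
    # (plain closure decorators) until a bare function is reached.
    while True:
        if getattr(func, '__wrapped__', None) is not None:
            func = func.__wrapped__
        elif getattr(func, '__closure__', None):
            func = func.__closure__[0].cell_contents
        else:
            return func

bare = unwrap(add)
print(bare(2, 3))                 # 5, without the logging side effect
print(bare is add.__wrapped__)    # True
```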
from .layers import (
parallel_linear_init_wrapper
parallel_linear_init_wrapper,
ColumnParallelLinearPatch,
RowParallelLinearPatch,
vocab_parallel_embedding_forward,
......
from typing import Callable
import os
import warnings
from functools import wraps
from typing import Callable, List, Optional
import flux
import torch
......@@ -20,11 +23,18 @@ from megatron.core.tensor_parallel.layers import (
VocabParallelEmbedding,
)
from megatron.core.tensor_parallel.mappings import (
copy_to_tensor_model_parallel_region,
reduce_from_tensor_model_parallel_region,
reduce_scatter_to_sequence_parallel_region,
)
from megatron.core.tensor_parallel.utils import VocabUtility
from megatron.core.tensor_parallel.mappings import _reduce
from megatron.core.tensor_parallel.layers import (
custom_fwd,
custom_bwd,
linear_with_frozen_weight,
linear_with_grad_accumulation_and_async_allreduce
)
_grad_accum_fusion_available = True
try:
......@@ -32,8 +42,6 @@ try:
except ImportError:
_grad_accum_fusion_available = False
from flux.cpp_mod import ReduceScatterOption
def vocab_parallel_embedding_init(
self,
......@@ -351,7 +359,7 @@ class AGLinear(torch.autograd.Function):
if ctx.allreduce_dgrad:
handle.wait()
return grad_input, grad_weight, grad_bias, None, None, None, None, None
return grad_input, grad_weight, grad_bias, None, None, None, None, None, None
def ag_linear(
......@@ -652,7 +660,7 @@ class LinearRS(torch.autograd.Function):
grad_weight = grad_output.t().matmul(total_input)
grad_bias = grad_output.sum(dim=0) if use_bias else None
return grad_input, grad_weight, grad_bias, None, None, None, None, None
return grad_input, grad_weight, grad_bias, None, None, None, None, None, None
def linear_rs(
......@@ -863,7 +871,6 @@ class ColumnParallelLinearPatch(torch.nn.Module):
else:
self._forward_impl = linear_with_grad_accumulation_and_async_allreduce
allreduce_dgrad = False if self.explicit_expert_comm else self.allreduce_dgrad
forward_params = {
......
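The one-line changes to the `backward` returns above add a ninth `None` because `torch.autograd.Function.backward` must return exactly one value per argument that `forward` received, with `None` for non-tensor arguments; adding a forward argument therefore means adding another `None`. A minimal, generic illustration of that contract (a toy function, not the flux kernels):

```python
import torch

class ScaledLinear(torch.autograd.Function):
    """forward takes 3 arguments (after ctx), so backward returns 3 values."""

    @staticmethod
    def forward(ctx, input, weight, scale):
        ctx.save_for_backward(input, weight)
        ctx.scale = scale
        return scale * input.matmul(weight.t())

    @staticmethod
    def backward(ctx, grad_output):
        input, weight = ctx.saved_tensors
        grad_input = ctx.scale * grad_output.matmul(weight)
        grad_weight = ctx.scale * grad_output.t().matmul(input)
        # One slot per forward argument: `scale` is a plain float, so it gets None.
        return grad_input, grad_weight, None

x = torch.randn(4, 8, requires_grad=True)
w = torch.randn(16, 8, requires_grad=True)
ScaledLinear.apply(x, w, 2.0).sum().backward()
print(x.grad.shape, w.grad.shape)  # torch.Size([4, 8]) torch.Size([16, 8])
```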
import os
import argparse
from typing import Union
from megatron.training.arguments import (
_add_network_size_args,
_add_regularization_args,
_add_training_args,
_add_initialization_args,
_add_learning_rate_args,
_add_checkpointing_args,
_add_mixed_precision_args,
_add_distributed_args,
_add_validation_args,
_add_data_args,
_add_tokenizer_args,
_add_autoresume_args,
_add_biencoder_args,
_add_vision_args,
......@@ -28,6 +32,18 @@ from megatron.training.arguments import (
)
def remove_original_params(parser, param_names: Union[list, str]):
if isinstance(param_names, str):
param_names = [param_names]
for action in parser._actions:
if action.dest in param_names:
parser._actions.remove(action)
for option_string in action.option_strings:
if option_string in parser._option_string_actions:
del parser._option_string_actions[option_string]
def parse_args(extra_args_provider=None, ignore_unknown_args=False):
"""Parse all arguments."""
parser = argparse.ArgumentParser(description='Megatron-LM Arguments',
......@@ -37,14 +53,17 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
parser = _add_network_size_args(parser)
parser = _add_regularization_args(parser)
parser = _add_training_args(parser)
parser = _add_extra_training_args(parser)
parser = _add_initialization_args(parser)
parser = _add_learning_rate_args(parser)
parser = _add_checkpointing_args(parser)
parser = _add_mixed_precision_args(parser)
parser = _add_distributed_args(parser)
parser = _add_extra_distributed_args(parser)
parser = _add_validation_args(parser)
parser = _add_data_args(parser)
parser = _add_tokenizer_args(parser)
parser = _add_extra_tokenizer_args(parser)
parser = _add_autoresume_args(parser)
parser = _add_biencoder_args(parser)
parser = _add_vision_args(parser)
......@@ -61,6 +80,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
parser = _add_ft_package_args(parser)
parser = _add_config_logger_args(parser)
parser = _add_rerun_machine_args(parser)
parser = _add_flux_args(parser)
# Custom arguments.
if extra_args_provider is not None:
......@@ -79,7 +99,6 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
"Yaml config is not supported with legacy models."
args = load_yaml(args.yaml_cfg)
# Args from environment
#args.rank = int(os.getenv('RANK', '0'))
#args.world_size = int(os.getenv("WORLD_SIZE", '1'))
......@@ -87,136 +106,8 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
return args
def _add_distributed_args(parser):
group = parser.add_argument_group(title='distributed')
group.add_argument('--tensor-model-parallel-size', type=int, default=1,
help='Degree of tensor model parallelism.')
group.add_argument('--encoder-tensor-model-parallel-size', type=int, default=0,
help='Degree of tensor model parallelism for the encoder.')
group.add_argument('--pipeline-model-parallel-size', type=int, default=1,
help='Degree of pipeline model parallelism.')
group.add_argument('--encoder-pipeline-model-parallel-size', type=int, default=0,
help=('Degree of pipeline model parallelism in the encoder. This is '
'independent of the amount of pipeline in the decoder.'))
group.add_argument('--pipeline-model-parallel-split-rank',
type=int, default=None,
help=('Rank where encoder and decoder should be split. '
'Deprecated; use --encoder-pipeline-model-parallel-size instead.'))
group.add_argument('--decoder-first-pipeline-num-layers',
type=int, default=None,
help=('The number of transformer layers on the first pipeline stage of the decoder. '
'Default None is even split of transformer layers across all pipeline stages'))
group.add_argument('--decoder-last-pipeline-num-layers',
type=int, default=None,
help=('The number of transformer layers on the last pipeline stage of the decoder. '
'Default None is even split of transformer layers across all pipeline stages'))
group.add_argument('--model-parallel-size', type=int, default=None,
help='Old model parallel argument, do not use. Use '
'--tensor-model-parallel-size instead.')
group.add_argument('--num-layers-per-virtual-pipeline-stage', type=int, default=None,
help='Number of layers per virtual pipeline stage')
group.add_argument('--num-virtual-stages-per-pipeline-rank', type=int, default=None,
help='Number of virtual pipeline stages per pipeline parallelism rank')
group.add_argument('--microbatch-group-size-per-virtual-pipeline-stage', type=int, default=None,
help='Number of contiguous microbatches per virtual pipeline stage',
dest='microbatch_group_size_per_vp_stage')
group.add_argument('--no-overlap-p2p-communication', action='store_false',
help='overlap pipeline parallel communication with forward and backward chunks in 1F1B',
dest='overlap_p2p_comm')
group.add_argument('--overlap-p2p-communication-warmup-flush', action='store_true',
default=False, help='if set, overlap pipeline parallel communication in warmup and flush',
dest='overlap_p2p_comm_warmup_flush')
group.add_argument('--distributed-backend', default='nccl',
choices=['nccl', 'gloo'],
help='Which backend to use for distributed training.')
group.add_argument('--distributed-timeout-minutes', type=int, default=10,
help='Timeout minutes for torch.distributed.')
group.add_argument('--overlap-grad-reduce', action='store_true',
default=False, help='If set, overlap DDP grad reduce.')
group.add_argument('--defer-embedding-wgrad-compute', action='store_true',
default=False, help='If set, defers the vocabulary projection linear layer weight'
'gradient compute to pipeline flush.', dest='defer_embedding_wgrad_compute')
group.add_argument('--wgrad-deferral-limit', type=int, default=0, help='Number of micro-batches for which'
'weight gradient computation of vocabulary projection is deferred, defaults to 0 which'
'means all the micro-batches are deferred. Invalid if `defer-embedding-wgrad-compute`'
'is not set')
group.add_argument('--no-align-grad-reduce', action='store_false',
help='If not set, all PP stages will launch gradient reduces simultaneously. '
'Otherwise, each PP stage will independently launch as needed.',
dest='align_grad_reduce')
group.add_argument('--ddp-bucket-size', type=int, default=None,
help='Bucket size for data-parallel communication')
group.add_argument('--ddp-average-in-collective', action='store_true',
default=False, help='If set, average directly in data-parallel communication collective.')
group.add_argument('--overlap-param-gather', action='store_true',
default=False, help='If set, overlap param all-gather in distributed optimizer.')
group.add_argument('--overlap-param-gather-with-optimizer-step', action='store_true',
default=False, help='If set, overlap param all-gather of first bucket with optimizer step.')
group.add_argument('--no-align-param-gather', action='store_false',
help='If not set, all PP stages will launch param all-gathers simultaneously. '
'Otherwise, each PP stage will independently launch as needed.',
dest='align_param_gather')
group.add_argument('--no-scatter-gather-tensors-in-pipeline', action='store_false',
help='If not set, use scatter/gather to optimize communication of tensors in pipeline.',
dest='scatter_gather_tensors_in_pipeline')
group.add_argument('--use-ring-exchange-p2p', action='store_true',
default=False, help='If set, use custom-built ring exchange '
'for p2p communications. Note that this option will require '
'a custom built image that support ring-exchange p2p.')
group.add_argument('--local-rank', type=int, default=int(os.getenv('LOCAL_RANK', '0')),
help='local rank passed from distributed launcher.')
group.add_argument('--lazy-mpu-init', type=bool, required=False,
help='If set to True, initialize_megatron() '
'skips DDP initialization and returns function to '
'complete it instead.Also turns on '
'--use-cpu-initialization flag. This is for '
'external DDP manager.' )
group.add_argument('--account-for-embedding-in-pipeline-split', action='store_true',
default=False, help='If set, *input* embedding layer will be treated as a standard transformer'
'layer in the context of partition and placement for pipeline parallelism.')
group.add_argument('--account-for-loss-in-pipeline-split', action='store_true',
default=False, help='If set, loss layer will be treated as a standard transformer'
'layer in the context of partition and placement for pipeline parallelism.')
group.add_argument('--use-distributed-optimizer', action='store_true',
help='Use distributed optimizer.')
group.add_argument('--num-distributed-optimizer-instances', type=int, default=1,
help='Number of Distributed Optimizer copies across Data Parallel domain.')
group.add_argument('--use-torch-fsdp2', action='store_true',
help="Use the torch FSDP2 implementation. FSDP2 is not currently working with Pipeline Parallel."
"It is still not in a stable release stage, and may therefore contain bugs or other potential issues.")
group.add_argument('--context-parallel-size', type=int, default=1,
help='Degree of context parallelism.')
group.add_argument('--cp-comm-type', nargs='+', type=str, default=["p2p"],
help='Inter-gpu communication type for context parallelism: '
'p2p, a2a, allgather or a2a+p2p. If a single string is provided, '
'all layers will share the same communication type. Users can also '
'specify separated types for each layer like '
'--cp-comm-type p2p p2p a2a a2a a2a+p2p a2a+p2p')
group.add_argument('--hierarchical-context-parallel-sizes', nargs='+', type=int, default=None,
help='Degrees of the hierarchical context parallelism. Users should '
'provide a list to specify the sizes for different levels. '
'--hierarchical-context-parallel-sizes 2 4 indicates every two adjacent gpus '
'forms the first level of cp groups and the cp ranks with the same odevity '
'forms the second level of cp groups.')
group.add_argument('--nccl-communicator-config-path', type=str, default=None,
help='Path to the yaml file with NCCL communicator '
'configurations. The number of min/max thread groups and thread '
'group cluster size of each communicator can be configured by '
'setting `min_ctas`, `max_ctas`, and `cga_cluster_size`.')
group.add_argument('--use-tp-pp-dp-mapping', action='store_true', default=False,
help='If set, distributed ranks initialize order is changed '
'from tp-cp-ep-dp-pp to tp-cp-ep-pp-dp.')
group.add_argument('--replication', action='store_true', default=False,
help="If set, replication of local checkpoints is enabled. "
"Needs to be enabled on all ranks.")
group.add_argument('--replication-jump', default=None, type=int,
help="Specifies `J`, the spacing between ranks storing replicas of a given rank's data. "
"Replicas for rank `n` may be on ranks `n+J`, `n+2J`, ..., or `n-J`, `n-2J`, etc. "
"This flag has an effect only if --replication is used. "
"and must be consistent across all ranks.")
group.add_argument('--replication-factor', default=2, type=int,
help="Number of machines storing the replica of a given rank's data.")
def _add_extra_distributed_args(parser):
group = parser.add_argument_group(title='extra distributed args')
group.add_argument('--rank', default=-1, type=int,
help='node rank for distributed training')
group.add_argument('--world-size', type=int, default=8,
......@@ -226,19 +117,25 @@ def _add_distributed_args(parser):
return parser
def _add_tokenizer_args(parser):
group = parser.add_argument_group(title='tokenizer')
group.add_argument('--vocab-size', type=int, default=None,
help='Size of vocab before EOD or padding.')
def _add_extra_training_args(parser):
group = parser.add_argument_group(title='extra training args')
group.add_argument('--use-hip-profiler', action='store_true',
help='Use HIP PROFILER',
dest='use_hip_profiler')
group.add_argument('--profile-dir', type=str, default="./",
help='profile dir to save.')
return parser
def _add_extra_tokenizer_args(parser):
# remove the original parameter
remove_original_params(parser, ["tokenizer_type"])
# redefine the parameters
group = parser.add_argument_group(title='extra tokenizer args')
group.add_argument('--extra-vocab-size', type=int, default=0,
help="--extra-vocab-size")
group.add_argument('--vocab-file', type=str, default=None,
help='Path to the vocab file.')
group.add_argument('--merge-file', type=str, default=None,
help='Path to the BPE merge file.')
group.add_argument('--vocab-extra-ids', type=int, default=0,
help='Number of additional vocabulary tokens. '
'They are used for span masking in the T5 model')
group.add_argument('--tokenizer-type', type=str,
default=None,
choices=['BertWordPieceLowerCase',
......@@ -255,263 +152,6 @@ def _add_tokenizer_args(parser):
'NullTokenizer',
'DeepSeekV2Tokenizer'],
help='What type of tokenizer to use.')
group.add_argument('--tokenizer-model', type=str, default=None,
help='Sentencepiece tokenizer model.')
group.add_argument('--tiktoken-pattern', type=str, default=None,
help='Which tiktoken pattern to use. Options: [v1, v2]')
group.add_argument('--tiktoken-num-special-tokens', type=int, default=1000,
help='Number of special tokens in tiktoken tokenizer')
group.add_argument('--tiktoken-special-tokens', type=str, nargs='+', default=None,
help='List of tiktoken special tokens, needs to have ["<unk>", "<s>", "</s>"]')
return parser
def _add_training_args(parser):
group = parser.add_argument_group(title='training')
group.add_argument('--micro-batch-size', type=int, default=None,
help='Batch size per model instance (local batch size). '
'Global batch size is local batch size times data '
'parallel size times number of micro batches.')
group.add_argument('--batch-size', type=int, default=None,
help='Old batch size parameter, do not use. '
'Use --micro-batch-size instead')
group.add_argument('--global-batch-size', type=int, default=None,
help='Training batch size. If set, it should be a '
'multiple of micro-batch-size times data-parallel-size. '
'If this value is None, then '
'use micro-batch-size * data-parallel-size as the '
'global batch size. This choice will result in 1 for '
'number of micro-batches.')
group.add_argument('--rampup-batch-size', nargs='*', default=None,
help='Batch size ramp up with the following values:'
' --rampup-batch-size <start batch size> '
' <batch size incerement> '
' <ramp-up samples> '
'For example:'
' --rampup-batch-size 16 8 300000 \\ '
' --global-batch-size 1024'
'will start with global batch size 16 and over '
' (1024 - 16) / 8 = 126 intervals will increase'
'the batch size linearly to 1024. In each interval'
'we will use approximately 300000 / 126 = 2380 samples.')
group.add_argument('--decrease-batch-size-if-needed', action='store_true', default=False,
help='If set, decrease batch size if microbatch_size * dp_size'
'does not divide batch_size. Useful for KSO (Keep Soldiering On)'
'to continue making progress if number of healthy GPUs (and'
'corresponding dp_size) does not support current batch_size.'
'Old batch_size will be restored if training is re-started with'
'dp_size that divides batch_size // microbatch_size.')
group.add_argument('--recompute-activations', action='store_true',
help='recompute activation to allow for training '
'with larger models, sequences, and batch sizes.')
group.add_argument('--recompute-granularity', type=str, default=None,
choices=['full', 'selective'],
help='Checkpoint activations to allow for training '
'with larger models, sequences, and batch sizes. '
'It is supported at two granularities 1) full: '
'whole transformer layer is recomputed, '
'2) selective: core attention part of the transformer '
'layer is recomputed.')
group.add_argument('--no-check-for-nan-in-loss-and-grad', action='store_false',
help='Check for NaNs in loss and grad',
dest='check_for_nan_in_loss_and_grad')
group.add_argument('--check-for-spiky-loss', action='store_true',
help='Check for spiky loss',
dest='check_for_spiky_loss')
group.add_argument('--distribute-saved-activations',
action='store_true',
help='If set, distribute recomputed activations '
'across model parallel group.')
group.add_argument('--recompute-method', type=str, default=None,
choices=['uniform', 'block'],
help='1) uniform: uniformly divide the total number of '
'Transformer layers and recompute the input activation of '
'each divided chunk at specified granularity, '
'2) recompute the input activations of only a set number of '
'individual Transformer layers per pipeline stage and do the '
'rest without any recomputing at specified granularity'
'default) do not apply activations recompute to any layers')
group.add_argument('--recompute-num-layers', type=int, default=None,
help='1) uniform: the number of Transformer layers in each '
'uniformly divided recompute unit, '
'2) block: the number of individual Transformer layers '
'to recompute within each pipeline stage.')
group.add_argument('--no-clone-scatter-output-in-embedding', action='store_false',
help='If not set, clone the output of the scatter in embedding layer to GC original tensor.',
dest='clone_scatter_output_in_embedding')
group.add_argument('--profile', action='store_true',
help='Enable nsys profiling. When using this option, nsys '
'options should be specified in commandline. An example '
'nsys commandline is `nsys profile -s none -t nvtx,cuda '
'-o <path/to/output_file> --force-overwrite true '
'--capture-range=cudaProfilerApi '
'--capture-range-end=stop`.')
group.add_argument('--profile-step-start', type=int, default=10,
help='Global step to start profiling.')
group.add_argument('--profile-step-end', type=int, default=12,
help='Global step to stop profiling.')
group.add_argument('--use-pytorch-profiler', action='store_true',
help='Use the built-in pytorch profiler. '
'Useful if you wish to view profiles in tensorboard.',
dest='use_pytorch_profiler')
group.add_argument('--profile-ranks', nargs='+', type=int, default=[0],
help='Global ranks to profile.')
group.add_argument('--record-memory-history', action="store_true", default=False,
help='Record memory history in last rank.')
group.add_argument('--memory-snapshot-path', type=str, default="snapshot.pickle",
help='Specifies where to dump the memory history pickle.')
group.add_argument('--tp-comm-overlap', action='store_true', help='Enables the '
' overlap of Tensor parallel communication and GEMM kernels.')
group.add_argument('--tp-comm-overlap-cfg', type=str, default=None,
help='Config file when tp_comm_overlap is enabled.')
group.add_argument('--disable-tp-comm-overlap-ag', action='store_false',
help=('Disables the All-Gather overlap with GEMM by '
'pipelining the GEMM and All-Gather.'),
dest='tp_comm_overlap_ag')
group.add_argument('--disable-tp-comm-overlap-rs', action='store_false',
help=('Disables the Reduce-Scatter overlap with GEMM by '
'pipelining the GEMM and Reduce-Scatter.'),
dest='tp_comm_overlap_rs')
group.add_argument('--tp-comm-overlap-rs-dgrad', action='store_true',
help = 'Enables the Reduce-Scatter overlap with dgrad GEMM.',
dest='tp_comm_overlap_rs_dgrad')
group.add_argument('--disable-tp-comm-bulk-dgrad', action='store_false',
help='Disables the All-Gather overlap with bprop activation gradient GEMM.',
dest='tp_comm_bulk_dgrad')
group.add_argument('--disable-tp-comm-bulk-wgrad', action='store_false',
help='Disables the Reduce-Scatter overlap with bprop weight gradient GEMM.',
dest='tp_comm_bulk_wgrad')
group.add_argument('--tp-comm-bootstrap-backend', default='nccl', type=str,
choices=['nccl', 'mpi', 'gloo'],
help='Set the bootstrapping backend of Tensor parallel communications.')
group.add_argument('--use-cpu-initialization', action='store_true',
default=None,
help='If set, initialize weights on the CPU. This eliminates init differences based on tensor parallelism.')
group.add_argument('--empty-unused-memory-level', default=0, type=int,
choices=[0, 1, 2],
help='Call torch.cuda.empty_cache() each iteration '
'(training and eval), to reduce fragmentation.'
'0=off, 1=moderate, 2=aggressive.')
group.add_argument('--deterministic-mode', action='store_true',
help='Choose code that has deterministic execution. This usually '
'means slower execution, but is good for debugging and testing.')
group.add_argument('--check-weight-hash-across-dp-replicas-interval', type=int, default=None,
help='Interval to check weight hashes are same across DP replicas. If not specified, weight hashes not checked.')
group.add_argument('--calculate-per-token-loss', action='store_true',
help=('Scale cross entropy loss by the number of non-padded tokens in the '
'global batch, versus the default behavior of assuming all tokens are non-padded.'))
group.add_argument('--train-sync-interval', type=int, default=None,
help='Training CPU-GPU synchronization interval, to ensure that CPU is not running too far ahead of GPU.')
# deprecated
group.add_argument('--checkpoint-activations', action='store_true',
help='Checkpoint activation to allow for training '
'with larger models, sequences, and batch sizes.')
group.add_argument('--train-iters', type=int, default=None,
help='Total number of iterations to train over all '
'training runs. Note that either train-iters or '
'train-samples should be provided.')
group.add_argument('--train-samples', type=int, default=None,
help='Total number of samples to train over all '
'training runs. Note that either train-iters or '
'train-samples should be provided.')
group.add_argument('--log-interval', type=int, default=100,
help='Report loss and timing interval.')
group.add_argument('--exit-interval', type=int, default=None,
help='Exit the program after the iteration is divisible '
'by this value.')
group.add_argument('--exit-duration-in-mins', type=int, default=None,
help='Exit the program after this many minutes.')
group.add_argument('--exit-signal-handler', action='store_true',
help='Dynamically save the checkpoint and shutdown the '
'training if SIGTERM is received')
group.add_argument('--tensorboard-dir', type=str, default=None,
help='Write TensorBoard logs to this directory.')
group.add_argument('--no-masked-softmax-fusion',
action='store_false',
help='Disable fusion of query_key_value scaling, '
'masking, and softmax.',
dest='masked_softmax_fusion')
group.add_argument('--no-bias-gelu-fusion', action='store_false',
help='Disable bias and gelu fusion.',
dest='bias_gelu_fusion')
group.add_argument('--no-bias-swiglu-fusion', action='store_false',
help='Disable bias and swiglu fusion, the fusion is '
'available only when using megatron-core.',
dest='bias_swiglu_fusion')
group.add_argument('--no-bias-dropout-fusion', action='store_false',
help='Disable bias and dropout fusion.',
dest='bias_dropout_fusion')
group.add_argument('--no-rope-fusion', action='store_false',
help='Disable rope fusion, the fusion is available '
'only when using megatron-core.',
dest='apply_rope_fusion')
group.add_argument('--cross-entropy-loss-fusion', action='store_true',
help='Enabled fusion of cross entropy loss calculation.',
dest='cross_entropy_loss_fusion')
group.add_argument('--use-flash-attn', action='store_true',
help='use FlashAttention implementation of attention. '
'https://arxiv.org/abs/2205.14135')
group.add_argument('--disable-bias-linear', action='store_false',
help='Disable bias in the linear layers',
dest='add_bias_linear')
group.add_argument('--add-qkv-bias', action='store_true',
help='Enable bias only in the QKV linear layers',
dest='add_qkv_bias')
group.add_argument('--optimizer', type=str, default='adam',
choices=['adam', 'sgd'],
help='Optimizer function')
group.add_argument('--dataloader-type', type=str, default=None,
choices=['single', 'cyclic', 'external'],
help='Single pass vs multiple pass data loader')
group.add_argument('--no-async-tensor-model-parallel-allreduce',
action='store_false',
help='DEPRECATED. This flag is ignored.',
dest='async_tensor_model_parallel_allreduce')
group.add_argument('--no-persist-layer-norm', action='store_true',
help='Disable using persistent fused layer norm kernel. '
'This kernel supports only a set of hidden sizes. Please '
'check persist_ln_hidden_sizes if your hidden '
'size is supported.')
group.add_argument('--sequence-parallel', action='store_true',
help='Enable sequence parallel optimization.')
group.add_argument('--no-gradient-accumulation-fusion',
action='store_false',
help='Disable fusing gradient accumulation to weight '
'gradient computation of linear layers',
dest='gradient_accumulation_fusion')
group.add_argument('--use-mcore-models', action='store_true',
dest='deprecated_use_mcore_models',
help='DEPRECATED. Use the implementation from megatron core.'
'Now ignored and mcore models are the default, use '
'--use-legacy-models to not use core models.')
group.add_argument('--use-legacy-models', action='store_true',
help='Use the legacy Megatron models, not Megatron-Core models.')
group.add_argument('--manual-gc', action='store_true',
help='Disable the threshold-based default garbage '
'collector and trigger the garbage collection manually. '
'Manual garbage collection helps to align the timing of '
'the collection across ranks which mitigates the impact '
'of CPU-associated jitters. When the manual gc is enabled, '
'garbage collection is performed only at the start and the '
'end of the validation routine by default.')
group.add_argument('--manual-gc-interval', type=int, default=0,
help='Training step interval to trigger manual garbage '
'collection. When the value is set to 0, garbage '
'collection is not triggered between training steps.')
group.add_argument('--no-manual-gc-eval', action='store_false',
help='When using manual garbage collection, disable '
'garbage collection at the start and the end of each '
'evaluation run.', dest='manual_gc_eval')
group.add_argument('--disable-tp-comm-split-ag', action='store_false',
help='Disables the All-Gather overlap with fprop GEMM.',
dest='tp_comm_split_ag')
group.add_argument('--disable-tp-comm-split-rs', action='store_false',
help='Disables the Reduce-Scatter overlap with fprop GEMM.',
dest='tp_comm_split_rs')
group.add_argument('--profile-dir', type=str, default="./",
help='profile dir to save.')
return parser
......
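The `remove_original_params` helper added at the top of this file lets the extra-args functions drop an upstream argparse option and re-register it with different choices or defaults, as `_add_extra_tokenizer_args` does for `--tokenizer-type`. A hedged, self-contained sketch of that pattern with a throwaway parser (the option names here are illustrative):

```python
import argparse

def remove_original_params(parser, param_names):
    # Same idea as the helper above: drop the Action and its option strings.
    if isinstance(param_names, str):
        param_names = [param_names]
    for action in list(parser._actions):
        if action.dest in param_names:
            parser._actions.remove(action)
            for option_string in action.option_strings:
                parser._option_string_actions.pop(option_string, None)

parser = argparse.ArgumentParser()
parser.add_argument('--tokenizer-type', type=str, choices=['A', 'B'])

# Remove the original definition, then re-add it with an extended choice list.
remove_original_params(parser, ['tokenizer_type'])
parser.add_argument('--tokenizer-type', type=str, default=None,
                    choices=['A', 'B', 'DeepSeekV2Tokenizer'])

print(parser.parse_args(['--tokenizer-type', 'DeepSeekV2Tokenizer']))
# Namespace(tokenizer_type='DeepSeekV2Tokenizer')
```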
......@@ -182,6 +182,9 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
#on_trace_ready=torch.profiler.tensorboard_trace_handler('./torch_prof_data'))
on_trace_ready=trace_handler)
prof.start()
elif args.profile and torch.distributed.get_rank() in args.profile_ranks and args.use_hip_profiler:
import ctypes
roctracer = ctypes.cdll.LoadLibrary("/opt/dtk/roctracer/lib/libroctracer64.so")
start_iteration = iteration
# Disable forward pre-hook to start training to ensure that errors in checkpoint loading
......@@ -206,6 +209,9 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
if args.profile and torch.distributed.get_rank() in args.profile_ranks:
if args.use_pytorch_profiler:
prof.step()
elif args.use_hip_profiler:
if iteration == args.profile_step_start: roctracer.roctracer_start()
if iteration == args.profile_step_end: roctracer.roctracer_stop()
elif iteration == args.profile_step_start:
torch.cuda.cudart().cudaProfilerStart()
torch.autograd.profiler.emit_nvtx(record_shapes=True).__enter__()
......
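For reference, the `--use-hip-profiler` path above expects the process to be launched under `hipprof ... --trace-off` (see the run scripts below) and then toggles tracing for the chosen iterations through roctracer's C API via `ctypes`. A stripped-down sketch of that pattern, assuming the same library path as the diff above (it may differ on other installs):

```python
import ctypes

# Tracing starts disabled because the job was launched with `hipprof --hip-trace --trace-off`.
roctracer = ctypes.cdll.LoadLibrary("/opt/dtk/roctracer/lib/libroctracer64.so")

profile_step_start, profile_step_end = 4, 5   # mirrors --profile-step-start/--profile-step-end

for iteration in range(10):
    if iteration == profile_step_start:
        roctracer.roctracer_start()   # begin collecting HIP API traces
    if iteration == profile_step_end:
        roctracer.roctracer_stop()    # stop collecting
    # ... one training step would run here ...
```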
# GPT3 MODEL
## Table of contents
- [1. Training Setup](#1-training-setup)
- [2. Configurations](#2-configurations)
- [3. Training Results](#3-training-results)
## 1. Training setup
<a id="markdown-training-setup" name="training-setup"></a>
To run the model using a Docker container, run it as follows:
```
PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
CHECKPOINT_PATH="" #<Specify path>
TENSORBOARD_LOGS_PATH="" #<Specify path>
VOCAB_FILE="" #<Specify path to file>/gpt2-vocab.json
MERGE_FILE="" #<Specify path to file>/gpt2-merges.txt
DATA_PATH="" #<Specify path and file prefix>_text_document
docker run \
--gpus=all \
--ipc=host \
--workdir /workspace/megatron-lm \
-v /path/to/data:/path/to/data \
-v /path/to/megatron-lm:/workspace/megatron-lm \
$PYTORCH_IMAGE \
bash examples/gpt3/train_gpt3_175b_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $MERGE_FILE $DATA_PATH
```
NOTE: Depending on the environment you are running in, the above command may look slightly different.
## 2. Configurations
<a id="markdown-configurations" name="configurations"></a>
The example in this folder shows you how to run the 175B model. There are other configs you could run as well:
### 345M
```
--num-layers 12 \
--hidden-size 512 \
--num-attention-heads 8 \
--seq-length 1024 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
### 857M
```
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
mpirun -np 8 --allow-run-as-root \
train_gpt_567B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
wait
rm -rf CKPT
rm -rf gpt_dataset/redpajama_text_document
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
mpirun -np 512 --hostfile hostfile_gpt_567B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_gpt_567B_multinodes.sh node059 --profiling=$profiling > output.log 2>&1
wait
rm -rf CKPT
rm -rf gpt_dataset/redpajama_text_document
\ No newline at end of file
<system version="2">
<cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000,00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:99:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9d:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:9f:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="0" sm="93" gcn="gfx936" arch="169983" rank="0" gdr="1">
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:51:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:54:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:56:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="1" sm="93" gcn="gfx936" arch="169983" rank="1" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:9b:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_2" dev="2" speed="200000" port="1" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_3" dev="3" speed="200000" port="2" latency="0.000000" guid="0x2227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="0" affinity="00000000,00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:01:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:03:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:05:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="3" sm="93" gcn="gfx936" arch="169983" rank="3" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:59:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5b:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:5d:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="2" sm="93" gcn="gfx936" arch="169983" rank="2" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:06:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_4" dev="4" speed="200000" port="1" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_5" dev="5" speed="200000" port="2" latency="0.000000" guid="0x8228a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="7" affinity="7fff0000,00000000,00000000,00000000,ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:e1:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e3:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:e5:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="4" sm="93" gcn="gfx936" arch="169983" rank="4" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:bd:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:bf:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="5" sm="93" gcn="gfx936" arch="169983" rank="5" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:e6:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_6" dev="6" speed="200000" port="1" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_7" dev="7" speed="200000" port="2" latency="0.000000" guid="0x6227a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:ab:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:af:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:b1:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="7" sm="93" gcn="gfx936" arch="169983" rank="7" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:ca:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:c5:00.0" class="0x060400" vendor="0x1000" device="0xc030" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:c8:00.0" class="0x060400" vendor="0x1d94" device="0x23b7" subsystem_vendor="0x1000" subsystem_device="0x100b" link_speed="32.0 GT/s PCIe" link_width="16">
<pci busid="0000:ca:00.0" class="0x0b4000" vendor="0x1d94" device="0x6320" subsystem_vendor="0x1d94" subsystem_device="0x6310" link_speed="32.0 GT/s PCIe" link_width="16">
<gpu dev="6" sm="93" gcn="gfx936" arch="169983" rank="6" gdr="1">
<xgmi target="0000:9f:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:56:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:5d:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:05:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:e5:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:b1:00.0" count="7" tclass="0x0b4000"/>
<xgmi target="0000:c1:00.0" count="7" tclass="0x0b4000"/>
</gpu>
</pci>
</pci>
</pci>
<pci busid="0000:ad:00.0" class="0x020000" vendor="0x15b3" device="0x1021" subsystem_vendor="0x15b3" subsystem_device="0x0022" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_8" dev="8" speed="200000" port="1" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
<net name="mlx5_9" dev="9" speed="200000" port="2" latency="0.000000" guid="0xd226a1000373255c" maxconn="131072" gdr="1"/>
</nic>
</pci>
</pci>
</cpu>
<cpu numaid="2" affinity="00000000,00000000,0000ffff,00000000,00000000,00000000,0000ffff,00000000" arch="x86_64" vendor="HygonGenuine" familyid="159" modelid="4">
<pci busid="0000:71:00.0" class="0x020000" vendor="0x15b3" device="0xa2dc" subsystem_vendor="0x15b3" subsystem_device="0x0009" link_speed="32.0 GT/s PCIe" link_width="16">
<nic>
<net name="mlx5_0" dev="0" speed="200000" port="1" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
<net name="mlx5_1" dev="1" speed="40000" port="2" latency="0.000000" guid="0xc0d00a000324e9b8" maxconn="131072" gdr="1"/>
</nic>
</pci>
</cpu>
</system>
#!/bin/bash
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs GPT 567B model
source /opt/dtk/env.sh
# default env
DIST_URL=${1}
DIST_PORT=25900
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to redpajama_text_document"
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
--local-rank ${LOCAL_RANK}
--dist-url tcp://${DIST_URL}:${DIST_PORT}
)
MODEL_ARGS=(
--use-mcore-models
--disable-bias-linear
--seq-length 8192
--max-position-embeddings 32768
--num-layers 2
--hidden-size 8192
--ffn-hidden-size 32768
--num-attention-heads 64
--init-method-std 0.01
--attention-dropout 0.0
--hidden-dropout 0.0
--normalization RMSNorm
--position-embedding-type rope
--untie-embeddings-and-output-weights
--no-masked-softmax-fusion
--no-position-embedding
--rotary-base 1000000
--ckpt-format torch
)
MOE_ARGS=(
--num-experts 16
--moe-router-topk 2
--moe-router-load-balancing-type aux_loss
--moe-aux-loss-coeff 1e-2
--moe-token-dispatcher-type alltoall
--moe-expert-capacity-factor 0.5
--moe-pad-expert-input-to-capacity
--moe-grouped-gemm
)
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL}
--data-path $DATA_PATH
--split 98,2,0
)
TRAINING_ARGS=(
--micro-batch-size 1
--global-batch-size 256
--lr 1e-4
--train-iters 10
--lr-decay-iters 320000
--lr-decay-style cosine
--min-lr 1.0e-5
--weight-decay 0.1
--lr-warmup-iters 500
--clip-grad 1.0
--bf16
--overlap-param-gather
--overlap-grad-reduce
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_1nodes_tp2-pp1-ep4-ep_tp2-cp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 1
--expert-model-parallel-size 4
--expert-tensor-parallel-size 2
--use-distributed-optimizer
--sequence-parallel
)
LOGGING_ARGS=(
--log-throughput \
--log-interval 1 \
--save-interval 100000 \
--eval-interval 10000 \
--eval-iters 5 \
#--save $CHECKPOINT_PATH \
#--load $CHECKPOINT_PATH \
--tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
--no-load-optim \
--no-load-rng \
--no-save-optim
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT:-"Mixtral"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
)
fi
APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py \
${DISTRIBUTED_ARGS[@]} \
${MODEL_ARGS[@]} \
${MOE_ARGS[@]} \
${DATA_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${LOGGING_ARGS[@]} \
"
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS[@]}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS[@]}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
#for hygon cpu
case ${LOCAL_RANK} in
[0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
#!/bin/bash
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
# Runs GPT 567B model
source /opt/dtk/env.sh
# default env
DIST_URL=${1}
DIST_PORT=25900
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_TOPO_FILE="./topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
# data path
CHECKPOINT_PATH="path to CKPT"
TOKENIZER_MODEL="path to tokenizer.model"
DATA_PATH="path to redpajama_text_document"
DISTRIBUTED_ARGS=(
--rank ${RANK}
--world-size ${WORLD_SIZE}
--local-rank ${LOCAL_RANK}
--dist-url tcp://${DIST_URL}:${DIST_PORT}
)
MODEL_ARGS=(
--use-mcore-models
--disable-bias-linear
--seq-length 8192
--max-position-embeddings 32768
--num-layers 64
--hidden-size 8192
--ffn-hidden-size 32768
--num-attention-heads 64
--init-method-std 0.01
--attention-dropout 0.0
--hidden-dropout 0.0
--normalization RMSNorm
--position-embedding-type rope
--untie-embeddings-and-output-weights
--no-masked-softmax-fusion
--no-position-embedding
--rotary-base 1000000
--ckpt-format torch
)
MOE_ARGS=(
--num-experts 16
--moe-router-topk 2
--moe-router-load-balancing-type aux_loss
--moe-aux-loss-coeff 1e-2
--moe-token-dispatcher-type alltoall
--moe-expert-capacity-factor 0.5
--moe-pad-expert-input-to-capacity
#--moe-grouped-gemm
)
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL}
--data-path $DATA_PATH
--split 98,2,0
)
TRAINING_ARGS=(
--micro-batch-size 1
--global-batch-size 1024
--lr 1e-4
--train-iters 10
--lr-decay-iters 320000
--lr-decay-style cosine
--min-lr 1.0e-5
--weight-decay 0.1
--lr-warmup-iters 500
--clip-grad 1.0
--bf16
--overlap-param-gather
--overlap-grad-reduce
)
TORCH_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_64nodes_tp4-pp8-ep16-ep_tp4-cp2
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 4
--pipeline-model-parallel-size 8
--expert-model-parallel-size 16
--expert-tensor-parallel-size 4
--context-parallel-size 2
#--num-layers-per-virtual-pipeline-stage 2
--use-distributed-optimizer
--sequence-parallel
)
LOGGING_ARGS=(
--log-throughput \
--log-interval 1 \
--save-interval 100000 \
--eval-interval 10000 \
--eval-iters 5 \
#--save $CHECKPOINT_PATH \
#--load $CHECKPOINT_PATH \
--tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
--no-load-optim \
--no-load-rng \
--no-save-optim
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT:-"Mixtral"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
)
fi
APP="python3 -u ${MEGATRON_PATH}/pretrain_gpt.py \
${DISTRIBUTED_ARGS[@]} \
${MODEL_ARGS[@]} \
${MOE_ARGS[@]} \
${DATA_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${LOGGING_ARGS[@]} \
"
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS[@]}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS[@]}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
#for hygon cpu
case ${LOCAL_RANK} in
[0])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=3 --membind=3 ${APP}
;;
[4])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=4 --membind=4 ${APP}
;;
[5])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=5 --membind=5 ${APP}
;;
[6])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=6 --membind=6 ${APP}
;;
[7])
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
#numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
# Mixtral 8x7B Model Inference and Finetuning
## Download Mixtral 8x7B Checkpoints
Download Mixtral 8x7B HF format checkpoint from [HF-hub](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/)
Or you can simply run the following script to download Mixtral 8x7B into a specific folder.
```python
from huggingface_hub import snapshot_download
SAVED_DIR = "" # Specify the saved directory
# Download HF checkpoints
snapshot_download(repo_id="mistralai/Mixtral-8x7B-v0.1", ignore_patterns=["*.pt"], local_dir=SAVED_DIR, local_dir_use_symlinks=False)
```
## Convert Mixtral 8x7B checkpoints from HF to MCore
The HF checkpoints can be converted to Megatron format with the provided HF checkpoint converter.
The target model parallel sizes (e.g. TP, PP, EP) should be specified.
The converter does not yet support distributed checkpointing, so each parallel configuration requires its own converted checkpoint.
- For training, the recommended model parallel config is TP1EP8PP4
- For inference, the recommended model parallel config is TP1EP1PP2
```
TOKENIZER_MODEL=/workspace/checkpoints/mixtral-hf/tokenizer.model
MEGATRON_PATH="/workspace/megatron-lm"
export PYTHONPATH=$MEGATRON_PATH:$PYTHONPATH
export CUDA_DEVICE_MAX_CONNECTIONS=1
TARGET_TP_SIZE=""
TARGET_EP_SIZE=""
TARGET_PP_SIZE=""
HF_FORMAT_DIR=/workspace/checkpoints/mixtral-hf
MEGATRON_FORMAT_DIR=/workspace/checkpoints/mixtral-mcore-TP${TARGET_TP_SIZE}PP${TARGET_PP_SIZE}EP${TARGET_EP_SIZE}
python tools/checkpoint/convert.py \
--model-type GPT \
--loader loader_mixtral_hf \
--saver mcore \
--target-tensor-parallel-size ${TARGET_TP_SIZE} \
--target-pipeline-parallel-size ${TARGET_PP_SIZE} \
--target-expert-parallel-size ${TARGET_EP_SIZE} \
--load-dir ${HF_FORMAT_DIR} \
--save-dir ${MEGATRON_FORMAT_DIR} \
--tokenizer-model ${TOKENIZER_MODEL}
```
## Text generation with Mixtral 8x7B
Inference with Mixtral 8x7B requires at least 2 GPUs, so a distributed checkpoint with EP>=2 or PP>=2 converted with the above script is needed.
Megatron-LM includes a simple REST server for text generation in `tools/run_text_generation_server.py`; launch it with the following script:
```
#!/bin/bash
# This example will start serving the Mixtral 8x7B model.
DISTRIBUTED_ARGS="--nproc_per_node 2 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
CHECKPOINT=<Path to checkpoint>
TOKENIZER_MODEL=<Path to tokenizer (e.g. /tokenizer.model)>
export CUDA_DEVICE_MAX_CONNECTIONS=1
pip install flask-restful
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 2 \
--expert-model-parallel-size 1 \
--load ${CHECKPOINT} \
--tokenizer-type Llama2Tokenizer \
--tokenizer-model $TOKENIZER_MODEL \
--use-mcore-models \
--max-position-embeddings 32768 \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 14336 \
--num-attention-heads 32 \
--normalization RMSNorm \
--disable-bias-linear \
--position-embedding-type rope \
--no-position-embedding \
--swiglu \
--untie-embeddings-and-output-weights \
--group-query-attention \
--num-query-groups 8 \
--bf16 \
--micro-batch-size 1 \
--seq-length 1024 \
--seed 42 \
--num-experts 8 \
--moe-router-topk 2 \
--moe-token-dispatcher-type alltoall \
--moe-grouped-gemm \
--mock-data \
--rotary-base 1000000
```
Once the server is running, you can use `tools/text_generation_cli.py` to query it; it takes one argument: the host the server is running on.
```
python tools/text_generation_cli.py localhost:5000
```
## Finetuning from pretrained Mixtral 8x7B
To finetune the pretrained Mixtral 8x7B, use the following script:
```bash
PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.04-py3
CHECKPOINT_PATH="" # Specify path to checkpoint dir
TOKENIZER_MODEL="" # Specify path to tokenizer.model
DATA_PATH="" # Specify path to data
docker run \
--gpus=all \
--ipc=host \
--workdir /workspace/megatron-lm \
-v /path/to/data:/path/to/data \
-v /path/to/megatron-lm:/workspace/megatron-lm \
$PYTORCH_IMAGE \
bash examples/mixtral/train_mixtral_8x7b_distributed.sh $CHECKPOINT_PATH $TOKENIZER_MODEL $DATA_PATH
```
The above functionality also applies to Mixtral 8x22B; set the model config (including hidden_size/head_num/num_layers/ffn_hidden_size) properly according to the original [config](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1/blob/main/config.json).
## Acknowledgements
Contributors outside NVIDIA for the huggingface converter and example of Mixtral models in Megatron-Core:
- Peng Li <jerry.lp@alibaba-inc.com>
- Jun Huang <huangjun.hj@alibaba-inc.com>
for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
fi
done
mpirun -np 8 --allow-run-as-root \
train_mixtral_8x22B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
wait
rm -rf CKPT
rm -rf mixtral_dataset/my-mixtral_text_document