Commit c338d32c authored by hepj987's avatar hepj987
Browse files

更新运行方式

parent bf95e032
Pipeline #434 canceled with stage
......@@ -9,7 +9,7 @@ GPT2模型:第二代生成式预训练模型(Generative Pre-Training2)。
### 模型结构
```
GPT2使用 Transformer 的 Decoder 结构,并对 Transformer Decoder 进行了一些改动,并通过Megatron和deepspeed进行分布式运行
GPT2使用 Transformer 的 Decoder 结构,并对 Transformer Decoder 进行了一些改动,并可通过Megatron和deepspeed使用DP、TP、PP的3D并行方式进行分布式训练
```
### 数据集
......@@ -21,16 +21,19 @@ wget https://huggingface.co/bigscience/misc-test-data/resolve/main/stas/oscar-1G
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
xz -d oscar-1GB.jsonl.xz
#处理数据集参数
--input 输入数据集路径,即oscar-1GB.jsonl.xz解压后的文件路径
--output-prefix 输出数据路径,处理后会自动加上_text_document后缀
--vocab 下载的gpt2-vocab.json词表文件路径
--dataset-impl dataset类型
--tokenizer-type tokenizer类型
--merge-file 下载的gpt2-merges.txt文件路径
--append-eod 添加结束标志符
--workers 进程数
#处理数据集
python tools/preprocess_data.py \
--input oscar-1GB.jsonl \
--output-prefix my-gpt2 \
--vocab gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file gpt2-merges.txt \
--append-eod \
--workers 8
sh creat-data.sh
```
## GPT2预训练
......@@ -46,15 +49,15 @@ docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.10.0-centos7.6-dtk
进入docker
```
pip install -r requirements.txt -i http://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
```
### 训练(单卡测试样例)
### GPT2单节点训练
```
rm megatron/arguments.py
cp megatron/arguments.py-one_node megatron/arguments.py
sh run-train.sh(基于单节点四卡)
#np为启动的进程数,和使用GPU数量一致,并且np需要能被TP*PP整除(TP*PP <= np),4卡的话可以设置2tp 2pp,或者1tp 4pp,4tp 1pp,节点内使用TP性能更好
mpirun -np 4 run-one-node.sh(基于单节点四卡)
```
```
......@@ -81,23 +84,43 @@ SAVE_INTERVAL 保存频率
--eval-iters 验证iter
```
### GPT2模型16B训练(多节点)
### GPT2模型16B多节点训练
要求DCU集群Slurm环境正常
要求DCU集群配置好相应的虚拟环境,已安装python依赖项
推荐用户使用预编译好的python3.7包来快速建立python3虚拟环境,pytorch、apex、torchaudio、colossalai、faiss、mmcv-full 、torchvision、tensorflow需要[光合开发者社区](https://cancon.hpccube.com:65024/4/main/)下载所需DCU版本安装包
在安装以下依赖时需要使用基于DTK编译的版本,下载地址[光合开发者社区](https://cancon.hpccube.com:65024/4/main/)
```
pytorch
deepspeed
apex
torchaudio
colossalai
faiss
mmcv-full
torchvision
tensorflow
```
这里以DTK23.04、python3.7,torch1.10为例,进入[光合开发者社区](https://cancon.hpccube.com:65024/4/main/)进入到pytorch->dtk23.04->下载 torch-1.10.0+gite378c3c.abi0.dtk2304-cp37-cp37m-manylinux2014_x86_64.whl。然后可以仿照下边配置环境:
```
#创建虚拟环境
export PYTHON3_LIB_PATH=/python_lib_path
virtualenv -p /python_bin_path/python3 --system-site-packages venv_gpt2
source env.sh #进入venv_gpt2虚拟环境
#进入venv_gpt2虚拟环境
source venv_gpt2/bin/activate
#加载DTK以及其他环境设置
source env.sh
#安装DTK版本依赖
pip install torch-1.10.0+gite378c3c.abi0.dtk2304-cp37-cp37m-manylinux2014_x86_64.whl
pip install deepspeed-0.9.2+git25d5540.abi0.dtk2304.torch1.10.0-cp37-cp37m-manylinux2014_x86_64.whl
#安装其他依赖
pip install -r requirements.txt -i http://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
```
```
rm megatron/arguments.py
cp megatron/arguments.py-nodes megatron/arguments.py
#多节点运行
sbatch run-16B.sh(主要参数在single-16B.sh)
```
......@@ -137,61 +160,45 @@ SAVE_INTERVAL 保存频率
| :-------: | :-----------: | :----------: |
| 32 x 4DCU | 4.299443E+00 | 7.365877E+01 |
## GPT2文本生成
使用GPT做文本生成时需要对训练好的模型进行转换,转换需要安装0.7.3版本 deepspeed(此工程已包含)
### 转换成多卡推理
```
pip install deepspeed-0.7.3+unknown-cp37-cp37m-linux_x86_64.whl -i http://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
#训练后的模型保存格式为deepspeed格式,如果用于推理,需要转换成megatron格式;deepspeed->megatron转换前后TP数需要保持相同
#转换脚本
sh conver-model_to_megatron.sh
```
对deepspeed进行一些修改
```
修改/usr/local/lib/python3.7/site-packages/deepspeed/checkpoint/constants.py
第34行
ZERO_FILE_PREFIX = 'bf16_' + 'zero_pp_rank_'
改为:
ZERO_FILE_PREFIX = 'zero_pp_rank_'
修改/usr/local/lib/python3.7/site-packages/deepspeed/ops/op_builder/builder.py
第133行 def assert_torch_info(torch_info):函数
删除下边的版本判断
install_torch_version = torch_info['version']
install_cuda_version = torch_info['cuda_version']
install_hip_version = torch_info['hip_version']
修改/usr/local/lib/python3.7/site-packages/deepspeed/runtime/state_dict_factory.py文件
第177行def check_ckpt_list(self):函数
删除mp_world_size判断
if 'mp_world_size' in sd.keys():
assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}"
#重要参数
需要将工程路径加入PYTHONPATH
例如:export PYTHONPATH=/home/megatron-deepspeed_dtk23.04:$PYTHONPATH
CHECKPOINT_PATH 需要转换的模型路径(具体到保存的global_step)
output_folder 转换后的模型路径
target_tp 转换后的TP数,与训练保持一致或设置为1
target_pp 转换后的PP数,与训练保持一致或设置为1
```
### 转换脚本
### 转换成单卡推理
```
sh conver.sh
```
#原始模型保存的是deepspeed格式,deepspeed-> megatron格式时转换前后TP数需要保持相同,因此需要先deepspeed->deepspeed(改变TP成1),然后再由deepspeed-> megatron转换成可推理的格式
#转换脚本
sh conver-model-1tp.sh
```
#重要参数
需要将工程路径加入PYTHONPATH
例如:export PYTHONPATH=/home/megatron-deepspeed_dtk22.10:$PYTHONPATH
CHECKPOINT_PATH 需要转换的模型路径(具体到保存的global_step)
output_folder 转换后的模型路径
target_tp 转换后的TP数(需要与训练时保持一致)
target_pp 转换后的PP数 (设置为1)
```
### 无条件文本生成
```
sh run-inf.sh(这里以单节点小模型为例)
#多卡推理
mpirun -np 4 run-inf-gpus.sh
#单卡推理
mpirun -np 1 run-inf.sh
```
```
......
# Convert a DeepSpeed checkpoint to a single-GPU (TP=1, PP=1) Megatron
# checkpoint in two steps:
#   1. DeepSpeed -> DeepSpeed, reshaping to TP=1 / PP=1.
#   2. DeepSpeed -> Megatron, using the reshaped checkpoint as input.
# Must be run from the project root so the tools/ paths resolve.

# Stop at the first failure so step 2 never runs on a missing/partial step-1 output.
set -e

# Prepend the project root; only add the ':' separator when PYTHONPATH is
# already set, so no empty entry is left in the search path.
export PYTHONPATH=/home/megatron-deepspeed_dtk23.04${PYTHONPATH:+:${PYTHONPATH}}
CHECKPOINT_PATH=/home/megatron-deepspeed-dtk23.04/checkopints/gpt2-4tp/global_step1000
OUTPUT_PATH=./checkopints/megatron-1tp

# Step 1: reshape the DeepSpeed checkpoint to TP=1 / PP=1.
python tools/convert_checkpoint/deepspeed_to_deepspeed.py \
  --input_folder "$CHECKPOINT_PATH" \
  --output_folder ./conver-model-deepspeed-1tp \
  --target_tp 1 \
  --target_pp 1

# Step 2: convert the reshaped checkpoint to Megatron format.
python tools/convert_checkpoint/deepspeed_to_megatron.py \
  --input_folder ./conver-model-deepspeed-1tp/global_step1000 \
  --output_folder "$OUTPUT_PATH" \
  --target_tp 1 \
  --target_pp 1
# Convert a DeepSpeed checkpoint to Megatron format for multi-GPU inference,
# keeping TP=4 and collapsing to PP=1.
# Must be run from the project root so the tools/ path resolves.

# Abort immediately if the conversion fails.
set -e

# Prepend the project root; only add the ':' separator when PYTHONPATH is
# already set, so no empty entry is left in the search path.
export PYTHONPATH=/home/megatron-deepspeed_dtk23.04${PYTHONPATH:+:${PYTHONPATH}}
CHECKPOINT_PATH=/home/megatron-deepspeed-dtk23.04/checkopints/gpt2-oscar_16B-4tp/global_step1000
OUTPUT_PATH=./conver-4tp-model

python tools/convert_checkpoint/deepspeed_to_megatron.py \
  --input_folder "$CHECKPOINT_PATH" \
  --output_folder "$OUTPUT_PATH" \
  --target_tp 4 \
  --target_pp 1
# Tokenize the OSCAR jsonl dump with the GPT-2 BPE tokenizer and write an
# mmap-format dataset using the "my-gpt2" output prefix.
# Must be run from the project root so the tools/ path resolves.
input_jsonl=oscar-1GB.jsonl
output_prefix=my-gpt2
vocab_json=gpt2-vocab.json
merges_txt=gpt2-merges.txt
worker_count=8

python tools/preprocess_data.py \
  --input "$input_jsonl" \
  --output-prefix "$output_prefix" \
  --vocab "$vocab_json" \
  --dataset-impl mmap \
  --tokenizer-type GPT2BPETokenizer \
  --merge-file "$merges_txt" \
  --append-eod \
  --workers "$worker_count"
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Megatron arguments."""
import argparse
import collections
import os
import re
import time
import torch
import deepspeed
from megatron.enums import PositionEmbeddingType
import megatron
from megatron.logging import log_levels
def parse_args(extra_args_provider=None, defaults={},
               ignore_unknown_args=False):
    """Parse and validate all command-line arguments.

    Builds the parser from the ``_add_*_args`` helpers, parses the command
    line, derives the parallelism sizes, applies ``defaults`` and runs a
    series of consistency checks before printing the final namespace.

    Args:
        extra_args_provider: optional callable that receives the parser and
            returns it with additional arguments registered.
        defaults: mapping of argument name -> value, applied only to
            arguments whose parsed value is ``None``. It is read, never
            mutated.
        ignore_unknown_args: when true, unknown command-line options are
            tolerated (``parse_known_args``) instead of being an error.

    Returns:
        The validated ``argparse.Namespace``.

    Raises:
        AssertionError: when an argument combination is inconsistent.
        ValueError: for an invalid GLU/bias-gelu combination or a malformed
            skip range.
        ModuleNotFoundError: when ``--use-bnb-optimizer`` is given but
            ``bitsandbytes`` cannot be imported.
    """
    parser = argparse.ArgumentParser(description='Megatron-LM Arguments',
                                     allow_abbrev=False)

    # Standard arguments.
    parser = _add_network_size_args(parser)
    parser = _add_regularization_args(parser)
    parser = _add_training_args(parser)
    parser = _add_initialization_args(parser)
    parser = _add_learning_rate_args(parser)
    parser = _add_checkpointing_args(parser)
    parser = _add_mixed_precision_args(parser)
    parser = _add_distributed_args(parser)
    parser = _add_validation_args(parser)
    parser = _add_data_args(parser)
    parser = _add_autoresume_args(parser)
    parser = _add_biencoder_args(parser)
    parser = _add_vit_args(parser)
    parser = _add_logging_args(parser)
    parser = _add_zero_args(parser)
    parser = _add_memoryopt_args(parser)
    parser = _add_activation_checkpoint_args(parser)

    # Custom arguments.
    if extra_args_provider is not None:
        parser = extra_args_provider(parser)

    parser = deepspeed.add_config_arguments(parser)

    # Parse.
    if ignore_unknown_args:
        args, _ = parser.parse_known_args()
    else:
        args = parser.parse_args()

    # Distributed args.
    # NOTE(review): the environment-based lookup below is disabled, so
    # args.rank / args.world_size are presumably provided by one of the
    # argument groups registered above -- confirm.
    #args.rank = int(os.getenv('RANK', '0'))
    #args.world_size = int(os.getenv("WORLD_SIZE", '1'))
    # Tensor model parallel size.
    args.tensor_model_parallel_size = min(
        args.tensor_model_parallel_size, args.world_size)
    assert args.world_size % args.tensor_model_parallel_size == 0, 'world size'\
        ' ({}) is not divisible by tensor model parallel size ({})'.format(
            args.world_size, args.tensor_model_parallel_size)
    # Pipeline model parallel size.
    args.pipeline_model_parallel_size = min(
        args.pipeline_model_parallel_size,
        (args.world_size // args.tensor_model_parallel_size))
    # Checks.
    model_parallel_size = args.pipeline_model_parallel_size * \
        args.tensor_model_parallel_size
    # One placeholder per formatted value, so each number is reported
    # under the right label.
    assert args.world_size % model_parallel_size == 0, 'world size ({}) is ' \
        'not divisible by tensor parallel size ({}) times pipeline parallel ' \
        'size ({})'.format(args.world_size, args.tensor_model_parallel_size,
                           args.pipeline_model_parallel_size)
    args.data_parallel_size = args.world_size // model_parallel_size
    if args.rank == 0:
        print('using world size: {}, data-parallel-size: {}, '
              'tensor-model-parallel size: {}, '
              'pipeline-model-parallel size: {} '.format(
                  args.world_size, args.data_parallel_size,
                  args.tensor_model_parallel_size,
                  args.pipeline_model_parallel_size), flush=True)

    # --data-path and --train-weighted-splits-paths
    message = "Data loading Mode 1: --data-path and --split "\
        "and Mode 2: --(train|valid|test)-weighted-split-paths "\
        "are mutually exclusive i.e. cannot be set together."

    if args.data_path:
        assert args.train_weighted_split_paths is None, message
        setattr(args, "valid_weighted_split_names", None)
        setattr(args, "valid_weighted_split_weights", None)
        setattr(args, "valid_weighted_split_splits", None)

        setattr(args, "test_weighted_split_names", None)
        setattr(args, "test_weighted_split_weights", None)
        setattr(args, "test_weighted_split_splits", None)

        # args.split default value in the args is None it is set here in order
        # to check that it does not to overlap with the 2nd mode of data loading
        if args.split is None:
            args.split = "969, 30, 1"

    if args.train_weighted_split_paths or args.valid_weighted_split_paths or \
            args.test_weighted_split_paths:
        assert args.data_path is None and args.split is None, message

    # Deprecated arguments
    assert args.batch_size is None, '--batch-size argument is no longer ' \
        'valid, use --micro-batch-size instead'
    del args.batch_size
    assert args.warmup is None, '--warmup argument is no longer valid, use ' \
        '--lr-warmup-fraction instead'
    del args.warmup
    assert args.model_parallel_size is None, '--model-parallel-size is no ' \
        'longer valid, use --tensor-model-parallel-size instead'
    del args.model_parallel_size

    # Set input defaults.
    for key in defaults:
        # For default to be valid, it should not be provided in the
        # arguments that are passed to the program. We check this by
        # ensuring the arg is set to None.
        if getattr(args, key) is not None:
            if args.rank == 0:
                # Adjacent literals instead of a backslash inside the string,
                # so source indentation does not leak into the message.
                print('WARNING: overriding default arguments for {key}:{v} '
                      'with {key}:{v2}'.format(key=key, v=defaults[key],
                                               v2=getattr(args, key)),
                      flush=True)
        else:
            setattr(args, key, defaults[key])

    # Batch size.
    assert args.micro_batch_size is not None
    assert args.micro_batch_size > 0
    if args.global_batch_size is None:
        args.global_batch_size = args.micro_batch_size * args.data_parallel_size
        if args.rank == 0:
            print('setting global batch size to {}'.format(
                args.global_batch_size), flush=True)
    assert args.global_batch_size > 0
    if args.num_layers_per_virtual_pipeline_stage is not None:
        assert args.pipeline_model_parallel_size > 2, \
            'pipeline-model-parallel size should be greater than 2 with ' \
            'interleaved schedule'
        assert args.num_layers % args.num_layers_per_virtual_pipeline_stage == 0, \
            'number of layers is not divisible by number of layers per virtual ' \
            'pipeline stage'
        args.virtual_pipeline_model_parallel_size = \
            (args.num_layers // args.pipeline_model_parallel_size) // \
            args.num_layers_per_virtual_pipeline_stage
    else:
        args.virtual_pipeline_model_parallel_size = None

    # Parameters dtype.
    args.params_dtype = torch.float
    if args.fp16:
        assert not args.bf16
        args.params_dtype = torch.half
    if args.bf16:
        assert not args.fp16
        args.params_dtype = torch.bfloat16
        # bfloat16 requires gradient accumulation and all-reduce to
        # be done in fp32.
        if not args.accumulate_allreduce_grads_in_fp32:
            args.accumulate_allreduce_grads_in_fp32 = True
            if args.rank == 0:
                print('accumulate and all-reduce gradients in fp32 for '
                      'bfloat16 data type.', flush=True)

    if args.rank == 0:
        print('using {} for parameters ...'.format(args.params_dtype),
              flush=True)

    # If we do accumulation and all-reduces in fp32, we need to have
    # local DDP and we should set the use-contiguous-buffers-in-ddp.
    if args.accumulate_allreduce_grads_in_fp32:
        assert args.DDP_impl == 'local'
        args.use_contiguous_buffers_in_ddp = True

    if args.dataloader_type is None:
        args.dataloader_type = 'single'

    # Consumed tokens.
    args.consumed_train_samples = 0
    args.consumed_valid_samples = 0
    args.consumed_train_tokens = 0
    args.gigaflos_no_embeds = 0

    # Iteration-based training.
    if args.train_iters:
        # If we use iteration-based training, make sure the
        # sample-based options are off.
        assert args.train_samples is None, \
            'expected iteration-based training'
        assert args.lr_decay_samples is None, \
            'expected iteration-based learning rate decay'
        assert args.lr_warmup_samples == 0, \
            'expected iteration-based learning rate warmup'
        assert args.rampup_batch_size is None, \
            'expected no batch-size rampup for iteration-based training'
        if args.lr_warmup_fraction is not None:
            assert args.lr_warmup_iters == 0, \
                'can only specify one of lr-warmup-fraction and lr-warmup-iters'

    # Sample-based training.
    if args.train_samples:
        # If we use sample-based training, make sure the
        # iteration-based options are off.
        assert args.train_iters is None, \
            'expected sample-based training'
        assert args.lr_decay_iters is None, \
            'expected sample-based learning rate decay'
        assert args.lr_warmup_iters == 0, \
            'expected sample-based learnig rate warmup'
        if args.lr_warmup_fraction is not None:
            assert args.lr_warmup_samples == 0, \
                'can only specify one of lr-warmup-fraction ' \
                'and lr-warmup-samples'

    # Check required arguments.
    required_args = ['num_layers', 'hidden_size', 'num_attention_heads']
    for req_arg in required_args:
        _check_arg_is_not_none(args, req_arg)

    # Checks.
    if args.ffn_hidden_size is None:
        args.ffn_hidden_size = 4 * args.hidden_size

    if args.kv_channels is None:
        assert args.hidden_size % args.num_attention_heads == 0
        args.kv_channels = args.hidden_size // args.num_attention_heads

    # seq_length and encoder_seq_length are aliases: exactly one may be
    # given and the other is filled in from it.
    if args.seq_length is not None:
        assert args.encoder_seq_length is None
        args.encoder_seq_length = args.seq_length
    else:
        assert args.encoder_seq_length is not None
        args.seq_length = args.encoder_seq_length

    if args.position_embedding_type == PositionEmbeddingType.absolute or args.position_embedding_type == PositionEmbeddingType.alibi:
        assert args.max_position_embeddings is not None
        if args.seq_length is not None:
            assert args.max_position_embeddings >= args.seq_length
        if args.decoder_seq_length is not None:
            assert args.max_position_embeddings >= args.decoder_seq_length
    else:
        assert args.max_position_embeddings is None

    if args.lr is not None:
        assert args.min_lr <= args.lr
    if args.save is not None:
        assert args.save_interval is not None
    # Mixed precision checks.
    if args.fp16_lm_cross_entropy:
        assert args.fp16, 'lm cross entropy in fp16 only support in fp16 mode.'
    if args.fp32_residual_connection:
        assert args.fp16 or args.bf16, \
            'residual connection in fp32 only supported when using fp16 or bf16.'
    # Activation checkpointing.
    if args.distribute_checkpointed_activations:
        assert args.checkpoint_activations, \
            'for distribute-checkpointed-activations to work you '\
            'need to enable checkpoint-activations'

    args.curriculum_learning = False

    # Activation function
    if args.glu_activation is not None and args.bias_gelu_fusion:
        raise ValueError("if glu-activation is used, please set --no-bias-gelu-fusion")

    # Skip train iterations
    if args.skip_train_iteration_range is not None:
        # Turn each "start-end" string into [start, end] and sort by start
        # so overlapping ranges can be merged in a single pass.
        args.skip_train_iteration_range = [
            list(map(int, range_.split("-"))) for range_ in args.skip_train_iteration_range
        ]
        args.skip_train_iteration_range.sort()
        skip_train_iteration_range = collections.deque()
        for range_ in args.skip_train_iteration_range:
            if len(range_) == 2:
                start, end = range_
                assert end >= start, \
                    "end of skip range cannot be smaller than start of skip range"
                # merge overlapping intervals (e.g. 1-5 2-6 -> 1-6)
                if not skip_train_iteration_range:
                    skip_train_iteration_range.append([start, end])
                elif skip_train_iteration_range[-1][1] >= start:
                    skip_train_iteration_range[-1][1] = max(end, skip_train_iteration_range[-1][1])
                else:
                    skip_train_iteration_range.append([start, end])
            else:
                raise ValueError(
                    "skip train iterations should be specified as two numbers, i.e. start-end"
                )
        args.skip_train_iteration_range = skip_train_iteration_range

    if args.use_bnb_optimizer:
        # Imported only to verify the package is available.
        try:
            import bitsandbytes as bnb
        except ModuleNotFoundError:
            raise ModuleNotFoundError("Please install bitsandbytes from https://github.com/facebookresearch/bitsandbytes.")

    _print_args(args)
    return args
def _print_args(args):
"""Print arguments."""
if args.rank == 0:
print('------------------------ arguments ------------------------',
flush=True)
str_list = []
for arg in vars(args):
dots = '.' * (48 - len(arg))
str_list.append(' {} {} {}'.format(arg, dots, getattr(args, arg)))
if args.log_path is not None:
with open(os.path.join(args.log_path,f'args_{time.strftime("%Y-%m-%dT%H:%M:%S")}.txt'), 'w') as f:
for arg in sorted(str_list, key=lambda x: x.lower()):
f.write(arg+"\n")
print(arg, flush=True)
else:
for arg in sorted(str_list, key=lambda x: x.lower()):
print(arg, flush=True)
print('-------------------- end of arguments ---------------------',
flush=True)
def _check_arg_is_not_none(args, arg):
assert getattr(args, arg) is not None, '{} argument is None'.format(arg)
def _add_network_size_args(parser):
    """Register the 'network size' argument group on ``parser``.

    Covers model-shape options (layers, hidden sizes, heads, position
    embeddings, vocab padding) plus a few behavior switches and log-level
    options.

    Args:
        parser: the ``argparse.ArgumentParser`` to extend.

    Returns:
        The same parser, for chaining.
    """

    def _parse_bool(text):
        # ``type=bool`` would map every non-empty string (including "False")
        # to True; parse the usual spellings explicitly instead.
        lowered = text.strip().lower()
        if lowered in ('true', 't', 'yes', 'y', '1', 'on'):
            return True
        if lowered in ('false', 'f', 'no', 'n', '0', 'off', ''):
            return False
        raise argparse.ArgumentTypeError(
            'expected a boolean value, got {!r}'.format(text))

    group = parser.add_argument_group(title='network size')

    group.add_argument('--num-layers', type=int, default=None,
                       help='Number of transformer layers.')
    group.add_argument('--hidden-size', type=int, default=None,
                       help='Tansformer hidden size.')
    group.add_argument('--ffn-hidden-size', type=int, default=None,
                       help='Transformer Feed-Forward Network hidden size. '
                       'This is set to 4*hidden-size if not provided')
    group.add_argument('--num-attention-heads', type=int, default=None,
                       help='Number of transformer attention heads.')
    group.add_argument('--kv-channels', type=int, default=None,
                       help='Projection weights dimension in multi-head '
                       'attention. This is set to '
                       ' args.hidden_size // args.num_attention_heads '
                       'if not provided.')
    group.add_argument('--max-position-embeddings', type=int, default=None,
                       help='Maximum number of position embeddings to use. '
                       'This is the size of position embedding.')
    group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
                       help='Pad the vocab size to be divisible by this value. '
                       'This is added for computational efficieny reasons.')
    group.add_argument('--pad-vocab-size-to', type=int, default=None,
                       help='Pad the vocab size to this value. '
                       'This value must be greater than the initial size of the tokenizer'
                       ', needs to be divisible by TP size and `make-vocab-size-divisible-by`.')
    group.add_argument('--layernorm-epsilon', type=float, default=1e-5,
                       help='Layer norm epsilon.')
    group.add_argument('--sync-tp-duplicated-parameters', action='store_true',
                       help='Force syncing duplicated params across TP ranks in forward. '
                       'This is a workaround for an unresolved bug leading to TP ranks '
                       'getting out of sync with each other.')
    group.add_argument('--apply-residual-connection-post-layernorm',
                       action='store_true',
                       help='If set, use original BERT residula connection '
                       'ordering.')
    group.add_argument('--embed-layernorm', action='store_true',
                       help='use layernorm for embedding')
    group.add_argument('--openai-gelu', action='store_true',
                       help='Use OpenAIs GeLU implementation. This option '
                       'should not be used unless for backward compatibility '
                       'reasons.')
    group.add_argument('--onnx-safe', type=_parse_bool, required=False,
                       help='Use workarounds for known problems with '
                       'Torch ONNX exporter')
    # The flag disables the head, so the stored attribute is the positive
    # ``bert_binary_head`` (store_false).
    group.add_argument('--bert-no-binary-head', action='store_false',
                       help='Disable BERT binary head.',
                       dest='bert_binary_head')
    # The command-line string is looked up by name in the enum.
    group.add_argument('--position-embedding-type', type=lambda x: PositionEmbeddingType[x],
                       choices=list(PositionEmbeddingType),
                       default=PositionEmbeddingType.absolute,
                       help='Define position embedding type ("absolute" | "rotary" | "alibi"). "absolute" by default.'
                       )
    group.add_argument('--glu-activation', type=str,
                       choices=megatron.model.glu_activations.GLU_ACTIVATIONS.keys(),
                       help='GLU activations to use.'
                       )
    group.add_argument('--kill-switch-path', type=str,
                       help='path to look for a kill switch, which if found will automatically exit the program'
                       )
    group.add_argument('--log-level', type=str, choices=list(log_levels.keys()),
                       help="Logger log level to use on the main process. Possible choices are the log levels as strings: 'debug', "
                       "'info', 'warning', 'error' and 'critical', plus a 'passive' level which doesn't set anything and lets the "
                       "application set the level."
                       )
    group.add_argument('--log-level-replica', type=str, choices=list(log_levels.keys()),
                       help="Logger log level to use on replicas. Same choices as ``log_level``"
                       )

    return parser
def _add_logging_args(parser):
group = parser.add_argument_group(title='logging')
group.add_argument('--log-params-norm', action='store_true',
help='If set, calculate and log parameters norm.')
group.add_argument('--log-num-zeros-in-grad', action='store_true',
help='If set, calculate and log the number of zeros in gradient.')
group.add_argument('--tensorboard-log-interval', type=int, default=1,
help='Report to tensorboard interval.')
group.add_argument('--tensorboard-queue-size', type=int, default=1000,
help='Size of the tensorboard queue for pending events '
'and summaries before one of the ‘add’ calls forces a '
'flush to disk.')
group.add_argument('--log-timers-to-tensorboard', action='store_true',
help='If set, write timers to tensorboard.')
group.add_argument('--log-batch-size-to-tensorboard', action='store_true',
help='If set, write batch-size to tensorboard.')
group.add_argument('--no-log-learnig-rate-to-tensorboard',
action='store_false',
help='Disable learning rate logging to tensorboard.',
dest='log_learning_rate_to_tensorboard')
group.add_argument('--no-log-loss-scale-to-tensorboard',
action='store_false',
help='Disable loss-scale logging to tensorboard.',
dest='log_loss_scale_to_tensorboard')
group.add_argument('--log-validation-ppl-to-tensorboard',
action='store_true',
help='If set, write validation perplexity to '
'tensorboard.')
return parser
def _add_regularization_args(parser):
group = parser.add_argument_group(title='regularization')
group.add_argument('--attention-dropout', type=float, default=0.1,
help='Post attention dropout probability.')
group.add_argument('--hidden-dropout', type=float, default=0.1,
help='Dropout probability for hidden state transformer.')
group.add_argument('--weight-decay', type=float, default=0.01,
help='Weight decay coefficient for L2 regularization.')
group.add_argument('--clip-grad', type=float, default=1.0,
help='Gradient clipping based on global L2 norm.')
group.add_argument('--adam-beta1', type=float, default=0.9,
help='First coefficient for computing running averages '
'of gradient and its square')
group.add_argument('--adam-beta2', type=float, default=0.999,
help='Second coefficient for computing running averages '
'of gradient and its square')
group.add_argument('--adam-eps', type=float, default=1e-08,
help='Term added to the denominator to improve'
'numerical stability')
group.add_argument('--sgd-momentum', type=float, default=0.9,
help='Momentum factor for sgd')
return parser
def _add_training_args(parser):
group = parser.add_argument_group(title='training')
group.add_argument('--micro-batch-size', type=int, default=None,
help='Batch size per model instance (local batch size). '
'Global batch size is local batch size times data '
'parallel size times number of micro batches.')
group.add_argument('--batch-size', type=int, default=None,
help='Old batch size parameter, do not use. '
'Use --micro-batch-size instead')
group.add_argument('--global-batch-size', type=int, default=None,
help='Training batch size. If set, it should be a '
'multiple of micro-batch-size times data-parallel-size. '
'If this value is None, then '
'use micro-batch-size * data-parallel-size as the '
'global batch size. This choice will result in 1 for '
'number of micro-batches.')
group.add_argument('--rampup-batch-size', nargs='*', default=None,
help='Batch size ramp up with the following values:'
' --rampup-batch-size <start batch size> '
' <batch size increment> '
' <ramp-up samples> '
'For example: '
' --rampup-batch-size 16 8 300000 '
' --global-batch-size 1024 '
'will start with global batch size 16 and over '
' (1024 - 16) / 8 = 126 intervals will increase '
'the batch size linearly to 1024. In each interval '
'we will use approximately 300000 / 126 = 2380 samples.')
group.add_argument('--checkpoint-activations', action='store_true',
help='Checkpoint activation to allow for training '
'with larger models, sequences, and batch sizes.')
group.add_argument('--distribute-checkpointed-activations',
action='store_true',
help='If set, distribute checkpointed activations '
'across model parallel group.')
group.add_argument('--checkpoint-num-layers', type=int, default=1,
help='chunk size (number of layers) for checkpointing.')
group.add_argument('--train-iters', type=int, default=None,
help='Total number of iterations to train over all '
'training runs. Note that either train-iters or '
'train-samples should be provided.')
group.add_argument('--train-samples', type=int, default=None,
help='Total number of samples to train over all '
'training runs. Note that either train-iters or '
'train-samples should be provided.')
group.add_argument('--train-tokens', type=int, default=None,
help='Total number of tokens to train over all '
'training runs.')
group.add_argument('--log-interval', type=int, default=100,
help='Report loss and timing interval.')
group.add_argument('--exit-interval', type=int, default=None,
help='Exit the program after the iteration is divisible '
'by this value.')
group.add_argument('--exit-duration-in-mins', type=int, default=None,
help='Exit the program after this many minutes.')
group.add_argument('--tensorboard-dir', type=str, default=None,
help='Write TensorBoard logs to this directory.')
group.add_argument('--no-masked-softmax-fusion',
action='store_false',
help='Disable fusion of query_key_value scaling, '
'masking, and softmax.',
dest='masked_softmax_fusion')
group.add_argument('--no-bias-gelu-fusion', action='store_false',
help='Disable bias and gelu fusion.',
dest='bias_gelu_fusion')
group.add_argument('--no-bias-dropout-fusion', action='store_false',
help='Disable bias and dropout fusion.',
dest='bias_dropout_fusion')
group.add_argument('--optimizer', type=str, default='adam',
choices=['adam', 'sgd'],
help='Optimizer function')
group.add_argument('--use-bnb-optimizer', action='store_true',
help='Use bitsandbytes optimizer for efficient training,'
'please refer https://github.com/facebookresearch/bitsandbytes.',
dest='use_bnb_optimizer')
group.add_argument('--dataloader-type', type=str, default=None,
choices=['single', 'cyclic'],
help='Single pass vs multiple pass data loader')
group.add_argument('--cpu-optimizer', action='store_true',
help='Run optimizer on CPU')
group.add_argument('--cpu_torch_adam', action='store_true',
help='Use Torch Adam as optimizer on CPU.')
group.add_argument('--codecarbon-dir', type=str, default=None,
help='Write CodeCarbon logs to this directory.')
group.add_argument('--eval-only', type=bool, required=False,
help='If set to True, no train step will be performed.'
'and only the evaluation on the `valid` and `test` sets '
'will be performed' )
group.add_argument('--skip-train-iteration-range', type=str, nargs='+', default=None,
help='Iteration ranges to skip. The values are one or more dash-separated ranges. e.g., 101-200 251-300.')
group.add_argument('--inference', action='store_true',
help='Very basic inference mode: not allocating optim/lr - requires ZERO_STAGE=0')
group.add_argument('--abort-on-unmet-fused-kernel-constraints', action='store_true',
help="If set to True, the program will abort if the constraints for loading a fused kernel aren't met")
group.add_argument('--pp-partition-method', type=str, default=None,
help="Use to override the pipeline stages partitioning method. e.g., 'type:transformer|embedding'")
return parser
def _add_initialization_args(parser):
group = parser.add_argument_group(title='initialization')
group.add_argument('--seed', type=int, default=1234,
help='Random seed used for python, numpy, '
'pytorch, and cuda.')
group.add_argument('--init-method-std', type=float, default=0.02,
help='Standard deviation of the zero mean normal '
'distribution used for weight initialization.')
group.add_argument('--init-method-xavier-uniform', action='store_true',
help='Enable Xavier uniform parameter initialization')
return parser
def _add_learning_rate_args(parser):
group = parser.add_argument_group(title='learning rate')
group.add_argument('--lr', type=float, default=None,
help='Initial learning rate. Depending on decay style '
'and initial warmup, the learing rate at each '
'iteration would be different.')
group.add_argument('--lr-decay-style', type=str, default='linear',
choices=['constant', 'linear', 'cosine'],
help='Learning rate decay function.')
group.add_argument('--lr-decay-iters', type=int, default=None,
help='number of iterations to decay learning rate over,'
' If None defaults to `--train-iters`')
group.add_argument('--lr-decay-samples', type=int, default=None,
help='number of samples to decay learning rate over,'
' If None defaults to `--train-samples`')
group.add_argument('--lr-decay-tokens', type=int, default=None,
help='number of tokens to decay learning rate over,'
' If not None will override iter/sample-based decay')
group.add_argument('--lr-warmup-fraction', type=float, default=None,
help='fraction of lr-warmup-(iters/samples) to use '
'for warmup (as a float)')
group.add_argument('--lr-warmup-iters', type=int, default=0,
help='number of iterations to linearly warmup '
'learning rate over.')
group.add_argument('--lr-warmup-samples', type=int, default=0,
help='number of samples to linearly warmup '
'learning rate over.')
group.add_argument('--warmup', type=int, default=None,
help='Old lr warmup argument, do not use. Use one of the'
'--lr-warmup-* arguments above')
group.add_argument('--min-lr', type=float, default=0.0,
help='Minumum value for learning rate. The scheduler'
'clip values below this threshold.')
group.add_argument('--override-lr-scheduler', action='store_true',
help='Reset the values of the scheduler (learning rate,'
'warmup iterations, minimum learning rate, maximum '
'number of iterations, and decay style from input '
'arguments and ignore values from checkpoints. Note'
'that all the above values will be reset.')
group.add_argument('--use-checkpoint-lr-scheduler', action='store_true',
help='Use checkpoint to set the values of the scheduler '
'(learning rate, warmup iterations, minimum learning '
'rate, maximum number of iterations, and decay style '
'from checkpoint and ignore input arguments.')
group.add_argument('--universal-checkpoint', action='store_true',
help='Loading a universal format checkpoint.')
return parser
def _add_checkpointing_args(parser):
group = parser.add_argument_group(title='checkpointing')
group.add_argument('--save', type=str, default=None,
help='Output directory to save checkpoints to.')
group.add_argument('--save-interval', type=int, default=None,
help='Number of iterations between checkpoint saves.')
group.add_argument('--no-save-optim', action='store_true', default=None,
help='Do not save current optimizer.')
group.add_argument('--no-save-rng', action='store_true', default=None,
help='Do not save current rng state.')
group.add_argument('--load', type=str, default=None,
help='Directory containing a model checkpoint.')
group.add_argument('--no-load-optim', action='store_true', default=None,
help='Do not load optimizer when loading checkpoint.')
group.add_argument('--no-load-rng', action='store_true', default=None,
help='Do not load rng state when loading checkpoint.')
group.add_argument('--finetune', action='store_true',
help='Load model for finetuning. Do not load optimizer '
'or rng state from checkpoint and set iteration to 0. '
'Assumed when loading a release checkpoint.')
return parser
def _add_mixed_precision_args(parser):
group = parser.add_argument_group(title='mixed precision')
group.add_argument('--fp16', action='store_true',
help='Run model in fp16 mode.')
group.add_argument('--bf16', action='store_true',
help='Run model in bfloat16 mode.')
group.add_argument('--loss-scale', type=float, default=None,
help='Static loss scaling, positive power of 2 '
'values can improve fp16 convergence. If None, dynamic'
'loss scaling is used.')
group.add_argument('--initial-loss-scale', type=float, default=2**32,
help='Initial loss-scale for dynamic loss scaling.')
group.add_argument('--min-loss-scale', type=float, default=1.0,
help='Minimum loss scale for dynamic loss scale.')
group.add_argument('--loss-scale-window', type=float, default=1000,
help='Window over which to raise/lower dynamic scale.')
group.add_argument('--hysteresis', type=int, default=2,
help='hysteresis for dynamic loss scaling')
group.add_argument('--fp32-residual-connection', action='store_true',
help='Move residual connections to fp32.')
group.add_argument('--no-query-key-layer-scaling', action='store_false',
help='Do not scale Q * K^T by 1 / layer-number.',
dest='apply_query_key_layer_scaling')
group.add_argument('--attention-softmax-in-fp32', action='store_true',
help='Run attention masking and softmax in fp32. '
'This flag is ignored unless '
'--no-query-key-layer-scaling is specified.')
group.add_argument('--accumulate-allreduce-grads-in-fp32',
action='store_true',
help='Gradient accumulation and all-reduce in fp32.')
group.add_argument('--fp16-lm-cross-entropy', action='store_true',
help='Move the cross entropy unreduced loss calculation'
'for lm head to fp16.')
return parser
def _add_distributed_args(parser):
group = parser.add_argument_group(title='distributed')
group.add_argument('--tensor-model-parallel-size', type=int, default=1,
help='Degree of tensor model parallelism.')
group.add_argument('--pipeline-model-parallel-size', type=int, default=1,
help='Degree of pipeline model parallelism.')
group.add_argument('--model-parallel-size', type=int, default=None,
help='Old model parallel argument, do not use. Use '
'--tensor-model-parallel-size instead.')
group.add_argument('--num-layers-per-virtual-pipeline-stage', type=int, default=None,
help='Number of layers per virtual pipeline stage')
group.add_argument('--distributed-backend', default='nccl',
choices=['nccl', 'gloo'],
help='Which backend to use for distributed training.')
group.add_argument('--DDP-impl', default='local',
choices=['local', 'torch'],
help='which DistributedDataParallel implementation '
'to use.')
group.add_argument('--use-contiguous-buffers-in-ddp', action='store_true',
help='If set, use contiguous buffer in DDP. Note that '
'this option only works woth local DDP.' )
group.add_argument('--no-scatter-gather-tensors-in-pipeline', action='store_false',
help='Use scatter/gather to optimize communication of tensors in pipeline',
dest='scatter_gather_tensors_in_pipeline')
group.add_argument('--local_rank', type=int, default=None,
help='local rank passed from distributed launcher.')
group.add_argument('--lazy-mpu-init', type=bool, required=False,
help='If set to True, initialize_megatron() '
'skips DDP initialization and returns function to '
'complete it instead.Also turns on '
'--use-cpu-initialization flag. This is for '
'external DDP manager.' )
group.add_argument('--use-cpu-initialization', action='store_true',
default=None, help='If set, affine parallel weights '
'initialization uses CPU' )
group.add_argument('--rank', default=-1, type=int, help='node rank for distributed training')
group.add_argument('--dist_url', type=str, default="env://127.0.0.1:23456")
group.add_argument('--world_size', type=int, default=-1, help='number of nodes for distributed training')
group.add_argument('--dist_backend', default='nccl', type=str, help='distributed backend')
return parser
def _add_validation_args(parser):
group = parser.add_argument_group(title='validation')
group.add_argument('--eval-iters', type=int, default=100,
help='Number of iterations to run for evaluation'
'validation/test for.')
group.add_argument('--eval-interval', type=int, default=1000,
help='Interval between running evaluation on '
'validation set.')
return parser
def _add_data_args(parser):
group = parser.add_argument_group(title='data and dataloader')
# option 1 for data loading (mutually exclusive with option2)
group.add_argument('--data-path', nargs='*', default=None,
help='Path to the training dataset. Accepted format:'
'1) a single data path, 2) multiple datasets in the'
'form: dataset1-weight dataset1-path dataset2-weight '
'dataset2-path ...')
group.add_argument('--split', type=str, default=None,
help='Comma-separated list of proportions for training,'
' validation, and test split. For example the split '
'`90,5,5` will use 90%% of data for training, 5%% for '
'validation and 5%% for test.')
# option 2 for data loading (mutually exclusive with option1)
# helper class to parse the --xxx-weighted-split-paths
# note here two args are set: extra valid dataset paths and names
class parse_data_paths(argparse.Action):
def __call__(self, parser, args, values, option_string=None):
if option_string == "--train-weighted-split-paths":
assert len(values) == 1, 'Only 1 dataset group is allowed to'
'be passed for the argument --train-weighted-split-paths'
# make sure string given in the correct format
err_message = 'Each data group should be input on the following format'
'"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2"'
'where START < END'
for v in values:
# each prefix consists several datasets separated by commas
prefix = ":".join(v.split(":")[1:]) # remove GIVEN_NAME
datasets = prefix.split(",")
# check if each dataset is formatted like `WEIGHT START:END PATH`
for d in datasets:
assert len(d.split()) == 3, err_message
start, end = d.split()[1].split(":")
assert float(start) < float(end), err_message
names = [v.split(":")[0] for v in values]
prefixes = [":".join(v.split(":")[1:]).strip() for v in values]
weights = [[d.split()[0] for d in p.split(",")] for p in prefixes]
splits = [[d.split()[1] for d in p.split(",")] for p in prefixes]
paths = [[d.split()[2] for d in p.split(",")] for p in prefixes]
# # to keep consistency with Option 1 of data loading (through --data-path)
# # paths will contain strings on the following form
# # "WEIGHTS1 PATH1 WEIGHTS2 PATH2 WEIGHTS3 PATH3" for each dataset group
# # while data will be parsed in additional arguments below
# paths_option1_style = []
# for p, w in zip(paths, weights):
# paths_option1_style.append(" ".join([f"{w_i} {p_i}" for p_i, w_i in zip(p,w)]))
# setattr(args, self.dest, paths_option1_style)
setattr(args, self.dest, paths)
setattr(args, self.dest.replace("paths", "weights"), weights)
setattr(args, self.dest.replace("paths", "splits"), splits)
setattr(args, self.dest.replace("paths","names"), names)
group.add_argument('--train-weighted-split-paths', nargs='*', default=None,
help='Weights, splits and paths to groups of datasets'
'Accepted format: ONE dataset groups could be'
'submitted in the following form between double quotes'
'"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2"'
'e.g.: "NAME_ABC: 0.6 0:0.6 A, 0.3 0:1 B, 0.1 0:1 C" '
'WEIGHT is used to up and down sample each dataset A,B,C in the group'
'START:END indicates the split portion of the dataset',
action=parse_data_paths)
group.add_argument('--valid-weighted-split-paths', nargs='*', default=None,
help='Weights, splits and paths to groups of datasets'
'Accepted format: one or many dataset groups could be'
'submitted in the following form each between double quotes'
'"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2"'
'e.g.: "NAME_ABC: 0.6 0.6:0.8 A, 0.3 0:1 B, 0.1 0:1 C" '
'"NAME_CDE: 0.6 0.6:0.8 C, 0.3 0:1 D, 0.1 0:1 E" '
'validation will be run on each of those groups independently',
action=parse_data_paths)
group.add_argument('--test-weighted-split-paths', nargs='*', default=None,
help='Weights, splits and paths to groups of datasets'
'Accepted format: one or many dataset groups could be'
'submitted in the following form each between double quotes'
'"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2"'
'e.g.: "NAME_ABC: 0.6 0.6:0.8 A, 0.3 0:1 B, 0.1 0:1 C" '
'"NAME_CDE: 0.6 0.6:0.8 C, 0.3 0:1 D, 0.1 0:1 E" '
'test will be run on each of those groups independently',
action=parse_data_paths)
class parse_data_paths_path(argparse.Action):
def __call__(self, parser, args, values, option_string=None):
expected_option_strings = ["--train-weighted-split-paths-path", "--valid-weighted-split-paths-path", "--test-weighted-split-paths-path"]
assert option_string in expected_option_strings, f"Expected {option_string} to be in {expected_option_strings}"
with open(values, "r") as fi:
lines = fi.readlines()
assert len(lines) == 1, f"Got multiple lines {len(lines)} instead of 1 expected"
assert lines[0][-2:] == "\"\n" and lines[0][0] == "\"", f"Invalid input format, got {lines}"
values = lines[0][1:-2].split("\" \"")
weighted_split_paths_dest = re.sub(r"_path$", "", self.dest)
weighted_split_paths_option = re.sub(r"-path$", "", self.option_strings[0])
setattr(args, weighted_split_paths_dest, values)
parse_data_paths(option_strings=[weighted_split_paths_option], dest=weighted_split_paths_dest)(parser, args, values, option_string=weighted_split_paths_option)
group.add_argument('--train-weighted-split-paths-path', type=str, action=parse_data_paths_path ,default=None)
group.add_argument('--valid-weighted-split-paths-path', type=str, action=parse_data_paths_path, default=None)
group.add_argument('--test-weighted-split-paths-path', type=str, action=parse_data_paths_path, default=None)
group.add_argument('--log-path', type=str, default=None,
help='Path to the save arguments file.')
group.add_argument('--vocab-file', type=str, default=None,
help='Path to the vocab file.')
group.add_argument('--merge-file', type=str, default=None,
help='Path to the BPE merge file.')
group.add_argument('--vocab-extra-ids', type=int, default=0,
help='Number of additional vocabulary tokens. '
'They are used for span masking in the T5 model')
group.add_argument('--seq-length', type=int, default=None,
help='Maximum sequence length to process.')
group.add_argument('--encoder-seq-length', type=int, default=None,
help='Maximum encoder sequence length to process.'
'This should be exclusive of --seq-length')
group.add_argument('--decoder-seq-length', type=int, default=None,
help="Maximum decoder sequence length to process.")
group.add_argument('--retriever-seq-length', type=int, default=256,
help='Maximum sequence length for the biencoder model '
' for retriever')
group.add_argument('--sample-rate', type=float, default=1.0,
help='sample rate for training data. Supposed to be 0 '
' < sample_rate < 1')
group.add_argument('--mask-prob', type=float, default=0.15,
help='Probability of replacing a token with mask.')
group.add_argument('--short-seq-prob', type=float, default=0.1,
help='Probability of producing a short sequence.')
group.add_argument('--mmap-warmup', action='store_true',
help='Warm up mmap files.')
group.add_argument('--num-workers', type=int, default=2,
help="Dataloader number of workers.")
group.add_argument('--valid-num-workers', type=int, default=2,
help="Dataloader number of workers for validation.")
group.add_argument('--tokenizer-type', type=str,
default=None,
choices=['BertWordPieceLowerCase',
'BertWordPieceCase',
'GPT2BPETokenizer',
'PretrainedFromHF'],
help='What type of tokenizer to use.')
group.add_argument("--tokenizer-name-or-path", type=str, default=None,
help="Name or path of the huggingface tokenizer.")
group.add_argument('--data-impl', type=str, default='infer',
choices=['lazy', 'cached', 'mmap', 'infer'],
help='Implementation of indexed datasets.')
group.add_argument('--reset-position-ids', action='store_true',
help='Reset posistion ids after end-of-document token.')
group.add_argument('--reset-attention-mask', action='store_true',
help='Reset self attention maske after '
'end-of-document token. Attention between tokens from different documents is null.')
group.add_argument('--eod-mask-loss', action='store_true',
help='Mask loss for the end of document tokens.')
group.add_argument('--loss-on-targets-only', action='store_true',
help='Mask loss on input sequence.')
group.add_argument('--reweight-loss-based-on-position-frequency', action="store_true",
help='Some objectives require us to sample loss_mask. This might introduce bias towards '
'specific positions. This option tries to un-bias the loss by reweighting loss on specific '
'positions based on how frequently we train on that position.'
'This is mostly used for prefix_lm training')
group.add_argument("--noise-density", type=float, default=None, help="Span corruption noise density")
group.add_argument("--mean-noise-span-length", type=int, default=None, help="Span corruption mean noise span length")
return parser
def _add_autoresume_args(parser):
group = parser.add_argument_group(title='autoresume')
group.add_argument('--adlr-autoresume', action='store_true',
help='Enable autoresume on adlr cluster.')
group.add_argument('--adlr-autoresume-interval', type=int, default=1000,
help='Intervals over which check for autoresume'
'termination signal')
return parser
def _add_biencoder_args(parser):
group = parser.add_argument_group(title='biencoder')
# network size
group.add_argument('--ict-head-size', type=int, default=None,
help='Size of block embeddings to be used in ICT and '
'REALM (paper default: 128)')
group.add_argument('--biencoder-projection-dim', type=int, default=0,
help='Size of projection head used in biencoder (paper'
' default: 128)')
group.add_argument('--biencoder-shared-query-context-model', action='store_true',
help='Whether to share the parameters of the query '
'and context models or not')
# checkpointing
group.add_argument('--ict-load', type=str, default=None,
help='Directory containing an ICTBertModel checkpoint')
group.add_argument('--bert-load', type=str, default=None,
help='Directory containing an BertModel checkpoint '
'(needed to start ICT and REALM)')
# data
group.add_argument('--titles-data-path', type=str, default=None,
help='Path to titles dataset used for ICT')
group.add_argument('--query-in-block-prob', type=float, default=0.1,
help='Probability of keeping query in block for '
'ICT dataset')
group.add_argument('--use-one-sent-docs', action='store_true',
help='Whether to use one sentence documents in ICT')
group.add_argument('--evidence-data-path', type=str, default=None,
help='Path to Wikipedia Evidence frm DPR paper')
# training
group.add_argument('--retriever-report-topk-accuracies', nargs='+', type=int,
default=[], help="Which top-k accuracies to report "
"(e.g. '1 5 20')")
group.add_argument('--retriever-score-scaling', action='store_true',
help='Whether to scale retriever scores by inverse '
'square root of hidden size')
# faiss index
group.add_argument('--block-data-path', type=str, default=None,
help='Where to save/load BlockData to/from')
group.add_argument('--embedding-path', type=str, default=None,
help='Where to save/load Open-Retrieval Embedding'
' data to/from')
# indexer
group.add_argument('--indexer-batch-size', type=int, default=128,
help='How large of batches to use when doing indexing '
'jobs')
group.add_argument('--indexer-log-interval', type=int, default=1000,
help='After how many batches should the indexer '
'report progress')
return parser
def _add_vit_args(parser):
group = parser.add_argument_group(title="vit")
group.add_argument('--num-classes', type=int, default=1000,
help='num of classes in vision classificaiton task')
group.add_argument('--img-dim', type=int, default=224,
help='Image size for vision classification task')
group.add_argument('--num-channels', type=int, default=3,
help='Number of channels in input image data')
group.add_argument('--patch-dim', type=int, default=16,
help='patch dimension used in vit')
return parser
def _add_zero_args(parser):
"""Text generate arguments."""
group = parser.add_argument_group('ZeRO configurations', 'configurations')
group.add_argument("--zero-stage", type=int, default=1.0)
group.add_argument('--zero-reduce-scatter', action='store_true',
help='Use reduce scatter if specified')
group.add_argument('--zero-contigious-gradients', action='store_true',
help='Use contigious memory optimizaiton if specified')
group.add_argument("--zero-reduce-bucket-size", type=int, default=0.0)
group.add_argument("--zero-allgather-bucket-size", type=int, default=0.0)
group.add_argument('--remote-device', type=str, default='none', choices=['none', 'cpu', 'nvme'],
help='Remote device for ZeRO-3 initialized parameters.')
group.add_argument('--use-pin-memory', action='store_true',
help='Use pinned CPU memory for ZeRO-3 initialized model parameters.')
return parser
def _add_memoryopt_args(parser):
"""Memory optimization arguments."""
group = parser.add_argument_group('Memory optimizations', 'configurations')
group.add_argument("--scattered-embeddings", action='store_true',
help='Save memory by scattering embedding activations. '
'Introduces dropout differences across MP configurations.')
group.add_argument("--split-transformers", action='store_true',
help='Save memory by splitting transformer layers into two parts, '
'allowing for more frequent activation checkpoint savings.')
group.add_argument("--memory-centric-tiled-linear", action="store_true",
help='Save memory by tiling with deepspeed.zero.TiledLinear.')
group.add_argument("--tile-factor", type=int, default=1,
help='Make all linear layers the same size of [hidden/tile_factor, hidden/tile_factor]. '
'Must be enabled with --memory-centric-tiled-linear. '
'Example A: if tile_factor=1, the qkv layer [hidden, 3* hidden] would be converted into [1,3] tiles of size [hidden,hidden]. '
'Example B: if tile_factor=2, the intermediate layer [4*hidden, hidden] will be converted into [8, 2] tiles of size [hidden/2, hidden/2]. '
'Default is 1.')
return parser
def _add_activation_checkpoint_args(parser):
group = parser.add_argument_group('Activation Checkpointing',
'Checkpointing Configurations')
group.add_argument('--deepspeed-activation-checkpointing', action='store_true',
help='uses activation checkpointing from deepspeed')
group.add_argument('--partition-activations', action='store_true',
help='partition Activations across GPUs before checkpointing.')
group.add_argument('--contigious-checkpointing', action='store_true',
help='Contigious memory checkpointing for activatoins.')
group.add_argument('--checkpoint-in-cpu', action='store_true',
help='Move the activation checkpoints to CPU.')
group.add_argument('--synchronize-each-layer', action='store_true',
help='does a synchronize at the beginning and end of each checkpointed layer.')
group.add_argument('--profile-backward', action='store_true',
help='Enables backward pass profiling for checkpointed layers.')
return parser
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Megatron arguments."""
import argparse
import collections
import os
import re
import time
import torch
import deepspeed
from megatron.enums import PositionEmbeddingType
import megatron
from megatron.logging import log_levels
def parse_args(extra_args_provider=None, defaults={},
               ignore_unknown_args=False):
    """Parse, validate and post-process all command-line arguments.

    Args:
        extra_args_provider: optional callable that receives the parser,
            registers additional arguments on it and returns the parser.
        defaults: mapping of argument name to fallback value. A fallback is
            applied only when the parsed value of that argument is None.
            The mapping is only read here, never modified.
        ignore_unknown_args: when true, unrecognised command-line options
            are ignored instead of being reported as an error.

    Returns:
        The argparse namespace, extended with derived attributes such as
        ``data_parallel_size``, ``params_dtype`` and
        ``virtual_pipeline_model_parallel_size``.

    Raises:
        AssertionError: when an argument combination fails validation.
        ValueError: when a GLU activation is combined with bias-gelu fusion,
            or a ``skip_train_iteration_range`` entry is not ``start-end``.
        ModuleNotFoundError: when the bitsandbytes optimizer is requested
            but the package cannot be imported.
    """
    parser = argparse.ArgumentParser(description='Megatron-LM Arguments',
                                     allow_abbrev=False)

    # Standard arguments.
    parser = _add_network_size_args(parser)
    parser = _add_regularization_args(parser)
    parser = _add_training_args(parser)
    parser = _add_initialization_args(parser)
    parser = _add_learning_rate_args(parser)
    parser = _add_checkpointing_args(parser)
    parser = _add_mixed_precision_args(parser)
    parser = _add_distributed_args(parser)
    parser = _add_validation_args(parser)
    parser = _add_data_args(parser)
    parser = _add_autoresume_args(parser)
    parser = _add_biencoder_args(parser)
    parser = _add_vit_args(parser)
    parser = _add_logging_args(parser)
    parser = _add_zero_args(parser)
    parser = _add_memoryopt_args(parser)
    parser = _add_activation_checkpoint_args(parser)

    # Custom arguments.
    if extra_args_provider is not None:
        parser = extra_args_provider(parser)

    parser = deepspeed.add_config_arguments(parser)

    # Parse.
    if ignore_unknown_args:
        args, _ = parser.parse_known_args()
    else:
        args = parser.parse_args()

    # Distributed args. Rank and world size are taken from the environment
    # and replace whatever the parser produced for these attributes.
    args.rank = int(os.getenv('RANK', '0'))
    args.world_size = int(os.getenv("WORLD_SIZE", '1'))
    # Tensor model parallel size (capped at the world size).
    args.tensor_model_parallel_size = min(
        args.tensor_model_parallel_size, args.world_size)
    assert args.world_size % args.tensor_model_parallel_size == 0, (
        'world size ({}) is not divisible by tensor model parallel size '
        '({})'.format(args.world_size, args.tensor_model_parallel_size))
    # Pipeline model parallel size (capped at what is left after TP).
    args.pipeline_model_parallel_size = min(
        args.pipeline_model_parallel_size,
        (args.world_size // args.tensor_model_parallel_size))
    # Checks.
    model_parallel_size = args.pipeline_model_parallel_size * \
        args.tensor_model_parallel_size
    # One placeholder per formatted value, so each size is reported under
    # its own label.
    assert args.world_size % model_parallel_size == 0, (
        'world size ({}) is not divisible by tensor parallel size ({}) '
        'times pipeline parallel size ({})'.format(
            args.world_size, args.tensor_model_parallel_size,
            args.pipeline_model_parallel_size))
    args.data_parallel_size = args.world_size // model_parallel_size
    if args.rank == 0:
        print('using world size: {}, data-parallel-size: {}, '
              'tensor-model-parallel size: {}, '
              'pipeline-model-parallel size: {} '.format(
                  args.world_size, args.data_parallel_size,
                  args.tensor_model_parallel_size,
                  args.pipeline_model_parallel_size), flush=True)

    # --data-path and --train-weighted-splits-paths
    message = ("Data loading Mode 1: --data-path and --split "
               "and Mode 2: --(train|valid|test)-weighted-split-paths "
               "are mutually exclusive i.e. cannot be set together.")

    if args.data_path:
        assert args.train_weighted_split_paths is None, message
        setattr(args, "valid_weighted_split_names", None)
        setattr(args, "valid_weighted_split_weights", None)
        setattr(args, "valid_weighted_split_splits", None)

        setattr(args, "test_weighted_split_names", None)
        setattr(args, "test_weighted_split_weights", None)
        setattr(args, "test_weighted_split_splits", None)

        # args.split default value in the args is None it is set here in order
        # to check that it does not to overlap with the 2nd mode of data loading
        if args.split is None:
            args.split = "969, 30, 1"

    if args.train_weighted_split_paths or args.valid_weighted_split_paths or \
            args.test_weighted_split_paths:
        assert args.data_path is None and args.split is None, message

    # Deprecated arguments
    assert args.batch_size is None, '--batch-size argument is no longer ' \
        'valid, use --micro-batch-size instead'
    del args.batch_size
    assert args.warmup is None, '--warmup argument is no longer valid, use ' \
        '--lr-warmup-fraction instead'
    del args.warmup
    assert args.model_parallel_size is None, '--model-parallel-size is no ' \
        'longer valid, use --tensor-model-parallel-size instead'
    del args.model_parallel_size

    # Set input defaults.
    for key in defaults:
        # For default to be valid, it should not be provided in the
        # arguments that are passed to the program. We check this by
        # ensuring the arg is set to None.
        if getattr(args, key) is not None:
            if args.rank == 0:
                # Built from adjacent literals so that no source
                # indentation ends up inside the message.
                print('WARNING: overriding default arguments for {key}:{v} '
                      'with {key}:{v2}'.format(key=key, v=defaults[key],
                                               v2=getattr(args, key)),
                      flush=True)
        else:
            setattr(args, key, defaults[key])

    # Batch size.
    assert args.micro_batch_size is not None
    assert args.micro_batch_size > 0
    if args.global_batch_size is None:
        args.global_batch_size = args.micro_batch_size * args.data_parallel_size
        if args.rank == 0:
            print('setting global batch size to {}'.format(
                args.global_batch_size), flush=True)
    assert args.global_batch_size > 0
    if args.num_layers_per_virtual_pipeline_stage is not None:
        assert args.pipeline_model_parallel_size > 2, \
            'pipeline-model-parallel size should be greater than 2 with ' \
            'interleaved schedule'
        assert args.num_layers % args.num_layers_per_virtual_pipeline_stage == 0, \
            'number of layers is not divisible by number of layers per virtual ' \
            'pipeline stage'
        args.virtual_pipeline_model_parallel_size = \
            (args.num_layers // args.pipeline_model_parallel_size) // \
            args.num_layers_per_virtual_pipeline_stage
    else:
        args.virtual_pipeline_model_parallel_size = None

    # Parameters dtype.
    args.params_dtype = torch.float
    if args.fp16:
        assert not args.bf16
        args.params_dtype = torch.half
    if args.bf16:
        assert not args.fp16
        args.params_dtype = torch.bfloat16
        # bfloat16 requires gradient accumulation and all-reduce to
        # be done in fp32.
        if not args.accumulate_allreduce_grads_in_fp32:
            args.accumulate_allreduce_grads_in_fp32 = True
            if args.rank == 0:
                print('accumulate and all-reduce gradients in fp32 for '
                      'bfloat16 data type.', flush=True)

    if args.rank == 0:
        print('using {} for parameters ...'.format(args.params_dtype),
              flush=True)

    # If we do accumulation and all-reduces in fp32, we need to have
    # local DDP and we should set the use-contiguous-buffers-in-ddp.
    if args.accumulate_allreduce_grads_in_fp32:
        assert args.DDP_impl == 'local'
        args.use_contiguous_buffers_in_ddp = True

    if args.dataloader_type is None:
        args.dataloader_type = 'single'

    # Consumed tokens.
    args.consumed_train_samples = 0
    args.consumed_valid_samples = 0
    args.consumed_train_tokens = 0
    args.gigaflos_no_embeds = 0

    # Iteration-based training.
    if args.train_iters:
        # If we use iteration-based training, make sure the
        # sample-based options are off.
        assert args.train_samples is None, \
            'expected iteration-based training'
        assert args.lr_decay_samples is None, \
            'expected iteration-based learning rate decay'
        assert args.lr_warmup_samples == 0, \
            'expected iteration-based learning rate warmup'
        assert args.rampup_batch_size is None, \
            'expected no batch-size rampup for iteration-based training'
        if args.lr_warmup_fraction is not None:
            assert args.lr_warmup_iters == 0, \
                'can only specify one of lr-warmup-fraction and lr-warmup-iters'

    # Sample-based training.
    if args.train_samples:
        # If we use sample-based training, make sure the
        # iteration-based options are off.
        assert args.train_iters is None, \
            'expected sample-based training'
        assert args.lr_decay_iters is None, \
            'expected sample-based learning rate decay'
        assert args.lr_warmup_iters == 0, \
            'expected sample-based learnig rate warmup'
        if args.lr_warmup_fraction is not None:
            assert args.lr_warmup_samples == 0, \
                'can only specify one of lr-warmup-fraction ' \
                'and lr-warmup-samples'

    # Check required arguments.
    required_args = ['num_layers', 'hidden_size', 'num_attention_heads']
    for req_arg in required_args:
        _check_arg_is_not_none(args, req_arg)

    # Checks.
    if args.ffn_hidden_size is None:
        args.ffn_hidden_size = 4 * args.hidden_size

    if args.kv_channels is None:
        assert args.hidden_size % args.num_attention_heads == 0
        args.kv_channels = args.hidden_size // args.num_attention_heads

    # seq_length and encoder_seq_length mirror each other; exactly one of
    # them is expected on the command line.
    if args.seq_length is not None:
        assert args.encoder_seq_length is None
        args.encoder_seq_length = args.seq_length
    else:
        assert args.encoder_seq_length is not None
        args.seq_length = args.encoder_seq_length

    if args.position_embedding_type == PositionEmbeddingType.absolute or args.position_embedding_type == PositionEmbeddingType.alibi:
        assert args.max_position_embeddings is not None
        if args.seq_length is not None:
            assert args.max_position_embeddings >= args.seq_length
        if args.decoder_seq_length is not None:
            assert args.max_position_embeddings >= args.decoder_seq_length
    else:
        assert args.max_position_embeddings is None

    if args.lr is not None:
        assert args.min_lr <= args.lr
    if args.save is not None:
        assert args.save_interval is not None
    # Mixed precision checks.
    if args.fp16_lm_cross_entropy:
        assert args.fp16, 'lm cross entropy in fp16 only support in fp16 mode.'
    if args.fp32_residual_connection:
        assert args.fp16 or args.bf16, \
            'residual connection in fp32 only supported when using fp16 or bf16.'
    # Activation checkpointing.
    if args.distribute_checkpointed_activations:
        assert args.checkpoint_activations, \
            'for distribute-checkpointed-activations to work you '\
            'need to enable checkpoint-activations'

    args.curriculum_learning = False

    # Activation function
    if args.glu_activation is not None and args.bias_gelu_fusion:
        raise ValueError("if glu-activation is used, please set --no-bias-gelu-fusion")

    # Skip train iterations: parse "start-end" strings, sort them and merge
    # overlapping ranges into a deque of [start, end] pairs.
    if args.skip_train_iteration_range is not None:
        args.skip_train_iteration_range = [
            list(map(int, range_.split("-"))) for range_ in args.skip_train_iteration_range
        ]
        args.skip_train_iteration_range.sort()
        skip_train_iteration_range = collections.deque()
        for range_ in args.skip_train_iteration_range:
            if len(range_) == 2:
                start, end = range_
                assert end >= start, \
                    "end of skip range cannot be smaller than start of skip range"
                # merge overlapping intervals (e.g. 1-5 2-6 -> 1-6)
                if not skip_train_iteration_range:
                    skip_train_iteration_range.append([start, end])
                elif skip_train_iteration_range[-1][1] >= start:
                    skip_train_iteration_range[-1][1] = max(end, skip_train_iteration_range[-1][1])
                else:
                    skip_train_iteration_range.append([start, end])
            else:
                raise ValueError(
                    "skip train iterations should be specified as two numbers, i.e. start-end"
                )
        args.skip_train_iteration_range = skip_train_iteration_range

    if args.use_bnb_optimizer:
        # Import only to verify that the package is available.
        try:
            import bitsandbytes as bnb
        except ModuleNotFoundError:
            raise ModuleNotFoundError("Please install bitsandbytes from https://github.com/facebookresearch/bitsandbytes.")

    _print_args(args)
    return args
def _print_args(args):
"""Print arguments."""
if args.rank == 0:
print('------------------------ arguments ------------------------',
flush=True)
str_list = []
for arg in vars(args):
dots = '.' * (48 - len(arg))
str_list.append(' {} {} {}'.format(arg, dots, getattr(args, arg)))
if args.log_path is not None:
with open(os.path.join(args.log_path,f'args_{time.strftime("%Y-%m-%dT%H:%M:%S")}.txt'), 'w') as f:
for arg in sorted(str_list, key=lambda x: x.lower()):
f.write(arg+"\n")
print(arg, flush=True)
else:
for arg in sorted(str_list, key=lambda x: x.lower()):
print(arg, flush=True)
print('-------------------- end of arguments ---------------------',
flush=True)
def _check_arg_is_not_none(args, arg):
assert getattr(args, arg) is not None, '{} argument is None'.format(arg)
def _add_network_size_args(parser):
    """Register the 'network size' option group on ``parser``.

    Besides the model-shape options (layers, hidden size, attention heads,
    vocabulary padding, ...) the group also holds the position-embedding
    type, the GLU activation choice, the kill-switch path and the two
    log-level options.

    Args:
        parser: The ``argparse.ArgumentParser`` to extend.

    Returns:
        The same ``parser`` object.
    """

    def _str_to_bool(text):
        """Convert an explicit boolean word from the command line.

        ``type=bool`` must not be used with argparse: every non-empty string
        is truthy, so ``--onnx-safe False`` used to yield True.
        """
        lowered = text.strip().lower()
        if lowered in ('true', 't', 'yes', 'y', '1'):
            return True
        # The empty string stays False, exactly as ``bool('')`` evaluated.
        if lowered in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError(
            'expected a boolean value (true/false), got {!r}'.format(text))

    group = parser.add_argument_group(title='network size')

    group.add_argument('--num-layers', type=int, default=None,
                       help='Number of transformer layers.')
    group.add_argument('--hidden-size', type=int, default=None,
                       help='Transformer hidden size.')
    group.add_argument('--ffn-hidden-size', type=int, default=None,
                       help='Transformer Feed-Forward Network hidden size. '
                       'This is set to 4*hidden-size if not provided')
    group.add_argument('--num-attention-heads', type=int, default=None,
                       help='Number of transformer attention heads.')
    group.add_argument('--kv-channels', type=int, default=None,
                       help='Projection weights dimension in multi-head '
                       'attention. This is set to '
                       ' args.hidden_size // args.num_attention_heads '
                       'if not provided.')
    group.add_argument('--max-position-embeddings', type=int, default=None,
                       help='Maximum number of position embeddings to use. '
                       'This is the size of position embedding.')
    group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
                       help='Pad the vocab size to be divisible by this value. '
                       'This is added for computational efficiency reasons.')
    group.add_argument('--pad-vocab-size-to', type=int, default=None,
                       help='Pad the vocab size to this value. '
                       'This value must be greater than the initial size of the tokenizer'
                       ', needs to be divisible by TP size and `make-vocab-size-divisible-by`.')
    group.add_argument('--layernorm-epsilon', type=float, default=1e-5,
                       help='Layer norm epsilon.')
    group.add_argument('--sync-tp-duplicated-parameters', action='store_true',
                       help='Force syncing duplicated params across TP ranks in forward. '
                       'This is a workaround for an unresolved bug leading to TP ranks '
                       'getting out of sync with each other.')
    group.add_argument('--apply-residual-connection-post-layernorm',
                       action='store_true',
                       help='If set, use original BERT residual connection '
                       'ordering.')
    group.add_argument('--embed-layernorm', action='store_true',
                       help='use layernorm for embedding')
    group.add_argument('--openai-gelu', action='store_true',
                       help='Use OpenAIs GeLU implementation. This option '
                       'should not be used unless for backward compatibility '
                       'reasons.')
    # No default is given, so the attribute is None unless the flag is passed.
    group.add_argument('--onnx-safe', type=_str_to_bool, required=False,
                       help='Use workarounds for known problems with '
                       'Torch ONNX exporter')
    # store_false: passing the flag sets bert_binary_head to False.
    group.add_argument('--bert-no-binary-head', action='store_false',
                       help='Disable BERT binary head.',
                       dest='bert_binary_head')
    # The command-line word is looked up by name in PositionEmbeddingType,
    # which is defined outside this block.
    group.add_argument('--position-embedding-type', type=lambda x: PositionEmbeddingType[x],
                       choices=list(PositionEmbeddingType),
                       default=PositionEmbeddingType.absolute,
                       help='Define position embedding type ("absolute" | "rotary" | "alibi"). "absolute" by default.'
                       )
    group.add_argument('--glu-activation', type=str,
                       choices=megatron.model.glu_activations.GLU_ACTIVATIONS.keys(),
                       help='GLU activations to use.'
                       )
    group.add_argument('--kill-switch-path', type=str,
                       help='path to look for a kill switch, which if found will automatically exit the program'
                       )
    group.add_argument('--log-level', type=str, choices=list(log_levels.keys()),
                       help="Logger log level to use on the main process. Possible choices are the log levels as strings: 'debug', "
                       "'info', 'warning', 'error' and 'critical', plus a 'passive' level which doesn't set anything and lets the "
                       "application set the level."
                       )
    group.add_argument('--log-level-replica', type=str, choices=list(log_levels.keys()),
                       help="Logger log level to use on replicas. Same choices as ``log_level``"
                       )
    return parser
def _add_logging_args(parser):
group = parser.add_argument_group(title='logging')
group.add_argument('--log-params-norm', action='store_true',
help='If set, calculate and log parameters norm.')
group.add_argument('--log-num-zeros-in-grad', action='store_true',
help='If set, calculate and log the number of zeros in gradient.')
group.add_argument('--tensorboard-log-interval', type=int, default=1,
help='Report to tensorboard interval.')
group.add_argument('--tensorboard-queue-size', type=int, default=1000,
help='Size of the tensorboard queue for pending events '
'and summaries before one of the ‘add’ calls forces a '
'flush to disk.')
group.add_argument('--log-timers-to-tensorboard', action='store_true',
help='If set, write timers to tensorboard.')
group.add_argument('--log-batch-size-to-tensorboard', action='store_true',
help='If set, write batch-size to tensorboard.')
group.add_argument('--no-log-learnig-rate-to-tensorboard',
action='store_false',
help='Disable learning rate logging to tensorboard.',
dest='log_learning_rate_to_tensorboard')
group.add_argument('--no-log-loss-scale-to-tensorboard',
action='store_false',
help='Disable loss-scale logging to tensorboard.',
dest='log_loss_scale_to_tensorboard')
group.add_argument('--log-validation-ppl-to-tensorboard',
action='store_true',
help='If set, write validation perplexity to '
'tensorboard.')
return parser
def _add_regularization_args(parser):
group = parser.add_argument_group(title='regularization')
group.add_argument('--attention-dropout', type=float, default=0.1,
help='Post attention dropout probability.')
group.add_argument('--hidden-dropout', type=float, default=0.1,
help='Dropout probability for hidden state transformer.')
group.add_argument('--weight-decay', type=float, default=0.01,
help='Weight decay coefficient for L2 regularization.')
group.add_argument('--clip-grad', type=float, default=1.0,
help='Gradient clipping based on global L2 norm.')
group.add_argument('--adam-beta1', type=float, default=0.9,
help='First coefficient for computing running averages '
'of gradient and its square')
group.add_argument('--adam-beta2', type=float, default=0.999,
help='Second coefficient for computing running averages '
'of gradient and its square')
group.add_argument('--adam-eps', type=float, default=1e-08,
help='Term added to the denominator to improve'
'numerical stability')
group.add_argument('--sgd-momentum', type=float, default=0.9,
help='Momentum factor for sgd')
return parser
def _add_training_args(parser):
group = parser.add_argument_group(title='training')
group.add_argument('--micro-batch-size', type=int, default=None,
help='Batch size per model instance (local batch size). '
'Global batch size is local batch size times data '
'parallel size times number of micro batches.')
group.add_argument('--batch-size', type=int, default=None,
help='Old batch size parameter, do not use. '
'Use --micro-batch-size instead')
group.add_argument('--global-batch-size', type=int, default=None,
help='Training batch size. If set, it should be a '
'multiple of micro-batch-size times data-parallel-size. '
'If this value is None, then '
'use micro-batch-size * data-parallel-size as the '
'global batch size. This choice will result in 1 for '
'number of micro-batches.')
group.add_argument('--rampup-batch-size', nargs='*', default=None,
help='Batch size ramp up with the following values:'
' --rampup-batch-size <start batch size> '
' <batch size increment> '
' <ramp-up samples> '
'For example: '
' --rampup-batch-size 16 8 300000 '
' --global-batch-size 1024 '
'will start with global batch size 16 and over '
' (1024 - 16) / 8 = 126 intervals will increase '
'the batch size linearly to 1024. In each interval '
'we will use approximately 300000 / 126 = 2380 samples.')
group.add_argument('--checkpoint-activations', action='store_true',
help='Checkpoint activation to allow for training '
'with larger models, sequences, and batch sizes.')
group.add_argument('--distribute-checkpointed-activations',
action='store_true',
help='If set, distribute checkpointed activations '
'across model parallel group.')
group.add_argument('--checkpoint-num-layers', type=int, default=1,
help='chunk size (number of layers) for checkpointing.')
group.add_argument('--train-iters', type=int, default=None,
help='Total number of iterations to train over all '
'training runs. Note that either train-iters or '
'train-samples should be provided.')
group.add_argument('--train-samples', type=int, default=None,
help='Total number of samples to train over all '
'training runs. Note that either train-iters or '
'train-samples should be provided.')
group.add_argument('--train-tokens', type=int, default=None,
help='Total number of tokens to train over all '
'training runs.')
group.add_argument('--log-interval', type=int, default=100,
help='Report loss and timing interval.')
group.add_argument('--exit-interval', type=int, default=None,
help='Exit the program after the iteration is divisible '
'by this value.')
group.add_argument('--exit-duration-in-mins', type=int, default=None,
help='Exit the program after this many minutes.')
group.add_argument('--tensorboard-dir', type=str, default=None,
help='Write TensorBoard logs to this directory.')
group.add_argument('--no-masked-softmax-fusion',
action='store_false',
help='Disable fusion of query_key_value scaling, '
'masking, and softmax.',
dest='masked_softmax_fusion')
group.add_argument('--no-bias-gelu-fusion', action='store_false',
help='Disable bias and gelu fusion.',
dest='bias_gelu_fusion')
group.add_argument('--no-bias-dropout-fusion', action='store_false',
help='Disable bias and dropout fusion.',
dest='bias_dropout_fusion')
group.add_argument('--optimizer', type=str, default='adam',
choices=['adam', 'sgd'],
help='Optimizer function')
group.add_argument('--use-bnb-optimizer', action='store_true',
help='Use bitsandbytes optimizer for efficient training,'
'please refer https://github.com/facebookresearch/bitsandbytes.',
dest='use_bnb_optimizer')
group.add_argument('--dataloader-type', type=str, default=None,
choices=['single', 'cyclic'],
help='Single pass vs multiple pass data loader')
group.add_argument('--cpu-optimizer', action='store_true',
help='Run optimizer on CPU')
group.add_argument('--cpu_torch_adam', action='store_true',
help='Use Torch Adam as optimizer on CPU.')
group.add_argument('--codecarbon-dir', type=str, default=None,
help='Write CodeCarbon logs to this directory.')
group.add_argument('--eval-only', type=bool, required=False,
help='If set to True, no train step will be performed.'
'and only the evaluation on the `valid` and `test` sets '
'will be performed' )
group.add_argument('--skip-train-iteration-range', type=str, nargs='+', default=None,
help='Iteration ranges to skip. The values are one or more dash-separated ranges. e.g., 101-200 251-300.')
group.add_argument('--inference', action='store_true',
help='Very basic inference mode: not allocating optim/lr - requires ZERO_STAGE=0')
group.add_argument('--abort-on-unmet-fused-kernel-constraints', action='store_true',
help="If set to True, the program will abort if the constraints for loading a fused kernel aren't met")
group.add_argument('--pp-partition-method', type=str, default=None,
help="Use to override the pipeline stages partitioning method. e.g., 'type:transformer|embedding'")
return parser
def _add_initialization_args(parser):
group = parser.add_argument_group(title='initialization')
group.add_argument('--seed', type=int, default=1234,
help='Random seed used for python, numpy, '
'pytorch, and cuda.')
group.add_argument('--init-method-std', type=float, default=0.02,
help='Standard deviation of the zero mean normal '
'distribution used for weight initialization.')
group.add_argument('--init-method-xavier-uniform', action='store_true',
help='Enable Xavier uniform parameter initialization')
return parser
def _add_learning_rate_args(parser):
group = parser.add_argument_group(title='learning rate')
group.add_argument('--lr', type=float, default=None,
help='Initial learning rate. Depending on decay style '
'and initial warmup, the learing rate at each '
'iteration would be different.')
group.add_argument('--lr-decay-style', type=str, default='linear',
choices=['constant', 'linear', 'cosine'],
help='Learning rate decay function.')
group.add_argument('--lr-decay-iters', type=int, default=None,
help='number of iterations to decay learning rate over,'
' If None defaults to `--train-iters`')
group.add_argument('--lr-decay-samples', type=int, default=None,
help='number of samples to decay learning rate over,'
' If None defaults to `--train-samples`')
group.add_argument('--lr-decay-tokens', type=int, default=None,
help='number of tokens to decay learning rate over,'
' If not None will override iter/sample-based decay')
group.add_argument('--lr-warmup-fraction', type=float, default=None,
help='fraction of lr-warmup-(iters/samples) to use '
'for warmup (as a float)')
group.add_argument('--lr-warmup-iters', type=int, default=0,
help='number of iterations to linearly warmup '
'learning rate over.')
group.add_argument('--lr-warmup-samples', type=int, default=0,
help='number of samples to linearly warmup '
'learning rate over.')
group.add_argument('--warmup', type=int, default=None,
help='Old lr warmup argument, do not use. Use one of the'
'--lr-warmup-* arguments above')
group.add_argument('--min-lr', type=float, default=0.0,
help='Minumum value for learning rate. The scheduler'
'clip values below this threshold.')
group.add_argument('--override-lr-scheduler', action='store_true',
help='Reset the values of the scheduler (learning rate,'
'warmup iterations, minimum learning rate, maximum '
'number of iterations, and decay style from input '
'arguments and ignore values from checkpoints. Note'
'that all the above values will be reset.')
group.add_argument('--use-checkpoint-lr-scheduler', action='store_true',
help='Use checkpoint to set the values of the scheduler '
'(learning rate, warmup iterations, minimum learning '
'rate, maximum number of iterations, and decay style '
'from checkpoint and ignore input arguments.')
group.add_argument('--universal-checkpoint', action='store_true',
help='Loading a universal format checkpoint.')
return parser
def _add_checkpointing_args(parser):
group = parser.add_argument_group(title='checkpointing')
group.add_argument('--save', type=str, default=None,
help='Output directory to save checkpoints to.')
group.add_argument('--save-interval', type=int, default=None,
help='Number of iterations between checkpoint saves.')
group.add_argument('--no-save-optim', action='store_true', default=None,
help='Do not save current optimizer.')
group.add_argument('--no-save-rng', action='store_true', default=None,
help='Do not save current rng state.')
group.add_argument('--load', type=str, default=None,
help='Directory containing a model checkpoint.')
group.add_argument('--no-load-optim', action='store_true', default=None,
help='Do not load optimizer when loading checkpoint.')
group.add_argument('--no-load-rng', action='store_true', default=None,
help='Do not load rng state when loading checkpoint.')
group.add_argument('--finetune', action='store_true',
help='Load model for finetuning. Do not load optimizer '
'or rng state from checkpoint and set iteration to 0. '
'Assumed when loading a release checkpoint.')
return parser
def _add_mixed_precision_args(parser):
group = parser.add_argument_group(title='mixed precision')
group.add_argument('--fp16', action='store_true',
help='Run model in fp16 mode.')
group.add_argument('--bf16', action='store_true',
help='Run model in bfloat16 mode.')
group.add_argument('--loss-scale', type=float, default=None,
help='Static loss scaling, positive power of 2 '
'values can improve fp16 convergence. If None, dynamic'
'loss scaling is used.')
group.add_argument('--initial-loss-scale', type=float, default=2**32,
help='Initial loss-scale for dynamic loss scaling.')
group.add_argument('--min-loss-scale', type=float, default=1.0,
help='Minimum loss scale for dynamic loss scale.')
group.add_argument('--loss-scale-window', type=float, default=1000,
help='Window over which to raise/lower dynamic scale.')
group.add_argument('--hysteresis', type=int, default=2,
help='hysteresis for dynamic loss scaling')
group.add_argument('--fp32-residual-connection', action='store_true',
help='Move residual connections to fp32.')
group.add_argument('--no-query-key-layer-scaling', action='store_false',
help='Do not scale Q * K^T by 1 / layer-number.',
dest='apply_query_key_layer_scaling')
group.add_argument('--attention-softmax-in-fp32', action='store_true',
help='Run attention masking and softmax in fp32. '
'This flag is ignored unless '
'--no-query-key-layer-scaling is specified.')
group.add_argument('--accumulate-allreduce-grads-in-fp32',
action='store_true',
help='Gradient accumulation and all-reduce in fp32.')
group.add_argument('--fp16-lm-cross-entropy', action='store_true',
help='Move the cross entropy unreduced loss calculation'
'for lm head to fp16.')
return parser
def _add_distributed_args(parser):
group = parser.add_argument_group(title='distributed')
group.add_argument('--tensor-model-parallel-size', type=int, default=1,
help='Degree of tensor model parallelism.')
group.add_argument('--pipeline-model-parallel-size', type=int, default=1,
help='Degree of pipeline model parallelism.')
group.add_argument('--model-parallel-size', type=int, default=None,
help='Old model parallel argument, do not use. Use '
'--tensor-model-parallel-size instead.')
group.add_argument('--num-layers-per-virtual-pipeline-stage', type=int, default=None,
help='Number of layers per virtual pipeline stage')
group.add_argument('--distributed-backend', default='nccl',
choices=['nccl', 'gloo'],
help='Which backend to use for distributed training.')
group.add_argument('--DDP-impl', default='local',
choices=['local', 'torch'],
help='which DistributedDataParallel implementation '
'to use.')
group.add_argument('--use-contiguous-buffers-in-ddp', action='store_true',
help='If set, use contiguous buffer in DDP. Note that '
'this option only works woth local DDP.' )
group.add_argument('--no-scatter-gather-tensors-in-pipeline', action='store_false',
help='Use scatter/gather to optimize communication of tensors in pipeline',
dest='scatter_gather_tensors_in_pipeline')
group.add_argument('--local_rank', type=int, default=None,
help='local rank passed from distributed launcher.')
group.add_argument('--lazy-mpu-init', type=bool, required=False,
help='If set to True, initialize_megatron() '
'skips DDP initialization and returns function to '
'complete it instead.Also turns on '
'--use-cpu-initialization flag. This is for '
'external DDP manager.' )
group.add_argument('--use-cpu-initialization', action='store_true',
default=None, help='If set, affine parallel weights '
'initialization uses CPU' )
#group.add_argument('--rank', default=-1, type=int, help='node rank for distributed training')
#group.add_argument('--dist_url', type=str, default="env://127.0.0.1:23456")
#group.add_argument('--world_size', type=int, default=-1, help='number of nodes for distributed training')
#group.add_argument('--dist_backend', default='nccl', type=str, help='distributed backend')
return parser
def _add_validation_args(parser):
group = parser.add_argument_group(title='validation')
group.add_argument('--eval-iters', type=int, default=100,
help='Number of iterations to run for evaluation'
'validation/test for.')
group.add_argument('--eval-interval', type=int, default=1000,
help='Interval between running evaluation on '
'validation set.')
return parser
def _add_data_args(parser):
group = parser.add_argument_group(title='data and dataloader')
# option 1 for data loading (mutually exclusive with option2)
group.add_argument('--data-path', nargs='*', default=None,
help='Path to the training dataset. Accepted format:'
'1) a single data path, 2) multiple datasets in the'
'form: dataset1-weight dataset1-path dataset2-weight '
'dataset2-path ...')
group.add_argument('--split', type=str, default=None,
help='Comma-separated list of proportions for training,'
' validation, and test split. For example the split '
'`90,5,5` will use 90%% of data for training, 5%% for '
'validation and 5%% for test.')
# option 2 for data loading (mutually exclusive with option1)
# helper class to parse the --xxx-weighted-split-paths
# note here two args are set: extra valid dataset paths and names
class parse_data_paths(argparse.Action):
def __call__(self, parser, args, values, option_string=None):
if option_string == "--train-weighted-split-paths":
assert len(values) == 1, 'Only 1 dataset group is allowed to'
'be passed for the argument --train-weighted-split-paths'
# make sure string given in the correct format
err_message = 'Each data group should be input on the following format'
'"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2"'
'where START < END'
for v in values:
# each prefix consists several datasets separated by commas
prefix = ":".join(v.split(":")[1:]) # remove GIVEN_NAME
datasets = prefix.split(",")
# check if each dataset is formatted like `WEIGHT START:END PATH`
for d in datasets:
assert len(d.split()) == 3, err_message
start, end = d.split()[1].split(":")
assert float(start) < float(end), err_message
names = [v.split(":")[0] for v in values]
prefixes = [":".join(v.split(":")[1:]).strip() for v in values]
weights = [[d.split()[0] for d in p.split(",")] for p in prefixes]
splits = [[d.split()[1] for d in p.split(",")] for p in prefixes]
paths = [[d.split()[2] for d in p.split(",")] for p in prefixes]
# # to keep consistency with Option 1 of data loading (through --data-path)
# # paths will contain strings on the following form
# # "WEIGHTS1 PATH1 WEIGHTS2 PATH2 WEIGHTS3 PATH3" for each dataset group
# # while data will be parsed in additional arguments below
# paths_option1_style = []
# for p, w in zip(paths, weights):
# paths_option1_style.append(" ".join([f"{w_i} {p_i}" for p_i, w_i in zip(p,w)]))
# setattr(args, self.dest, paths_option1_style)
setattr(args, self.dest, paths)
setattr(args, self.dest.replace("paths", "weights"), weights)
setattr(args, self.dest.replace("paths", "splits"), splits)
setattr(args, self.dest.replace("paths","names"), names)
group.add_argument('--train-weighted-split-paths', nargs='*', default=None,
help='Weights, splits and paths to groups of datasets'
'Accepted format: ONE dataset groups could be'
'submitted in the following form between double quotes'
'"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2"'
'e.g.: "NAME_ABC: 0.6 0:0.6 A, 0.3 0:1 B, 0.1 0:1 C" '
'WEIGHT is used to up and down sample each dataset A,B,C in the group'
'START:END indicates the split portion of the dataset',
action=parse_data_paths)
group.add_argument('--valid-weighted-split-paths', nargs='*', default=None,
help='Weights, splits and paths to groups of datasets'
'Accepted format: one or many dataset groups could be'
'submitted in the following form each between double quotes'
'"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2"'
'e.g.: "NAME_ABC: 0.6 0.6:0.8 A, 0.3 0:1 B, 0.1 0:1 C" '
'"NAME_CDE: 0.6 0.6:0.8 C, 0.3 0:1 D, 0.1 0:1 E" '
'validation will be run on each of those groups independently',
action=parse_data_paths)
group.add_argument('--test-weighted-split-paths', nargs='*', default=None,
help='Weights, splits and paths to groups of datasets'
'Accepted format: one or many dataset groups could be'
'submitted in the following form each between double quotes'
'"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2"'
'e.g.: "NAME_ABC: 0.6 0.6:0.8 A, 0.3 0:1 B, 0.1 0:1 C" '
'"NAME_CDE: 0.6 0.6:0.8 C, 0.3 0:1 D, 0.1 0:1 E" '
'test will be run on each of those groups independently',
action=parse_data_paths)
class parse_data_paths_path(argparse.Action):
def __call__(self, parser, args, values, option_string=None):
expected_option_strings = ["--train-weighted-split-paths-path", "--valid-weighted-split-paths-path", "--test-weighted-split-paths-path"]
assert option_string in expected_option_strings, f"Expected {option_string} to be in {expected_option_strings}"
with open(values, "r") as fi:
lines = fi.readlines()
assert len(lines) == 1, f"Got multiple lines {len(lines)} instead of 1 expected"
assert lines[0][-2:] == "\"\n" and lines[0][0] == "\"", f"Invalid input format, got {lines}"
values = lines[0][1:-2].split("\" \"")
weighted_split_paths_dest = re.sub(r"_path$", "", self.dest)
weighted_split_paths_option = re.sub(r"-path$", "", self.option_strings[0])
setattr(args, weighted_split_paths_dest, values)
parse_data_paths(option_strings=[weighted_split_paths_option], dest=weighted_split_paths_dest)(parser, args, values, option_string=weighted_split_paths_option)
group.add_argument('--train-weighted-split-paths-path', type=str, action=parse_data_paths_path ,default=None)
group.add_argument('--valid-weighted-split-paths-path', type=str, action=parse_data_paths_path, default=None)
group.add_argument('--test-weighted-split-paths-path', type=str, action=parse_data_paths_path, default=None)
group.add_argument('--log-path', type=str, default=None,
help='Path to the save arguments file.')
group.add_argument('--vocab-file', type=str, default=None,
help='Path to the vocab file.')
group.add_argument('--merge-file', type=str, default=None,
help='Path to the BPE merge file.')
group.add_argument('--vocab-extra-ids', type=int, default=0,
help='Number of additional vocabulary tokens. '
'They are used for span masking in the T5 model')
group.add_argument('--seq-length', type=int, default=None,
help='Maximum sequence length to process.')
group.add_argument('--encoder-seq-length', type=int, default=None,
help='Maximum encoder sequence length to process.'
'This should be exclusive of --seq-length')
group.add_argument('--decoder-seq-length', type=int, default=None,
help="Maximum decoder sequence length to process.")
group.add_argument('--retriever-seq-length', type=int, default=256,
help='Maximum sequence length for the biencoder model '
' for retriever')
group.add_argument('--sample-rate', type=float, default=1.0,
help='sample rate for training data. Supposed to be 0 '
' < sample_rate < 1')
group.add_argument('--mask-prob', type=float, default=0.15,
help='Probability of replacing a token with mask.')
group.add_argument('--short-seq-prob', type=float, default=0.1,
help='Probability of producing a short sequence.')
group.add_argument('--mmap-warmup', action='store_true',
help='Warm up mmap files.')
group.add_argument('--num-workers', type=int, default=2,
help="Dataloader number of workers.")
group.add_argument('--valid-num-workers', type=int, default=2,
help="Dataloader number of workers for validation.")
group.add_argument('--tokenizer-type', type=str,
default=None,
choices=['BertWordPieceLowerCase',
'BertWordPieceCase',
'GPT2BPETokenizer',
'PretrainedFromHF'],
help='What type of tokenizer to use.')
group.add_argument("--tokenizer-name-or-path", type=str, default=None,
help="Name or path of the huggingface tokenizer.")
group.add_argument('--data-impl', type=str, default='infer',
choices=['lazy', 'cached', 'mmap', 'infer'],
help='Implementation of indexed datasets.')
group.add_argument('--reset-position-ids', action='store_true',
help='Reset posistion ids after end-of-document token.')
group.add_argument('--reset-attention-mask', action='store_true',
help='Reset self attention maske after '
'end-of-document token. Attention between tokens from different documents is null.')
group.add_argument('--eod-mask-loss', action='store_true',
help='Mask loss for the end of document tokens.')
group.add_argument('--loss-on-targets-only', action='store_true',
help='Mask loss on input sequence.')
group.add_argument('--reweight-loss-based-on-position-frequency', action="store_true",
help='Some objectives require us to sample loss_mask. This might introduce bias towards '
'specific positions. This option tries to un-bias the loss by reweighting loss on specific '
'positions based on how frequently we train on that position.'
'This is mostly used for prefix_lm training')
group.add_argument("--noise-density", type=float, default=None, help="Span corruption noise density")
group.add_argument("--mean-noise-span-length", type=int, default=None, help="Span corruption mean noise span length")
return parser
def _add_autoresume_args(parser):
group = parser.add_argument_group(title='autoresume')
group.add_argument('--adlr-autoresume', action='store_true',
help='Enable autoresume on adlr cluster.')
group.add_argument('--adlr-autoresume-interval', type=int, default=1000,
help='Intervals over which check for autoresume'
'termination signal')
return parser
def _add_biencoder_args(parser):
group = parser.add_argument_group(title='biencoder')
# network size
group.add_argument('--ict-head-size', type=int, default=None,
help='Size of block embeddings to be used in ICT and '
'REALM (paper default: 128)')
group.add_argument('--biencoder-projection-dim', type=int, default=0,
help='Size of projection head used in biencoder (paper'
' default: 128)')
group.add_argument('--biencoder-shared-query-context-model', action='store_true',
help='Whether to share the parameters of the query '
'and context models or not')
# checkpointing
group.add_argument('--ict-load', type=str, default=None,
help='Directory containing an ICTBertModel checkpoint')
group.add_argument('--bert-load', type=str, default=None,
help='Directory containing an BertModel checkpoint '
'(needed to start ICT and REALM)')
# data
group.add_argument('--titles-data-path', type=str, default=None,
help='Path to titles dataset used for ICT')
group.add_argument('--query-in-block-prob', type=float, default=0.1,
help='Probability of keeping query in block for '
'ICT dataset')
group.add_argument('--use-one-sent-docs', action='store_true',
help='Whether to use one sentence documents in ICT')
group.add_argument('--evidence-data-path', type=str, default=None,
help='Path to Wikipedia Evidence frm DPR paper')
# training
group.add_argument('--retriever-report-topk-accuracies', nargs='+', type=int,
default=[], help="Which top-k accuracies to report "
"(e.g. '1 5 20')")
group.add_argument('--retriever-score-scaling', action='store_true',
help='Whether to scale retriever scores by inverse '
'square root of hidden size')
# faiss index
group.add_argument('--block-data-path', type=str, default=None,
help='Where to save/load BlockData to/from')
group.add_argument('--embedding-path', type=str, default=None,
help='Where to save/load Open-Retrieval Embedding'
' data to/from')
# indexer
group.add_argument('--indexer-batch-size', type=int, default=128,
help='How large of batches to use when doing indexing '
'jobs')
group.add_argument('--indexer-log-interval', type=int, default=1000,
help='After how many batches should the indexer '
'report progress')
return parser
def _add_vit_args(parser):
group = parser.add_argument_group(title="vit")
group.add_argument('--num-classes', type=int, default=1000,
help='num of classes in vision classificaiton task')
group.add_argument('--img-dim', type=int, default=224,
help='Image size for vision classification task')
group.add_argument('--num-channels', type=int, default=3,
help='Number of channels in input image data')
group.add_argument('--patch-dim', type=int, default=16,
help='patch dimension used in vit')
return parser
def _add_zero_args(parser):
"""Text generate arguments."""
group = parser.add_argument_group('ZeRO configurations', 'configurations')
group.add_argument("--zero-stage", type=int, default=1.0)
group.add_argument('--zero-reduce-scatter', action='store_true',
help='Use reduce scatter if specified')
group.add_argument('--zero-contigious-gradients', action='store_true',
help='Use contigious memory optimizaiton if specified')
group.add_argument("--zero-reduce-bucket-size", type=int, default=0.0)
group.add_argument("--zero-allgather-bucket-size", type=int, default=0.0)
group.add_argument('--remote-device', type=str, default='none', choices=['none', 'cpu', 'nvme'],
help='Remote device for ZeRO-3 initialized parameters.')
group.add_argument('--use-pin-memory', action='store_true',
help='Use pinned CPU memory for ZeRO-3 initialized model parameters.')
return parser
def _add_memoryopt_args(parser):
    """Register the memory-optimization options on ``parser`` and return it.

    The group is titled 'Memory optimizations' and holds three boolean
    switches followed by the integer ``--tile-factor`` (default 1).
    """
    mem_group = parser.add_argument_group('Memory optimizations', 'configurations')

    # Boolean switches, in registration order: (flag, help text).
    switches = [
        ("--scattered-embeddings",
         'Save memory by scattering embedding activations. '
         'Introduces dropout differences across MP configurations.'),
        ("--split-transformers",
         'Save memory by splitting transformer layers into two parts, '
         'allowing for more frequent activation checkpoint savings.'),
        ("--memory-centric-tiled-linear",
         'Save memory by tiling with deepspeed.zero.TiledLinear.'),
    ]
    for flag, help_text in switches:
        mem_group.add_argument(flag, action='store_true', help=help_text)

    tile_factor_help = ''.join([
        'Make all linear layers the same size of [hidden/tile_factor, hidden/tile_factor]. ',
        'Must be enabled with --memory-centric-tiled-linear. ',
        'Example A: if tile_factor=1, the qkv layer [hidden, 3* hidden] would be converted into [1,3] tiles of size [hidden,hidden]. ',
        'Example B: if tile_factor=2, the intermediate layer [4*hidden, hidden] will be converted into [8, 2] tiles of size [hidden/2, hidden/2]. ',
        'Default is 1.',
    ])
    mem_group.add_argument("--tile-factor", type=int, default=1,
                           help=tile_factor_help)

    return parser
def _add_activation_checkpoint_args(parser):
    """Register the activation-checkpointing switches on ``parser``.

    Every option is a boolean ``store_true`` flag inside the
    'Activation Checkpointing' argument group. Returns ``parser``.
    """
    ckpt_group = parser.add_argument_group('Activation Checkpointing',
                                           'Checkpointing Configurations')

    # (flag, help text), in registration order.
    flag_table = (
        ('--deepspeed-activation-checkpointing',
         'uses activation checkpointing from deepspeed'),
        ('--partition-activations',
         'partition Activations across GPUs before checkpointing.'),
        ('--contigious-checkpointing',
         'Contigious memory checkpointing for activatoins.'),
        ('--checkpoint-in-cpu',
         'Move the activation checkpoints to CPU.'),
        ('--synchronize-each-layer',
         'does a synchronize at the beginning and end of each checkpointed layer.'),
        ('--profile-backward',
         'Enables backward pass profiling for checkpointed layers.'),
    )
    for flag, help_text in flag_table:
        ckpt_group.add_argument(flag, action='store_true', help=help_text)

    return parser
# Tokenize oscar-1GB.jsonl with the GPT-2 BPE vocabulary for training.
# Run from the repository root; the vocab/merge files and the input are
# expected in the current directory.
#
# The output prefix points into ./data, so that directory is created first:
# nothing else in this script creates it.
mkdir -p ./data || exit 1

python tools/preprocess_data.py \
  --input oscar-1GB.jsonl \
  --output-prefix ./data/my-gpt2 \
  --vocab gpt2-vocab.json \
  --dataset-impl mmap \
  --tokenizer-type GPT2BPETokenizer \
  --merge-file gpt2-merges.txt \
  --append-eod \
  --workers 8
\ No newline at end of file
#!/bin/bash
# Single-node GPT-2 pretraining launcher (Megatron + DeepSpeed).
#
# Meant to be started once per rank by Open MPI: it reads the
# OMPI_COMM_WORLD_* variables, writes the DeepSpeed JSON config into the
# current directory and then runs pretrain_gpt.py under numactl.
# Every path below is relative, so start it from the repository root.

export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3

# Rank layout as published by mpirun.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE

MODEL_NAME=gpt2-4tp

# DATA_OUTPUT_PATH, LOGS_PATH, CODECARBON_PATH and N_GPUS are not referenced
# anywhere further down in this script.
DATA_OUTPUT_PATH=./
LOGS_PATH=$DATA_OUTPUT_PATH/logs
CHECKPOINT_PATH=checkpoint/$MODEL_NAME
DATA_PATH=./data/my-gpt2_text_document
TENSORBOARD_PATH=output_dir/tensorboard/$MODEL_NAME
CODECARBON_PATH=output_dir/codecarbon/$MODEL_NAME

N_GPUS=4

TP_SIZE=4 # always fixed to the size of a single node
PP_SIZE=1 # NLAYERS must be a multiple of PP_SIZE here

MICRO_BATCH_SIZE=2
GLOBAL_BATCH_SIZE=32

NLAYERS=24
NHIDDEN=1024
NHEADS=16
SEQ_LEN=1024

SAVE_INTERVAL=1000

OPTIMIZER_ARGS=" \
    --optimizer adam \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --adam-eps 1e-8 \
    --lr 6.0e-5 \
    --min-lr 6.0e-6 \
    --lr-decay-style cosine \
    --clip-grad 1.0 \
    --weight-decay 1e-1 \
    "

# --clip-grad is supplied once, through OPTIMIZER_ARGS; the second identical
# copy that used to sit in this list was redundant.
GPT_ARGS=" \
    --num-layers $NLAYERS \
    --hidden-size $NHIDDEN \
    --num-attention-heads $NHEADS \
    --seq-length $SEQ_LEN \
    --max-position-embeddings $SEQ_LEN \
    --micro-batch-size $MICRO_BATCH_SIZE \
    --global-batch-size $GLOBAL_BATCH_SIZE \
    --train-iters 50 \
    --loss-scale 12 \
    --vocab-file gpt2-vocab.json \
    --merge-file gpt2-merges.txt \
    --fp16 \
    --checkpoint-activations \
    --seed 42 \
    $OPTIMIZER_ARGS \
    "

OUTPUT_ARGS=" \
    --log-interval 1 \
    --save-interval $SAVE_INTERVAL \
    --eval-interval 10 \
    --eval-iters 40 \
    --tensorboard-dir $TENSORBOARD_PATH \
    --tensorboard-queue-size 5 \
    --log-timers-to-tensorboard \
    --log-batch-size-to-tensorboard \
    --log-validation-ppl-to-tensorboard \
    "

DATA_ARGS=" \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH \
    --data-path $DATA_PATH \
    "

ZERO_STAGE=1

# DeepSpeed configuration. Every rank rewrites the same file with the same
# content before launching.
config_json="./${MODEL_NAME}_ds_config.json"

cat <<EOT > "${config_json}"
{
  "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
  "train_batch_size": $GLOBAL_BATCH_SIZE,
  "gradient_clipping": 1.0,
  "zero_optimization": {
    "stage": $ZERO_STAGE
  },
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 500,
    "hysteresis": 2,
    "min_loss_scale": 1,
    "initial_scale_power": 12
  },
  "steps_per_print": 2000,
  "wall_clock_breakdown": false
}
EOT

DEEPSPEED_ARGS=" \
    --deepspeed \
    --deepspeed_config ${config_json} \
    --zero-stage ${ZERO_STAGE} \
    --deepspeed-activation-checkpointing \
    "

# NOTE(review): the --dist_url value has a double colon before the port;
# confirm how pretrain_gpt.py parses it before changing it.
APP="python pretrain_gpt.py \
    --tensor-model-parallel-size $TP_SIZE \
    --pipeline-model-parallel-size $PP_SIZE \
    $GPT_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --data-impl mmap \
    --split 949,50,1 \
    --distributed-backend nccl \
    $DEEPSPEED_ARGS \
    --rank ${RANK} \
    --world_size ${WORLD_SIZE} \
    --dist_url env://127.0.0.1::34566 \
    "

# Local rank N is pinned to NUMA node N and uses mlx5_N. Any other value
# (including an unset rank when the script is started without mpirun) is
# rejected instead of silently doing nothing and exiting 0.
case "${lrank}" in
  [0-3])
    export HIP_VISIBLE_DEVICES=0,1,2,3
    export UCX_NET_DEVICES="mlx5_${lrank}:1"
    export UCX_IB_PCI_BW="mlx5_${lrank}:50Gbs"
    # APP is a flat argument string; word splitting is intended here.
    # shellcheck disable=SC2086
    NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind="${lrank}" --membind="${lrank}" ${APP}
    ;;
  *)
    echo "unsupported local rank '${lrank}': expected 0-3 (start this script through mpirun, at most 4 ranks per node)" >&2
    exit 1
    ;;
esac
#!/bin/bash
#SBATCH -p tydexclu01
#SBATCH -N 16
#SBATCH --cpus-per-task=1
#SBATCH --ntasks-per-node=32
#SBATCH --mem 0
#SBATCH --gres=dcu:4
#SBATCH -J gpt2
#SBATCH -o logs/gpt2-16B-%j.out
#SBATCH -e logs/gpt2-16B-%j.out

# Slurm batch script for the multi-node run: builds an Open MPI hostfile
# from the job's node list (4 slots per node) and starts single-16B.sh on
# every slot, passing the first node's hostname as its only argument.

ulimit -u 200000

export NCCL_IB_HCA=mlx5
export NCCL_SOCKET_IFNAME=ib0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1

echo "START TIME: $(date)"

# The hostfile names are derived from the job id; refuse to run without one.
: "${SLURM_JOB_ID:?SLURM_JOB_ID is not set - submit this script with sbatch}"

# Nothing else creates ./hostfile, so make sure it exists before writing.
mkdir -p ./hostfile || exit 1

# NOTE(review): this wipes every file in ./hostfile, including hostfiles
# written by other jobs started from the same directory - confirm that is
# acceptable.
rm -f ./hostfile/*

hostfile=./hostfile/${SLURM_JOB_ID}
mpi_hostfile=$(pwd)/hostfile/hostfile-dl-${SLURM_JOB_ID}

scontrol show hostnames "${SLURM_JOB_NODELIST}" > "${hostfile}"

# One "<host> slots=4" line per allocated node.
while IFS= read -r node; do
  [ -n "${node}" ] || continue
  echo "${node} slots=4" >> "${mpi_hostfile}"
done < "${hostfile}"

# Four ranks per distinct node.
np=$(sort -u "${hostfile}" | wc -l)
np=$((np * 4))

# The first node of the allocation is handed to the per-rank script.
dist_url=$(awk 'NR==1 {print $1}' "${hostfile}")

mpirun -np "${np}" --allow-run-as-root --hostfile "${mpi_hostfile}" --bind-to none "$(pwd)/single-16B.sh" "${dist_url}"
#!/bin/bash
# Sample-generation launcher for the 40-layer / hidden-size-5760 GPT-2
# checkpoint with tensor-model-parallel size 4.
#
# Meant to be started once per rank by Open MPI (it reads the
# OMPI_COMM_WORLD_* variables) and runs tools/generate_samples_gpt.py under
# numactl. All paths are relative, so start it from the repository root.

# Rank layout as published by mpirun.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE

CHECKPOINT_PATH=./conver-4tp-model
VOCAB_FILE=gpt2-vocab.json
MERGE_FILE=gpt2-merges.txt

APP="python tools/generate_samples_gpt.py \
    --tensor-model-parallel-size 4 \
    --num-layers 40 \
    --hidden-size 5760 \
    --load $CHECKPOINT_PATH \
    --num-attention-heads 24 \
    --max-position-embeddings 2048 \
    --tokenizer-type GPT2BPETokenizer \
    --fp16 \
    --micro-batch-size 2 \
    --seq-length 2048 \
    --out-seq-length 128 \
    --temperature 1.0 \
    --vocab-file $VOCAB_FILE \
    --merge-file $MERGE_FILE \
    --genfile gpt2-genfile.json \
    --num-samples 4 \
    --top_p 0.9 \
    --recompute \
    --rank ${RANK} \
    --world_size ${WORLD_SIZE}"

# Local rank N is pinned to NUMA node N and uses mlx5_N. Any other value
# (including an unset rank when the script is started without mpirun) is
# rejected instead of silently doing nothing and exiting 0.
case "${lrank}" in
  [0-3])
    export HIP_VISIBLE_DEVICES=0,1,2,3
    export UCX_NET_DEVICES="mlx5_${lrank}:1"
    export UCX_IB_PCI_BW="mlx5_${lrank}:50Gbs"
    # APP is a flat argument string; word splitting is intended here.
    # shellcheck disable=SC2086
    NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind="${lrank}" --membind="${lrank}" ${APP}
    ;;
  *)
    echo "unsupported local rank '${lrank}': expected 0-3 (start this script through mpirun, at most 4 ranks per node)" >&2
    exit 1
    ;;
esac
#!/bin/bash
# Sample-generation launcher for the 24-layer / hidden-size-1024 GPT-2
# checkpoint with tensor-model-parallel size 1.
#
# Meant to be started once per rank by Open MPI (it reads the
# OMPI_COMM_WORLD_* variables) and runs tools/generate_samples_gpt.py under
# numactl. All paths are relative, so start it from the repository root.

# Rank layout as published by mpirun.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE

# NOTE(review): "checkopints" looks like a misspelling of "checkpoints"; it
# is left untouched because it has to match the directory on disk - confirm.
CHECKPOINT_PATH=./checkopints/megatron-1tp
VOCAB_FILE=gpt2-vocab.json
MERGE_FILE=gpt2-merges.txt

APP="python tools/generate_samples_gpt.py \
    --tensor-model-parallel-size 1 \
    --num-layers 24 \
    --hidden-size 1024 \
    --load $CHECKPOINT_PATH \
    --num-attention-heads 16 \
    --max-position-embeddings 1024 \
    --tokenizer-type GPT2BPETokenizer \
    --fp16 \
    --micro-batch-size 2 \
    --seq-length 1024 \
    --out-seq-length 128 \
    --temperature 1.0 \
    --vocab-file $VOCAB_FILE \
    --merge-file $MERGE_FILE \
    --genfile gpt2-genfile.json \
    --num-samples 4 \
    --top_p 0.9 \
    --recompute \
    --rank ${RANK} \
    --world_size ${WORLD_SIZE}"

# Local rank N is pinned to NUMA node N and uses mlx5_N. Any other value
# (including an unset rank when the script is started without mpirun) is
# rejected instead of silently doing nothing and exiting 0.
case "${lrank}" in
  [0-3])
    export HIP_VISIBLE_DEVICES=0,1,2,3
    export UCX_NET_DEVICES="mlx5_${lrank}:1"
    export UCX_IB_PCI_BW="mlx5_${lrank}:50Gbs"
    # APP is a flat argument string; word splitting is intended here.
    # shellcheck disable=SC2086
    NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind="${lrank}" --membind="${lrank}" ${APP}
    ;;
  *)
    echo "unsupported local rank '${lrank}': expected 0-3 (start this script through mpirun, at most 4 ranks per node)" >&2
    exit 1
    ;;
esac
#!/bin/bash
# GPT-2 pretraining launcher (Megatron + DeepSpeed), tensor parallel size 4.
#
# Meant to be started once per rank by Open MPI: it reads the
# OMPI_COMM_WORLD_* variables, writes the DeepSpeed JSON config into the
# current directory and then runs pretrain_gpt.py under numactl.
# Every path below is relative, so start it from the repository root.

export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3

# Rank layout as published by mpirun.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE

MODEL_NAME=gpt2-4tp

# DATA_OUTPUT_PATH, LOGS_PATH and CODECARBON_PATH are not referenced anywhere
# further down in this script.
DATA_OUTPUT_PATH=./
LOGS_PATH=$DATA_OUTPUT_PATH/logs
CHECKPOINT_PATH=checkpoint/$MODEL_NAME
# NOTE(review): the preprocessing command in this repository uses the output
# prefix ./data/my-gpt2; confirm this path matches where the dataset actually
# lives.
DATA_PATH=my-gpt2_text_document
TENSORBOARD_PATH=output_dir/tensorboard/$MODEL_NAME
CODECARBON_PATH=output_dir/codecarbon/$MODEL_NAME

TP_SIZE=4 # always fixed to the size of a single node
PP_SIZE=1 # NLAYERS must be a multiple of PP_SIZE here

MICRO_BATCH_SIZE=2
GLOBAL_BATCH_SIZE=32

NLAYERS=24
NHIDDEN=1024
NHEADS=16
SEQ_LEN=1024

SAVE_INTERVAL=1000

OPTIMIZER_ARGS=" \
    --optimizer adam \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --adam-eps 1e-8 \
    --lr 6.0e-5 \
    --min-lr 6.0e-6 \
    --lr-decay-style cosine \
    --clip-grad 1.0 \
    --weight-decay 1e-1 \
    "

# --clip-grad is supplied once, through OPTIMIZER_ARGS; the second identical
# copy that used to sit in this list was redundant.
GPT_ARGS=" \
    --num-layers $NLAYERS \
    --hidden-size $NHIDDEN \
    --num-attention-heads $NHEADS \
    --seq-length $SEQ_LEN \
    --max-position-embeddings $SEQ_LEN \
    --micro-batch-size $MICRO_BATCH_SIZE \
    --global-batch-size $GLOBAL_BATCH_SIZE \
    --train-iters 1000 \
    --loss-scale 12 \
    --vocab-file gpt2-vocab.json \
    --merge-file gpt2-merges.txt \
    --fp16 \
    --checkpoint-activations \
    --seed 42 \
    $OPTIMIZER_ARGS \
    "

OUTPUT_ARGS=" \
    --log-interval 1 \
    --save-interval $SAVE_INTERVAL \
    --eval-interval 10 \
    --eval-iters 40 \
    --tensorboard-dir $TENSORBOARD_PATH \
    --tensorboard-queue-size 5 \
    --log-timers-to-tensorboard \
    --log-batch-size-to-tensorboard \
    --log-validation-ppl-to-tensorboard \
    "

DATA_ARGS=" \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH \
    --data-path $DATA_PATH \
    "

ZERO_STAGE=1

# DeepSpeed configuration. Every rank rewrites the same file with the same
# content before launching.
config_json="./${MODEL_NAME}_ds_config.json"

cat <<EOT > "${config_json}"
{
  "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
  "train_batch_size": $GLOBAL_BATCH_SIZE,
  "gradient_clipping": 1.0,
  "zero_optimization": {
    "stage": $ZERO_STAGE
  },
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 500,
    "hysteresis": 2,
    "min_loss_scale": 1,
    "initial_scale_power": 12
  },
  "steps_per_print": 2000,
  "wall_clock_breakdown": false
}
EOT

DEEPSPEED_ARGS=" \
    --deepspeed \
    --deepspeed_config ${config_json} \
    --zero-stage ${ZERO_STAGE} \
    --deepspeed-activation-checkpointing \
    "

# NOTE(review): the --dist_url value has a double colon before the port;
# confirm how pretrain_gpt.py parses it before changing it.
APP="python pretrain_gpt.py \
    --tensor-model-parallel-size $TP_SIZE \
    --pipeline-model-parallel-size $PP_SIZE \
    $GPT_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --data-impl mmap \
    --split 949,50,1 \
    --distributed-backend nccl \
    $DEEPSPEED_ARGS \
    --rank ${RANK} \
    --world_size ${WORLD_SIZE} \
    --dist_url env://127.0.0.1::34566 \
    "

# Local rank N is pinned to NUMA node N and uses mlx5_N. Any other value
# (including an unset rank when the script is started without mpirun) is
# rejected instead of silently doing nothing and exiting 0.
case "${lrank}" in
  [0-3])
    export HIP_VISIBLE_DEVICES=0,1,2,3
    export UCX_NET_DEVICES="mlx5_${lrank}:1"
    export UCX_IB_PCI_BW="mlx5_${lrank}:50Gbs"
    # APP is a flat argument string; word splitting is intended here.
    # shellcheck disable=SC2086
    NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind="${lrank}" --membind="${lrank}" ${APP}
    ;;
  *)
    echo "unsupported local rank '${lrank}': expected 0-3 (start this script through mpirun, at most 4 ranks per node)" >&2
    exit 1
    ;;
esac
#!/bin/bash
# Per-rank launcher for the multi-node 16B GPT-2 pretraining run
# (Megatron + DeepSpeed, TP=4 / PP=8).
#
# Usage (once per rank, through mpirun):  <this script> <master-host>
#   $1  host name placed into the tcp:// rendezvous URL (port 34566)
#
# It reads the OMPI_COMM_WORLD_* variables, writes the DeepSpeed JSON config
# into the current directory and then runs pretrain_gpt.py under numactl.

export NCCL_SOCKET_IFNAME=ib0
export NCCL_IB_HCA=mlx5
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3

# Without a master host the rendezvous URL would degrade to "tcp://:34566".
master_host=${1:?usage: $0 <master-host>}

# Rank layout as published by mpirun.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE

MODEL_NAME=gpt2-oscar_16B-4tp

# DATA_OUTPUT_PATH, LOGS_PATH and CODECARBON_PATH are not referenced anywhere
# further down in this script.
DATA_OUTPUT_PATH=./
LOGS_PATH=$DATA_OUTPUT_PATH/logs
# NOTE(review): "checkopints" looks like a misspelling of "checkpoints"; it
# is left untouched because it has to match the directory on disk - confirm.
CHECKPOINT_PATH=checkopints/$MODEL_NAME
# NOTE(review): the preprocessing command in this repository uses the output
# prefix ./data/my-gpt2; confirm this path matches where the dataset actually
# lives.
DATA_PATH=my-gpt2_text_document
TENSORBOARD_PATH=output_dir/tensorboard/$MODEL_NAME
CODECARBON_PATH=output_dir/codecarbon/$MODEL_NAME

TP_SIZE=4 # always fixed to the size of a single node
PP_SIZE=8 # NLAYERS must be a multiple of PP_SIZE here

MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=128

NLAYERS=40
NHIDDEN=5760
NHEADS=24
SEQ_LEN=2048

SAVE_INTERVAL=1000

OPTIMIZER_ARGS=" \
    --optimizer adam \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --adam-eps 1e-8 \
    --lr 6.0e-5 \
    --min-lr 6.0e-6 \
    --lr-decay-style cosine \
    --clip-grad 1.0 \
    --weight-decay 1e-1 \
    "

# --train-iters is spelled with a hyphen, matching the other training
# launchers in this repository (this script previously passed --train_iters).
# --clip-grad is supplied once, through OPTIMIZER_ARGS.
# NOTE(review): --fp16 is not passed and the DeepSpeed fp16 section below is
# disabled, yet --loss-scale 12 is still supplied; confirm that running
# without fp16 is intended.
GPT_ARGS=" \
    --num-layers $NLAYERS \
    --hidden-size $NHIDDEN \
    --num-attention-heads $NHEADS \
    --seq-length $SEQ_LEN \
    --max-position-embeddings $SEQ_LEN \
    --micro-batch-size $MICRO_BATCH_SIZE \
    --global-batch-size $GLOBAL_BATCH_SIZE \
    --train-iters 7000 \
    --loss-scale 12 \
    --vocab-file gpt2-vocab.json \
    --merge-file gpt2-merges.txt \
    --checkpoint-activations \
    --seed 42 \
    $OPTIMIZER_ARGS \
    "

OUTPUT_ARGS=" \
    --log-interval 1 \
    --save-interval $SAVE_INTERVAL \
    --eval-interval 1000 \
    --eval-iters 40 \
    --tensorboard-dir $TENSORBOARD_PATH \
    --tensorboard-queue-size 5 \
    --log-timers-to-tensorboard \
    --log-batch-size-to-tensorboard \
    --log-validation-ppl-to-tensorboard \
    "

DATA_ARGS=" \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH \
    --data-path $DATA_PATH \
    "

ZERO_STAGE=1

# DeepSpeed configuration. Every rank rewrites the same file with the same
# content before launching.
config_json="./${MODEL_NAME}_ds_config.json"

cat <<EOT > "${config_json}"
{
  "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
  "train_batch_size": $GLOBAL_BATCH_SIZE,
  "gradient_clipping": 1.0,
  "zero_optimization": {
    "stage": $ZERO_STAGE
  },
  "fp16": {
    "enabled": false,
    "loss_scale": 0,
    "loss_scale_window": 500,
    "hysteresis": 2,
    "min_loss_scale": 1,
    "initial_scale_power": 12
  },
  "steps_per_print": 2000,
  "wall_clock_breakdown": false
}
EOT

DEEPSPEED_ARGS=" \
    --deepspeed \
    --deepspeed_config ${config_json} \
    --zero-stage ${ZERO_STAGE} \
    --deepspeed-activation-checkpointing \
    "

# CMD stays exported, as before, so child processes can still see it.
export CMD=" \
    --tensor-model-parallel-size $TP_SIZE \
    --pipeline-model-parallel-size $PP_SIZE \
    $GPT_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --data-impl mmap \
    --split 949,50,1 \
    --distributed-backend nccl \
    $DEEPSPEED_ARGS \
    "

APP="python3 -u $(pwd)/pretrain_gpt.py \
    --rank ${RANK} \
    --world_size ${WORLD_SIZE} \
    --dist_url tcp://${master_host}:34566 \
    --num-workers 2 \
    ${CMD} \
    "

# Local rank N is pinned to NUMA node N and uses mlx5_N. Any other value
# (including an unset rank when the script is started without mpirun) is
# rejected instead of silently doing nothing and exiting 0.
case "${lrank}" in
  [0-3])
    export HIP_VISIBLE_DEVICES=0,1,2,3
    export UCX_NET_DEVICES="mlx5_${lrank}:1"
    export UCX_IB_PCI_BW="mlx5_${lrank}:50Gbs"
    # APP is a flat argument string; word splitting is intended here.
    # shellcheck disable=SC2086
    NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind="${lrank}" --membind="${lrank}" ${APP}
    ;;
  *)
    echo "unsupported local rank '${lrank}': expected 0-3 (start this script through mpirun, at most 4 ranks per node)" >&2
    exit 1
    ;;
esac
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment