更新运行方式

c338d32c · hepj987 · bf95e032 · c338d32c · c338d32c · c338d32c
Commit c338d32c authored Jul 14, 2023 by hepj987
13 changed files
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ GPT2模型：第二代生成式预训练模型（Generative Pre-Training2）。
 ### 模型结构

 ```
-GPT2使用 Transformer 的 Decoder 结构，并对 Transformer Decoder 进行了一些改动，并通过Megatron和deepspeed进行分布式运行
+GPT2使用 Transformer 的 Decoder 结构，并对 Transformer Decoder 进行了一些改动；借助Megatron和deepspeed，可以使用DP、TP、PP的3D并行方式进行分布式训练
 ```

 ### 数据集
@@ -21,16 +21,19 @@ wget https://huggingface.co/bigscience/misc-test-data/resolve/main/stas/oscar-1G
 wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
 wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
 xz -d oscar-1GB.jsonl.xz
+
+#处理数据集参数
+--input				输入数据集路径，即oscar-1GB.jsonl.xz解压后的文件路径
+--output-prefix		输出数据路径，处理后会自动加上_text_document后缀
+--vocab				下载的gpt2-vocab.json词表文件路径
+--dataset-impl		dataset类型
+--tokenizer-type 	tokenizer类型
+--merge-file		下载的gpt2-merges.txt文件路径		
+--append-eod		添加结束标志符		
+--workers			进程数
+
 #处理数据集
-python tools/preprocess_data.py \
-    --input oscar-1GB.jsonl \
-    --output-prefix my-gpt2 \
-    --vocab gpt2-vocab.json \
-    --dataset-impl mmap \
-    --tokenizer-type GPT2BPETokenizer \
-    --merge-file gpt2-merges.txt \
-    --append-eod \
-    --workers 8
+sh creat-data.sh
 ```

 ## GPT2预训练
@@ -46,15 +49,15 @@ docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.10.0-centos7.6-dtk
 进入docker

 ```
-pip install -r requirements.txt  -i http://pypi.tuna.tsinghua.edu.cn/simple  --trusted-host pypi.tuna.tsinghua.edu.cn
+pip install -r requirements.txt  -i https://mirrors.aliyun.com/pypi/simple/  --trusted-host mirrors.aliyun.com
 ```

-### 训练（单卡测试样例）
+### GPT2单节点训练

 ```
-rm megatron/arguments.py
-cp megatron/arguments.py-one_node megatron/arguments.py
-sh run-train.sh（基于单节点四卡）
+#np为启动的进程数，和使用的GPU数量一致，并且np需要能被TP*PP整除（即TP*PP <= np），4卡的话可以设置2tp 2pp，或者1tp 4pp，4tp 1pp，节点内使用TP性能更好
+
+mpirun -np 4 run-one-node.sh（基于单节点四卡）
 ```

 ```
@@ -81,23 +84,43 @@ SAVE_INTERVAL				保存频率
 --eval-iters				验证iter
 ```

-### GPT2模型16B训练(多节点)
+### GPT2模型16B多节点训练

-要求DCU集群Slurm环境正常。
+要求DCU集群配置好相应的虚拟环境，已安装python依赖项。

-推荐用户使用预编译好的python3.7包来快速建立python3虚拟环境，pytorch、apex、torchaudio、colossalai、faiss、mmcv-full 、torchvision、tensorflow需要在[光合开发者社区](https://cancon.hpccube.com:65024/4/main/)下载所需DCU版本安装包
+安装以下依赖时需要使用基于DTK编译的版本，下载地址见[光合开发者社区](https://cancon.hpccube.com:65024/4/main/)

 ```
+pytorch
+deepspeed
+apex
+torchaudio
+colossalai
+faiss
+mmcv-full
+torchvision
+tensorflow
+```
+
+这里以DTK23.04、python3.7、torch1.10为例：在[光合开发者社区](https://cancon.hpccube.com:65024/4/main/)中依次进入pytorch->dtk23.04，下载 torch-1.10.0+gite378c3c.abi0.dtk2304-cp37-cp37m-manylinux2014_x86_64.whl，然后可以仿照下面的步骤配置环境：
+
+```
+#创建虚拟环境
 export PYTHON3_LIB_PATH=/python_lib_path
 virtualenv -p /python_bin_path/python3 --system-site-packages venv_gpt2
-source env.sh	#进入venv_gpt2虚拟环境
-
+#进入venv_gpt2虚拟环境
+source venv_gpt2/bin/activate
+#加载DTK以及其他环境设置
+source env.sh		
+#安装DTK版本依赖
+pip install torch-1.10.0+gite378c3c.abi0.dtk2304-cp37-cp37m-manylinux2014_x86_64.whl
+pip install deepspeed-0.9.2+git25d5540.abi0.dtk2304.torch1.10.0-cp37-cp37m-manylinux2014_x86_64.whl
+#安装其他依赖
 pip install -r requirements.txt  -i http://pypi.tuna.tsinghua.edu.cn/simple  --trusted-host pypi.tuna.tsinghua.edu.cn
 ```

 ```
-rm megatron/arguments.py
-cp megatron/arguments.py-nodes megatron/arguments.py
+#多节点运行
 sbatch  run-16B.sh(主要参数在single-16B.sh)
 ```

@@ -137,61 +160,45 @@ SAVE_INTERVAL				保存频率
 | :-------: | :-----------: | :----------: |
 | 32 x 4DCU | 4.299443E+00  | 7.365877E+01 |

-
-
 ## GPT2文本生成

-使用GPT做文本生成时需要对训练好的模型进行转换，转换需要安装0.7.3版本 deepspeed（此工程已包含）
+### 转换成多卡推理

 ```
-pip install deepspeed-0.7.3+unknown-cp37-cp37m-linux_x86_64.whl -i http://pypi.tuna.tsinghua.edu.cn/simple  --trusted-host pypi.tuna.tsinghua.edu.cn
+#训练后的模型以deepspeed格式保存，如果用于推理，需要先将其转换成megatron格式；deepspeed->megatron转换前后TP数需要保持相同
+#转换脚本
+sh conver-model_to_megatron.sh
 ```

-对deepspeed进行一些修改
-
 ```
-修改/usr/local/lib/python3.7/site-packages/deepspeed/checkpoint/constants.py
-第34行
-	ZERO_FILE_PREFIX = 'bf16_' + 'zero_pp_rank_'
-改为：
-	ZERO_FILE_PREFIX =  'zero_pp_rank_'
-
-修改/usr/local/lib/python3.7/site-packages/deepspeed/ops/op_builder/builder.py
-第133行 def assert_torch_info(torch_info):函数
-删除下边的版本判断
-	install_torch_version = torch_info['version']
-	install_cuda_version = torch_info['cuda_version']
-	install_hip_version = torch_info['hip_version']
-
-修改/usr/local/lib/python3.7/site-packages/deepspeed/runtime/state_dict_factory.py文件
-第177行def check_ckpt_list(self):函数
-删除mp_world_size判断
-	if 'mp_world_size' in sd.keys():
-            assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}"
+#重要参数
+需要将工程路径加入PYTHONPATH
+例如：export PYTHONPATH=/home/megatron-deepspeed_dtk23.04:$PYTHONPATH

+CHECKPOINT_PATH  需要转换的模型路径(具体到保存的global_step)
+output_folder	 转换后的模型路径
+target_tp		 转换后的TP数，与训练保持一致或设置为1
+target_pp		 转换后的PP数，与训练保持一致或设置为1
 ```

-### 转换脚本
+### 转换成单卡推理

 ```
-sh conver.sh
-```
+#原始模型保存的是deepspeed格式，deepspeed-> megatron格式时转换前后TP数需要保持相同，因此需要先deepspeed->deepspeed（改变TP成1），然后再由deepspeed-> megatron转换成可推理的格式

+#转换脚本
+sh conver-model-1tp.sh
 ```
-#重要参数
-需要将工程路径加入PYTHONPATH
-例如：export PYTHONPATH=/home/megatron-deepspeed_dtk22.10:$PYTHONPATH

-CHECKPOINT_PATH  需要转换的模型路径(具体到保存的global_step)
-output_folder	 转换后的模型路径
-target_tp		 转换后的TP数（需要与训练时保持一致） 	 
-target_pp		 转换后的PP数 （设置为1）
-```
+

 ### 无条件文本生成

 ```
-sh run-inf.sh（这里以单节点小模型为例）
+#多卡推理
+mpirun -np 4 run-inf-gpus.sh
+#单卡推理
+mpirun -np 1 run-inf.sh
 ```

 ```

--- a/conver-model-1tp.sh
+++ b/conver-model-1tp.sh
+# Convert a DeepSpeed training checkpoint into a TP=1 PP=1 Megatron checkpoint.
+# Step 1 reshapes the DeepSpeed checkpoint to TP=1 and PP=1.
+# Step 2 converts the reshaped checkpoint to Megatron format.
+# The tool paths are relative, so run this from the project root.
+
+# Stop at the first failure so step 2 never runs on a missing or stale step 1 output.
+set -e
+
+# Put the project on the Python module search path. The :+ expansion adds the
+# separator only when PYTHONPATH is already set, so no empty entry is created.
+# NOTE review: this path is spelled megatron-deepspeed_dtk23.04 while
+# CHECKPOINT_PATH below is spelled megatron-deepspeed-dtk23.04 - confirm which is intended.
+export PYTHONPATH="/home/megatron-deepspeed_dtk23.04${PYTHONPATH:+:${PYTHONPATH}}"
+# DeepSpeed checkpoint to convert, down to the saved global_step folder.
+CHECKPOINT_PATH=/home/megatron-deepspeed-dtk23.04/checkopints/gpt2-4tp/global_step1000
+# Destination of the final Megatron checkpoint.
+OUTPUT_PATH=./checkopints/megatron-1tp
+# Intermediate folder that receives the reshaped DeepSpeed checkpoint.
+RESHAPED_PATH=./conver-model-deepspeed-1tp
+
+python tools/convert_checkpoint/deepspeed_to_deepspeed.py \
+  --input_folder "$CHECKPOINT_PATH" \
+  --output_folder "$RESHAPED_PATH" \
+  --target_tp 1 \
+  --target_pp 1
+
+# The global_step1000 suffix must match the step folder of CHECKPOINT_PATH.
+python tools/convert_checkpoint/deepspeed_to_megatron.py \
+  --input_folder "$RESHAPED_PATH/global_step1000" \
+  --output_folder "$OUTPUT_PATH" \
+  --target_tp 1 \
+  --target_pp 1
--- a/conver-model_to_megatron.sh
+++ b/conver-model_to_megatron.sh
+# Convert a DeepSpeed training checkpoint to Megatron format with TP=4 and PP=1.
+# The tool path is relative, so run this from the project root.
+
+# Put the project on the Python module search path. The :+ expansion adds the
+# separator only when PYTHONPATH is already set, so no empty entry is created.
+# NOTE review: this path is spelled megatron-deepspeed_dtk23.04 while
+# CHECKPOINT_PATH below is spelled megatron-deepspeed-dtk23.04 - confirm which is intended.
+export PYTHONPATH="/home/megatron-deepspeed_dtk23.04${PYTHONPATH:+:${PYTHONPATH}}"
+# DeepSpeed checkpoint to convert, down to the saved global_step folder.
+CHECKPOINT_PATH=/home/megatron-deepspeed-dtk23.04/checkopints/gpt2-oscar_16B-4tp/global_step1000
+# Destination of the converted Megatron checkpoint.
+OUTPUT_PATH=./conver-4tp-model
+python tools/convert_checkpoint/deepspeed_to_megatron.py \
+  --input_folder "$CHECKPOINT_PATH" \
+  --output_folder "$OUTPUT_PATH" \
+  --target_tp 4 \
+  --target_pp 1
+
--- a/creat-data.sh
+++ b/creat-data.sh
+# Preprocess the OSCAR jsonl file with the GPT2 BPE tokenizer into an mmap dataset.
+# All paths are relative, so run this from the directory that holds the
+# downloaded data files and the tools folder.
+input_jsonl=oscar-1GB.jsonl
+output_prefix=my-gpt2
+vocab_json=gpt2-vocab.json
+merges_txt=gpt2-merges.txt
+worker_count=8
+
+python tools/preprocess_data.py \
+    --input "$input_jsonl" \
+    --output-prefix "$output_prefix" \
+    --vocab "$vocab_json" \
+    --dataset-impl mmap \
+    --tokenizer-type GPT2BPETokenizer \
+    --merge-file "$merges_txt" \
+    --append-eod \
+    --workers "$worker_count"
--- a/megatron/arguments.py-nodes
+++ b/megatron/arguments.py-nodes
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Megatron arguments."""
-
-import argparse
-import collections
-import os
-import re
-import time
-
-import torch
-import deepspeed
-
-from megatron.enums import PositionEmbeddingType
-import megatron
-from megatron.logging import log_levels
-
-
-def parse_args(extra_args_provider=None, defaults={},
-               ignore_unknown_args=False):
-    """Parse all arguments."""
-    parser = argparse.ArgumentParser(description='Megatron-LM Arguments',
-                                     allow_abbrev=False)
-
-    # Standard arguments.
-    parser = _add_network_size_args(parser)
-    parser = _add_regularization_args(parser)
-    parser = _add_training_args(parser)
-    parser = _add_initialization_args(parser)
-    parser = _add_learning_rate_args(parser)
-    parser = _add_checkpointing_args(parser)
-    parser = _add_mixed_precision_args(parser)
-    parser = _add_distributed_args(parser)
-    parser = _add_validation_args(parser)
-    parser = _add_data_args(parser)
-    parser = _add_autoresume_args(parser)
-    parser = _add_biencoder_args(parser)
-    parser = _add_vit_args(parser)
-    parser = _add_logging_args(parser)
-    parser = _add_zero_args(parser)
-    parser = _add_memoryopt_args(parser)
-    parser = _add_activation_checkpoint_args(parser)
-
-    # Custom arguments.
-    if extra_args_provider is not None:
-        parser = extra_args_provider(parser)
-
-    parser = deepspeed.add_config_arguments(parser)
-
-    # Parse.
-    if ignore_unknown_args:
-        args, _ = parser.parse_known_args()
-    else:
-        args = parser.parse_args()
-
-    # Distributed args.
-    #args.rank = int(os.getenv('RANK', '0'))
-    #args.world_size = int(os.getenv("WORLD_SIZE", '1'))
-    # Tensor model parallel size.
-    args.tensor_model_parallel_size = min(
-        args.tensor_model_parallel_size, args.world_size)
-    assert args.world_size % args.tensor_model_parallel_size == 0, 'world size'\
-        ' ({}) is not divisible by tensor model parallel size ({})'.format(
-            args.world_size, args.tensor_model_parallel_size)
-    # Pipeline model parallel size.
-    args.pipeline_model_parallel_size = min(
-        args.pipeline_model_parallel_size,
-        (args.world_size // args.tensor_model_parallel_size))
-    # Checks.
-    model_parallel_size = args.pipeline_model_parallel_size * \
-                          args.tensor_model_parallel_size
-    assert args.world_size % model_parallel_size == 0, 'world size is not'\
-        ' divisible by tensor parallel size ({}) times pipeline parallel ' \
-        'size ({})'.format(args.world_size, args.tensor_model_parallel_size,
-                           args.pipeline_model_parallel_size)
-    args.data_parallel_size = args.world_size // model_parallel_size
-    if args.rank == 0:
-        print('using world size: {}, data-parallel-size: {}, '
-              'tensor-model-parallel size: {}, '
-              'pipeline-model-parallel size: {} '.format(
-                  args.world_size, args.data_parallel_size,
-                  args.tensor_model_parallel_size,
-                  args.pipeline_model_parallel_size), flush=True)
-
-    # --data-path and --train-weighted-splits-paths
-    message = "Data loading Mode 1: --data-path and --split "\
-            "and Mode 2: --(train|valid|test)-weighted-split-paths"\
-            "are mutually exclusive i.e. cannot be set together."
-
-    if args.data_path:
-        assert args.train_weighted_split_paths is None, message
-        setattr(args, "valid_weighted_split_names", None)
-        setattr(args, "valid_weighted_split_weights", None)
-        setattr(args, "valid_weighted_split_splits", None)
-
-        setattr(args, "test_weighted_split_names", None)
-        setattr(args, "test_weighted_split_weights", None)
-        setattr(args, "test_weighted_split_splits", None)
-
-        # args.split default value in the args is None it is set here in order
-        # to check that it does not to overlap with the 2nd mode of data loading
-        if args.split is None:
-            args.split = "969, 30, 1"
-
-    if args.train_weighted_split_paths or args.valid_weighted_split_paths or \
-                args.test_weighted_split_paths:
-        assert args.data_path is None and args.split is None, message
-
-
-
-    # Deprecated arguments
-    assert args.batch_size is None, '--batch-size argument is no longer ' \
-        'valid, use --micro-batch-size instead'
-    del args.batch_size
-    assert args.warmup is None, '--warmup argument is no longer valid, use ' \
-        '--lr-warmup-fraction instead'
-    del args.warmup
-    assert args.model_parallel_size is None, '--model-parallel-size is no ' \
-        'longer valid, use --tensor-model-parallel-size instead'
-    del args.model_parallel_size
-
-    # Set input defaults.
-    for key in defaults:
-        # For default to be valid, it should not be provided in the
-        # arguments that are passed to the program. We check this by
-        # ensuring the arg is set to None.
-        if getattr(args, key) is not None:
-            if args.rank == 0:
-                print('WARNING: overriding default arguments for {key}:{v} \
-                       with {key}:{v2}'.format(key=key, v=defaults[key],
-                                               v2=getattr(args, key)),
-                                               flush=True)
-        else:
-            setattr(args, key, defaults[key])
-
-    # Batch size.
-    assert args.micro_batch_size is not None
-    assert args.micro_batch_size > 0
-    if args.global_batch_size is None:
-        args.global_batch_size = args.micro_batch_size * args.data_parallel_size
-        if args.rank == 0:
-            print('setting global batch size to {}'.format(
-                args.global_batch_size), flush=True)
-    assert args.global_batch_size > 0
-    if args.num_layers_per_virtual_pipeline_stage is not None:
-        assert args.pipeline_model_parallel_size > 2, \
-            'pipeline-model-parallel size should be greater than 2 with ' \
-            'interleaved schedule'
-        assert args.num_layers % args.num_layers_per_virtual_pipeline_stage == 0, \
-            'number of layers is not divisible by number of layers per virtual ' \
-            'pipeline stage'
-        args.virtual_pipeline_model_parallel_size = \
-            (args.num_layers // args.pipeline_model_parallel_size) // \
-            args.num_layers_per_virtual_pipeline_stage
-    else:
-        args.virtual_pipeline_model_parallel_size = None
-
-    # Parameters dtype.
-    args.params_dtype = torch.float
-    if args.fp16:
-        assert not args.bf16
-        args.params_dtype = torch.half
-    if args.bf16:
-        assert not args.fp16
-        args.params_dtype = torch.bfloat16
-        # bfloat16 requires gradient accumulation and all-reduce to
-        # be done in fp32.
-        if not args.accumulate_allreduce_grads_in_fp32:
-            args.accumulate_allreduce_grads_in_fp32 = True
-            if args.rank == 0:
-                print('accumulate and all-reduce gradients in fp32 for '
-                      'bfloat16 data type.', flush=True)
-
-    if args.rank == 0:
-        print('using {} for parameters ...'.format(args.params_dtype),
-              flush=True)
-
-    # If we do accumulation and all-reduces in fp32, we need to have
-    # local DDP and we should set the use-contiguous-buffers-in-ddp.
-    if args.accumulate_allreduce_grads_in_fp32:
-        assert args.DDP_impl == 'local'
-        args.use_contiguous_buffers_in_ddp = True
-
-    if args.dataloader_type is None:
-        args.dataloader_type = 'single'
-
-    # Consumed tokens.
-    args.consumed_train_samples = 0
-    args.consumed_valid_samples = 0
-    args.consumed_train_tokens = 0
-    args.gigaflos_no_embeds = 0
-
-    # Iteration-based training.
-    if args.train_iters:
-        # If we use iteration-based training, make sure the
-        # sample-based options are off.
-        assert args.train_samples is None, \
-            'expected iteration-based training'
-        assert args.lr_decay_samples is None, \
-            'expected iteration-based learning rate decay'
-        assert args.lr_warmup_samples == 0, \
-            'expected iteration-based learning rate warmup'
-        assert args.rampup_batch_size is None, \
-            'expected no batch-size rampup for iteration-based training'
-        if args.lr_warmup_fraction is not None:
-            assert args.lr_warmup_iters == 0, \
-                'can only specify one of lr-warmup-fraction and lr-warmup-iters'
-
-    # Sample-based training.
-    if args.train_samples:
-        # If we use sample-based training, make sure the
-        # iteration-based options are off.
-        assert args.train_iters is None, \
-            'expected sample-based training'
-        assert args.lr_decay_iters is None, \
-            'expected sample-based learning rate decay'
-        assert args.lr_warmup_iters == 0, \
-            'expected sample-based learnig rate warmup'
-        if args.lr_warmup_fraction is not None:
-            assert args.lr_warmup_samples == 0, \
-                'can only specify one of lr-warmup-fraction ' \
-                'and lr-warmup-samples'
-
-    # Check required arguments.
-    required_args = ['num_layers', 'hidden_size', 'num_attention_heads']
-    for req_arg in required_args:
-        _check_arg_is_not_none(args, req_arg)
-
-    # Checks.
-    if args.ffn_hidden_size is None:
-        args.ffn_hidden_size = 4 * args.hidden_size
-
-    if args.kv_channels is None:
-        assert args.hidden_size % args.num_attention_heads == 0
-        args.kv_channels = args.hidden_size // args.num_attention_heads
-
-    if args.seq_length is not None:
-        assert args.encoder_seq_length is None
-        args.encoder_seq_length = args.seq_length
-    else:
-        assert args.encoder_seq_length is not None
-        args.seq_length = args.encoder_seq_length
-
-    if args.position_embedding_type == PositionEmbeddingType.absolute or args.position_embedding_type == PositionEmbeddingType.alibi:
-        assert args.max_position_embeddings is not None
-        if args.seq_length is not None:
-            assert args.max_position_embeddings >= args.seq_length
-        if args.decoder_seq_length is not None:
-            assert args.max_position_embeddings >= args.decoder_seq_length
-    else:
-        assert args.max_position_embeddings is None
-
-    if args.lr is not None:
-        assert args.min_lr <= args.lr
-    if args.save is not None:
-        assert args.save_interval is not None
-    # Mixed precision checks.
-    if args.fp16_lm_cross_entropy:
-        assert args.fp16, 'lm cross entropy in fp16 only support in fp16 mode.'
-    if args.fp32_residual_connection:
-        assert args.fp16 or args.bf16, \
-            'residual connection in fp32 only supported when using fp16 or bf16.'
-    # Activation checkpointing.
-    if args.distribute_checkpointed_activations:
-        assert args.checkpoint_activations, \
-            'for distribute-checkpointed-activations to work you '\
-            'need to enable checkpoint-activations'
-
-    args.curriculum_learning = False
-
-    # Activation function
-    if args.glu_activation is not None and args.bias_gelu_fusion:
-        raise ValueError("if glu-activation is used, please set --no-bias-gelu-fusion")
-
-    # Skip train iterations
-    if args.skip_train_iteration_range is not None:
-        args.skip_train_iteration_range = [
-            list(map(int, range_.split("-"))) for range_ in args.skip_train_iteration_range
-        ]
-        args.skip_train_iteration_range.sort()
-        skip_train_iteration_range = collections.deque()
-        for range_ in args.skip_train_iteration_range:
-            if len(range_) == 2:
-                start, end = range_
-                assert end >= start, \
-                "end of skip range cannot be smaller than start of skip range"
-                # merge overlapping intervals (e.g. 1-5 2-6 -> 1-6)
-                if not skip_train_iteration_range:
-                    skip_train_iteration_range.append([start, end])
-                elif skip_train_iteration_range[-1][1] >= start:
-                    skip_train_iteration_range[-1][1] = max(end, skip_train_iteration_range[-1][1])
-                else:
-                    skip_train_iteration_range.append([start, end])
-            else:
-                raise ValueError(
-                    "skip train iterations should be specified as two numbers, i.e. start-end"
-                )
-        args.skip_train_iteration_range = skip_train_iteration_range
-
-    if args.use_bnb_optimizer:
-        try:
-            import bitsandbytes as bnb
-        except ModuleNotFoundError:
-            raise ModuleNotFoundError("Please install bitsandbytes from https://github.com/facebookresearch/bitsandbytes.")
-
-    _print_args(args)
-    return args
-
-
-def _print_args(args):
-    """Print arguments."""
-    if args.rank == 0:
-        print('------------------------ arguments ------------------------',
-              flush=True)
-        str_list = []
-        for arg in vars(args):
-            dots = '.' * (48 - len(arg))
-            str_list.append('  {} {} {}'.format(arg, dots, getattr(args, arg)))
-
-        if args.log_path is not None:
-            with open(os.path.join(args.log_path,f'args_{time.strftime("%Y-%m-%dT%H:%M:%S")}.txt'), 'w') as f:
-                for arg in sorted(str_list, key=lambda x: x.lower()):
-                    f.write(arg+"\n")
-                    print(arg, flush=True)
-        else:
-            for arg in sorted(str_list, key=lambda x: x.lower()):
-                print(arg, flush=True)
-        print('-------------------- end of arguments ---------------------',
-              flush=True)
-
-
-def _check_arg_is_not_none(args, arg):
-    assert getattr(args, arg) is not None, '{} argument is None'.format(arg)
-
-
-def _add_network_size_args(parser):
-    group = parser.add_argument_group(title='network size')
-
-    group.add_argument('--num-layers', type=int, default=None,
-                       help='Number of transformer layers.')
-    group.add_argument('--hidden-size', type=int, default=None,
-                       help='Tansformer hidden size.')
-    group.add_argument('--ffn-hidden-size', type=int, default=None,
-                       help='Transformer Feed-Forward Network hidden size. '
-                       'This is set to 4*hidden-size if not provided')
-    group.add_argument('--num-attention-heads', type=int, default=None,
-                       help='Number of transformer attention heads.')
-    group.add_argument('--kv-channels', type=int, default=None,
-                       help='Projection weights dimension in multi-head '
-                       'attention. This is set to '
-                       '   args.hidden_size // args.num_attention_heads '
-                       'if not provided.')
-    group.add_argument('--max-position-embeddings', type=int, default=None,
-                       help='Maximum number of position embeddings to use. '
-                       'This is the size of position embedding.')
-    group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
-                       help='Pad the vocab size to be divisible by this value.'
-                       'This is added for computational efficieny reasons.')
-    group.add_argument('--pad-vocab-size-to', type=int, default=None,
-                       help='Pad the vocab size to this value.'
-                       'This value must be greater than the initial size of the tokenizer'
-                       ', needs to be divisible by TP size and `make-vocab-size-divisible-by`.')
-    group.add_argument('--layernorm-epsilon', type=float, default=1e-5,
-                       help='Layer norm epsilon.')
-    group.add_argument('--sync-tp-duplicated-parameters', action='store_true',
-                       help='Force syncing duplicated params across TP ranks in forward. '
-                       'This is a workaround for an unresolved bug leading to TP ranks '
-                       'getting out of sync with each other.')
-    group.add_argument('--apply-residual-connection-post-layernorm',
-                       action='store_true',
-                       help='If set, use original BERT residula connection '
-                       'ordering.')
-    group.add_argument('--embed-layernorm', action='store_true',
-                       help='use layernorm for embedding')
-    group.add_argument('--openai-gelu', action='store_true',
-                       help='Use OpenAIs GeLU implementation. This option'
-                       'should not be used unless for backward compatibility'
-                       'reasons.')
-    group.add_argument('--onnx-safe', type=bool, required=False,
-                       help='Use workarounds for known problems with '
-                       'Torch ONNX exporter')
-    group.add_argument('--bert-no-binary-head', action='store_false',
-                       help='Disable BERT binary head.',
-                       dest='bert_binary_head')
-    group.add_argument('--position-embedding-type', type=lambda x: PositionEmbeddingType[x],
-                       choices=list(PositionEmbeddingType),
-                       default=PositionEmbeddingType.absolute,
-                       help='Define position embedding type ("absolute" | "rotary" | "alibi"). "absolute" by default.'
-                       )
-    group.add_argument('--glu-activation', type=str,
-                       choices=megatron.model.glu_activations.GLU_ACTIVATIONS.keys(),
-                       help='GLU activations to use.'
-                       )
-
-    group.add_argument('--kill-switch-path', type=str,
-                       help='path to look for a kill switch, which if found will automatically exit the program'
-                       )
-
-
-    group.add_argument('--log-level', type=str, choices=list(log_levels.keys()),
-                       help="Logger log level to use on the main process. Possible choices are the log levels as strings: 'debug', "
-                       "'info', 'warning', 'error' and 'critical', plus a 'passive' level which doesn't set anything and lets the "
-                       "application set the level."
-                       )
-    group.add_argument('--log-level-replica', type=str, choices=list(log_levels.keys()),
-                       help="Logger log level to use on replicas. Same choices as ``log_level``"
-                       )
-    return parser
-
-
-def _add_logging_args(parser):
-    group = parser.add_argument_group(title='logging')
-
-    group.add_argument('--log-params-norm', action='store_true',
-                       help='If set, calculate and log parameters norm.')
-    group.add_argument('--log-num-zeros-in-grad', action='store_true',
-                       help='If set, calculate and log the number of zeros in gradient.')
-    group.add_argument('--tensorboard-log-interval', type=int, default=1,
-                       help='Report to tensorboard interval.')
-    group.add_argument('--tensorboard-queue-size', type=int, default=1000,
-                       help='Size of the tensorboard queue for pending events '
-                       'and summaries before one of the ‘add’ calls forces a '
-                       'flush to disk.')
-    group.add_argument('--log-timers-to-tensorboard', action='store_true',
-                       help='If set, write timers to tensorboard.')
-    group.add_argument('--log-batch-size-to-tensorboard', action='store_true',
-                       help='If set, write batch-size to tensorboard.')
-    group.add_argument('--no-log-learnig-rate-to-tensorboard',
-                       action='store_false',
-                       help='Disable learning rate logging to tensorboard.',
-                       dest='log_learning_rate_to_tensorboard')
-    group.add_argument('--no-log-loss-scale-to-tensorboard',
-                       action='store_false',
-                       help='Disable loss-scale logging to tensorboard.',
-                       dest='log_loss_scale_to_tensorboard')
-    group.add_argument('--log-validation-ppl-to-tensorboard',
-                       action='store_true',
-                       help='If set, write validation perplexity to '
-                       'tensorboard.')
-
-    return parser
-
-
-def _add_regularization_args(parser):
-    group = parser.add_argument_group(title='regularization')
-
-    group.add_argument('--attention-dropout', type=float, default=0.1,
-                       help='Post attention dropout probability.')
-    group.add_argument('--hidden-dropout', type=float, default=0.1,
-                       help='Dropout probability for hidden state transformer.')
-    group.add_argument('--weight-decay', type=float, default=0.01,
-                       help='Weight decay coefficient for L2 regularization.')
-    group.add_argument('--clip-grad', type=float, default=1.0,
-                       help='Gradient clipping based on global L2 norm.')
-    group.add_argument('--adam-beta1', type=float, default=0.9,
-                       help='First coefficient for computing running averages '
-                       'of gradient and its square')
-    group.add_argument('--adam-beta2', type=float, default=0.999,
-                       help='Second coefficient for computing running averages '
-                       'of gradient and its square')
-    group.add_argument('--adam-eps', type=float, default=1e-08,
-                       help='Term added to the denominator to improve'
-                       'numerical stability')
-    group.add_argument('--sgd-momentum', type=float, default=0.9,
-                       help='Momentum factor for sgd')
-
-    return parser
-
-
-def _add_training_args(parser):
-    group = parser.add_argument_group(title='training')
-
-    group.add_argument('--micro-batch-size', type=int, default=None,
-                       help='Batch size per model instance (local batch size). '
-                       'Global batch size is local batch size times data '
-                       'parallel size times number of micro batches.')
-    group.add_argument('--batch-size', type=int, default=None,
-                       help='Old batch size parameter, do not use. '
-                       'Use --micro-batch-size instead')
-    group.add_argument('--global-batch-size', type=int, default=None,
-                       help='Training batch size. If set, it should be a '
-                       'multiple of micro-batch-size times data-parallel-size. '
-                       'If this value is None, then '
-                       'use micro-batch-size * data-parallel-size as the '
-                       'global batch size. This choice will result in 1 for '
-                       'number of micro-batches.')
-    group.add_argument('--rampup-batch-size', nargs='*', default=None,
-                       help='Batch size ramp up with the following values:'
-                       '  --rampup-batch-size <start batch size> '
-                       '                      <batch size increment> '
-                       '                      <ramp-up samples> '
-                       'For example: '
-                       '   --rampup-batch-size 16 8 300000 '
-                       '   --global-batch-size 1024 '
-                       'will start with global batch size 16 and over '
-                       ' (1024 - 16) / 8 = 126 intervals will increase '
-                       'the batch size linearly to 1024. In each interval '
-                       'we will use approximately 300000 / 126 = 2380 samples.')
-    group.add_argument('--checkpoint-activations', action='store_true',
-                       help='Checkpoint activation to allow for training '
-                       'with larger models, sequences, and batch sizes.')
-    group.add_argument('--distribute-checkpointed-activations',
-                       action='store_true',
-                       help='If set, distribute checkpointed activations '
-                       'across model parallel group.')
-    group.add_argument('--checkpoint-num-layers', type=int, default=1,
-                       help='chunk size (number of layers) for checkpointing.')
-    group.add_argument('--train-iters', type=int, default=None,
-                       help='Total number of iterations to train over all '
-                       'training runs. Note that either train-iters or '
-                       'train-samples should be provided.')
-    group.add_argument('--train-samples', type=int, default=None,
-                       help='Total number of samples to train over all '
-                       'training runs. Note that either train-iters or '
-                       'train-samples should be provided.')
-    group.add_argument('--train-tokens', type=int, default=None,
-                       help='Total number of tokens to train over all '
-                       'training runs.')
-    group.add_argument('--log-interval', type=int, default=100,
-                       help='Report loss and timing interval.')
-    group.add_argument('--exit-interval', type=int, default=None,
-                       help='Exit the program after the iteration is divisible '
-                       'by this value.')
-    group.add_argument('--exit-duration-in-mins', type=int, default=None,
-                       help='Exit the program after this many minutes.')
-    group.add_argument('--tensorboard-dir', type=str, default=None,
-                       help='Write TensorBoard logs to this directory.')
-    group.add_argument('--no-masked-softmax-fusion',
-                       action='store_false',
-                       help='Disable fusion of query_key_value scaling, '
-                       'masking, and softmax.',
-                       dest='masked_softmax_fusion')
-    group.add_argument('--no-bias-gelu-fusion', action='store_false',
-                       help='Disable bias and gelu fusion.',
-                       dest='bias_gelu_fusion')
-    group.add_argument('--no-bias-dropout-fusion', action='store_false',
-                       help='Disable bias and dropout fusion.',
-                       dest='bias_dropout_fusion')
-    group.add_argument('--optimizer', type=str, default='adam',
-                       choices=['adam', 'sgd'],
-                       help='Optimizer function')
-    group.add_argument('--use-bnb-optimizer', action='store_true',
-                       help='Use bitsandbytes optimizer for efficient training,'
-                       'please refer https://github.com/facebookresearch/bitsandbytes.',
-                       dest='use_bnb_optimizer')
-    group.add_argument('--dataloader-type', type=str, default=None,
-                       choices=['single', 'cyclic'],
-                       help='Single pass vs multiple pass data loader')
-    group.add_argument('--cpu-optimizer', action='store_true',
-                       help='Run optimizer on CPU')
-    group.add_argument('--cpu_torch_adam', action='store_true',
-                       help='Use Torch Adam as optimizer on CPU.')
-    group.add_argument('--codecarbon-dir', type=str, default=None,
-                       help='Write CodeCarbon logs to this directory.')
-    group.add_argument('--eval-only', type=bool, required=False,
-                       help='If set to True, no train step will be performed.'
-                       'and only the evaluation on the `valid` and `test` sets '
-                       'will be performed' )
-    group.add_argument('--skip-train-iteration-range', type=str, nargs='+', default=None,
-                       help='Iteration ranges to skip. The values are one or more dash-separated ranges. e.g., 101-200 251-300.')
-    group.add_argument('--inference', action='store_true',
-                       help='Very basic inference mode: not allocating optim/lr - requires ZERO_STAGE=0')
-    group.add_argument('--abort-on-unmet-fused-kernel-constraints', action='store_true',
-                       help="If set to True, the program will abort if the constraints for loading a fused kernel aren't met")
-    group.add_argument('--pp-partition-method', type=str, default=None,
-                       help="Use to override the pipeline stages partitioning method. e.g., 'type:transformer|embedding'")
-
-    return parser
-
-
-def _add_initialization_args(parser):
-    group = parser.add_argument_group(title='initialization')
-
-    group.add_argument('--seed', type=int, default=1234,
-                       help='Random seed used for python, numpy, '
-                       'pytorch, and cuda.')
-    group.add_argument('--init-method-std', type=float, default=0.02,
-                       help='Standard deviation of the zero mean normal '
-                       'distribution used for weight initialization.')
-    group.add_argument('--init-method-xavier-uniform', action='store_true',
-                       help='Enable Xavier uniform parameter initialization')
-
-    return parser
-
-
-def _add_learning_rate_args(parser):
-    group = parser.add_argument_group(title='learning rate')
-
-    group.add_argument('--lr', type=float, default=None,
-                       help='Initial learning rate. Depending on decay style '
-                       'and initial warmup, the learing rate at each '
-                       'iteration would be different.')
-    group.add_argument('--lr-decay-style', type=str, default='linear',
-                       choices=['constant', 'linear', 'cosine'],
-                       help='Learning rate decay function.')
-    group.add_argument('--lr-decay-iters', type=int, default=None,
-                       help='number of iterations to decay learning rate over,'
-                       ' If None defaults to `--train-iters`')
-    group.add_argument('--lr-decay-samples', type=int, default=None,
-                       help='number of samples to decay learning rate over,'
-                       ' If None defaults to `--train-samples`')
-    group.add_argument('--lr-decay-tokens', type=int, default=None,
-                       help='number of tokens to decay learning rate over,'
-                       ' If not None will override iter/sample-based decay')
-    group.add_argument('--lr-warmup-fraction', type=float, default=None,
-                       help='fraction of lr-warmup-(iters/samples) to use '
-                       'for warmup (as a float)')
-    group.add_argument('--lr-warmup-iters', type=int, default=0,
-                       help='number of iterations to linearly warmup '
-                       'learning rate over.')
-    group.add_argument('--lr-warmup-samples', type=int, default=0,
-                       help='number of samples to linearly warmup '
-                       'learning rate over.')
-    group.add_argument('--warmup', type=int, default=None,
-                       help='Old lr warmup argument, do not use. Use one of the'
-                       '--lr-warmup-* arguments above')
-    group.add_argument('--min-lr', type=float, default=0.0,
-                       help='Minumum value for learning rate. The scheduler'
-                       'clip values below this threshold.')
-    group.add_argument('--override-lr-scheduler', action='store_true',
-                       help='Reset the values of the scheduler (learning rate,'
-                       'warmup iterations, minimum learning rate, maximum '
-                       'number of iterations, and decay style from input '
-                       'arguments and ignore values from checkpoints. Note'
-                       'that all the above values will be reset.')
-    group.add_argument('--use-checkpoint-lr-scheduler', action='store_true',
-                       help='Use checkpoint to set the values of the scheduler '
-                       '(learning rate, warmup iterations, minimum learning '
-                       'rate, maximum number of iterations, and decay style '
-                       'from checkpoint and ignore input arguments.')
-    group.add_argument('--universal-checkpoint', action='store_true',
-                        help='Loading a universal format checkpoint.')
-
-    return parser
-
-
-def _add_checkpointing_args(parser):
-    group = parser.add_argument_group(title='checkpointing')
-
-    group.add_argument('--save', type=str, default=None,
-                       help='Output directory to save checkpoints to.')
-    group.add_argument('--save-interval', type=int, default=None,
-                       help='Number of iterations between checkpoint saves.')
-    group.add_argument('--no-save-optim', action='store_true', default=None,
-                       help='Do not save current optimizer.')
-    group.add_argument('--no-save-rng', action='store_true', default=None,
-                       help='Do not save current rng state.')
-    group.add_argument('--load', type=str, default=None,
-                       help='Directory containing a model checkpoint.')
-    group.add_argument('--no-load-optim', action='store_true', default=None,
-                       help='Do not load optimizer when loading checkpoint.')
-    group.add_argument('--no-load-rng', action='store_true', default=None,
-                       help='Do not load rng state when loading checkpoint.')
-    group.add_argument('--finetune', action='store_true',
-                       help='Load model for finetuning. Do not load optimizer '
-                       'or rng state from checkpoint and set iteration to 0. '
-                       'Assumed when loading a release checkpoint.')
-
-    return parser
-
-
-def _add_mixed_precision_args(parser):
-    group = parser.add_argument_group(title='mixed precision')
-
-    group.add_argument('--fp16', action='store_true',
-                       help='Run model in fp16 mode.')
-    group.add_argument('--bf16', action='store_true',
-                       help='Run model in bfloat16 mode.')
-    group.add_argument('--loss-scale', type=float, default=None,
-                       help='Static loss scaling, positive power of 2 '
-                       'values can improve fp16 convergence. If None, dynamic'
-                       'loss scaling is used.')
-    group.add_argument('--initial-loss-scale', type=float, default=2**32,
-                       help='Initial loss-scale for dynamic loss scaling.')
-    group.add_argument('--min-loss-scale', type=float, default=1.0,
-                       help='Minimum loss scale for dynamic loss scale.')
-    group.add_argument('--loss-scale-window', type=float, default=1000,
-                       help='Window over which to raise/lower dynamic scale.')
-    group.add_argument('--hysteresis', type=int, default=2,
-                       help='hysteresis for dynamic loss scaling')
-    group.add_argument('--fp32-residual-connection', action='store_true',
-                       help='Move residual connections to fp32.')
-    group.add_argument('--no-query-key-layer-scaling', action='store_false',
-                       help='Do not scale Q * K^T by 1 / layer-number.',
-                       dest='apply_query_key_layer_scaling')
-    group.add_argument('--attention-softmax-in-fp32', action='store_true',
-                       help='Run attention masking and softmax in fp32. '
-                       'This flag is ignored unless '
-                       '--no-query-key-layer-scaling is specified.')
-    group.add_argument('--accumulate-allreduce-grads-in-fp32',
-                       action='store_true',
-                       help='Gradient accumulation and all-reduce in fp32.')
-    group.add_argument('--fp16-lm-cross-entropy', action='store_true',
-                       help='Move the cross entropy unreduced loss calculation'
-                       'for lm head to fp16.')
-
-    return parser
-
-
-def _add_distributed_args(parser):
-    group = parser.add_argument_group(title='distributed')
-
-    group.add_argument('--tensor-model-parallel-size', type=int, default=1,
-                       help='Degree of tensor model parallelism.')
-    group.add_argument('--pipeline-model-parallel-size', type=int, default=1,
-                       help='Degree of pipeline model parallelism.')
-    group.add_argument('--model-parallel-size', type=int, default=None,
-                       help='Old model parallel argument, do not use. Use '
-                       '--tensor-model-parallel-size instead.')
-    group.add_argument('--num-layers-per-virtual-pipeline-stage', type=int, default=None,
-                       help='Number of layers per virtual pipeline stage')
-    group.add_argument('--distributed-backend', default='nccl',
-                       choices=['nccl', 'gloo'],
-                       help='Which backend to use for distributed training.')
-    group.add_argument('--DDP-impl', default='local',
-                       choices=['local', 'torch'],
-                       help='which DistributedDataParallel implementation '
-                       'to use.')
-    group.add_argument('--use-contiguous-buffers-in-ddp', action='store_true',
-                       help='If set, use contiguous buffer in DDP. Note that '
-                       'this option only works woth local DDP.' )
-    group.add_argument('--no-scatter-gather-tensors-in-pipeline', action='store_false',
-                       help='Use scatter/gather to optimize communication of tensors in pipeline',
-                       dest='scatter_gather_tensors_in_pipeline')
-    group.add_argument('--local_rank', type=int, default=None,
-                       help='local rank passed from distributed launcher.')
-    group.add_argument('--lazy-mpu-init', type=bool, required=False,
-                       help='If set to True, initialize_megatron() '
-                       'skips DDP initialization and returns function to '
-                       'complete it instead.Also turns on '
-                       '--use-cpu-initialization flag. This is for '
-                       'external DDP manager.' )
-    group.add_argument('--use-cpu-initialization', action='store_true',
-                       default=None, help='If set, affine parallel weights '
-                       'initialization uses CPU' )
-    group.add_argument('--rank', default=-1, type=int, help='node rank for distributed training')
-    group.add_argument('--dist_url', type=str, default="env://127.0.0.1:23456")
-    group.add_argument('--world_size', type=int, default=-1, help='number of nodes for distributed training')
-    group.add_argument('--dist_backend', default='nccl', type=str, help='distributed backend')
-    return parser
-
-
-def _add_validation_args(parser):
-    group = parser.add_argument_group(title='validation')
-
-    group.add_argument('--eval-iters', type=int, default=100,
-                       help='Number of iterations to run for evaluation'
-                       'validation/test for.')
-    group.add_argument('--eval-interval', type=int, default=1000,
-                       help='Interval between running evaluation on '
-                       'validation set.')
-
-    return parser
-
-
-def _add_data_args(parser):
-    group = parser.add_argument_group(title='data and dataloader')
-
-
-    # option 1 for data loading  (mutually exclusive with option2)
-    group.add_argument('--data-path', nargs='*', default=None,
-                       help='Path to the training dataset. Accepted format:'
-                       '1) a single data path, 2) multiple datasets in the'
-                       'form: dataset1-weight dataset1-path dataset2-weight '
-                       'dataset2-path ...')
-
-    group.add_argument('--split', type=str, default=None,
-                       help='Comma-separated list of proportions for training,'
-                       ' validation, and test split. For example the split '
-                       '`90,5,5` will use 90%% of data for training, 5%% for '
-                       'validation and 5%% for test.')
-
-    # option 2 for data loading (mutually exclusive with option1)
-
-    # helper class to parse the --xxx-weighted-split-paths
-    # note here two args are set: extra valid dataset paths and names
-    class parse_data_paths(argparse.Action):
-        def __call__(self, parser, args, values, option_string=None):
-
-            if option_string == "--train-weighted-split-paths":
-                assert len(values) == 1, 'Only 1 dataset group is allowed to'
-                'be passed for the argument --train-weighted-split-paths'
-
-            # make sure string given in the correct format
-            err_message = 'Each data group should be input on the following format'
-            '"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2"'
-            'where START < END'
-            for v in values:
-                # each prefix consists several datasets separated by commas
-                prefix = ":".join(v.split(":")[1:]) # remove GIVEN_NAME
-                datasets = prefix.split(",")
-                # check if each dataset is formatted like `WEIGHT START:END PATH`
-                for d in datasets:
-                    assert len(d.split()) == 3, err_message
-                    start, end = d.split()[1].split(":")
-                    assert float(start) < float(end), err_message
-
-            names = [v.split(":")[0] for v in values]
-
-            prefixes = [":".join(v.split(":")[1:]).strip() for v in values]
-            weights = [[d.split()[0] for d in p.split(",")] for p in prefixes]
-            splits = [[d.split()[1] for d in p.split(",")] for p in prefixes]
-            paths = [[d.split()[2] for d in p.split(",")] for p in prefixes]
-
-            # # to keep consistency with Option 1 of data loading (through --data-path)
-            # #  paths will contain strings on the following form
-            # # "WEIGHTS1 PATH1 WEIGHTS2 PATH2 WEIGHTS3 PATH3" for each dataset group
-            # # while data will be parsed in additional arguments below
-            # paths_option1_style = []
-            # for p, w in zip(paths, weights):
-            #   paths_option1_style.append(" ".join([f"{w_i} {p_i}" for p_i, w_i in zip(p,w)]))
-            # setattr(args, self.dest, paths_option1_style)
-            setattr(args, self.dest, paths)
-            setattr(args, self.dest.replace("paths", "weights"), weights)
-            setattr(args, self.dest.replace("paths", "splits"), splits)
-            setattr(args, self.dest.replace("paths","names"), names)
-
-
-    group.add_argument('--train-weighted-split-paths', nargs='*', default=None,
-                    help='Weights, splits and paths to groups of datasets'
-                    'Accepted format: ONE dataset groups could be'
-                    'submitted in the following form between double quotes'
-                    '"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2"'
-                    'e.g.: "NAME_ABC: 0.6 0:0.6 A, 0.3 0:1 B, 0.1 0:1 C" '
-                    'WEIGHT is used to up and down sample each dataset A,B,C in the group'
-                    'START:END indicates the split portion of the dataset',
-                    action=parse_data_paths)
-
-    group.add_argument('--valid-weighted-split-paths', nargs='*', default=None,
-                    help='Weights, splits and paths to groups of datasets'
-                    'Accepted format: one or many dataset groups could be'
-                    'submitted in the following form each between double quotes'
-                    '"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2"'
-                    'e.g.: "NAME_ABC: 0.6 0.6:0.8 A, 0.3 0:1 B, 0.1 0:1 C" '
-                    '"NAME_CDE: 0.6 0.6:0.8 C, 0.3 0:1 D, 0.1 0:1 E" '
-                    'validation will be run on each of those groups independently',
-                    action=parse_data_paths)
-
-    group.add_argument('--test-weighted-split-paths', nargs='*', default=None,
-                    help='Weights, splits and paths to groups of datasets'
-                    'Accepted format: one or many dataset groups could be'
-                    'submitted in the following form each between double quotes'
-                    '"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2"'
-                    'e.g.: "NAME_ABC: 0.6 0.6:0.8 A, 0.3 0:1 B, 0.1 0:1 C" '
-                    '"NAME_CDE: 0.6 0.6:0.8 C, 0.3 0:1 D, 0.1 0:1 E" '
-                    'test will be run on each of those groups independently',
-                    action=parse_data_paths)
-
-    class parse_data_paths_path(argparse.Action):
-        def __call__(self, parser, args, values, option_string=None):
-            expected_option_strings = ["--train-weighted-split-paths-path", "--valid-weighted-split-paths-path", "--test-weighted-split-paths-path"]
-            assert option_string in expected_option_strings, f"Expected {option_string} to be in {expected_option_strings}"
-
-            with open(values, "r") as fi:
-                lines = fi.readlines()
-                assert len(lines) == 1, f"Got multiple lines {len(lines)} instead of 1 expected"
-                assert lines[0][-2:] == "\"\n" and lines[0][0] == "\"", f"Invalid input format, got {lines}"
-                values = lines[0][1:-2].split("\" \"")
-                weighted_split_paths_dest = re.sub(r"_path$", "", self.dest)
-                weighted_split_paths_option = re.sub(r"-path$", "", self.option_strings[0])
-                setattr(args, weighted_split_paths_dest, values)
-                parse_data_paths(option_strings=[weighted_split_paths_option], dest=weighted_split_paths_dest)(parser, args, values, option_string=weighted_split_paths_option)
-
-
-    group.add_argument('--train-weighted-split-paths-path', type=str, action=parse_data_paths_path ,default=None)
-    group.add_argument('--valid-weighted-split-paths-path', type=str, action=parse_data_paths_path, default=None)
-    group.add_argument('--test-weighted-split-paths-path', type=str, action=parse_data_paths_path, default=None)
-
-    group.add_argument('--log-path', type=str, default=None,
-                       help='Path to the save arguments file.')
-    group.add_argument('--vocab-file', type=str, default=None,
-                       help='Path to the vocab file.')
-    group.add_argument('--merge-file', type=str, default=None,
-                       help='Path to the BPE merge file.')
-    group.add_argument('--vocab-extra-ids', type=int, default=0,
-                       help='Number of additional vocabulary tokens. '
-                            'They are used for span masking in the T5 model')
-    group.add_argument('--seq-length', type=int, default=None,
-                       help='Maximum sequence length to process.')
-    group.add_argument('--encoder-seq-length', type=int, default=None,
-                       help='Maximum encoder sequence length to process.'
-                       'This should be exclusive of --seq-length')
-    group.add_argument('--decoder-seq-length', type=int, default=None,
-                       help="Maximum decoder sequence length to process.")
-    group.add_argument('--retriever-seq-length', type=int, default=256,
-                       help='Maximum sequence length for the biencoder model '
-                        ' for retriever')
-    group.add_argument('--sample-rate', type=float, default=1.0,
-                       help='sample rate for training data. Supposed to be 0 '
-                            ' < sample_rate < 1')
-    group.add_argument('--mask-prob', type=float, default=0.15,
-                       help='Probability of replacing a token with mask.')
-    group.add_argument('--short-seq-prob', type=float, default=0.1,
-                       help='Probability of producing a short sequence.')
-    group.add_argument('--mmap-warmup', action='store_true',
-                       help='Warm up mmap files.')
-    group.add_argument('--num-workers', type=int, default=2,
-                       help="Dataloader number of workers.")
-    group.add_argument('--valid-num-workers', type=int, default=2,
-                       help="Dataloader number of workers for validation.")
-    group.add_argument('--tokenizer-type', type=str,
-                       default=None,
-                       choices=['BertWordPieceLowerCase',
-                                'BertWordPieceCase',
-                                'GPT2BPETokenizer',
-                                'PretrainedFromHF'],
-                       help='What type of tokenizer to use.')
-    group.add_argument("--tokenizer-name-or-path", type=str, default=None,
-                       help="Name or path of the huggingface tokenizer.")
-    group.add_argument('--data-impl', type=str, default='infer',
-                       choices=['lazy', 'cached', 'mmap', 'infer'],
-                       help='Implementation of indexed datasets.')
-    group.add_argument('--reset-position-ids', action='store_true',
-                       help='Reset posistion ids after end-of-document token.')
-    group.add_argument('--reset-attention-mask', action='store_true',
-                       help='Reset self attention maske after '
-                       'end-of-document token. Attention between tokens from different documents is null.')
-    group.add_argument('--eod-mask-loss', action='store_true',
-                       help='Mask loss for the end of document tokens.')
-    group.add_argument('--loss-on-targets-only', action='store_true',
-                       help='Mask loss on input sequence.')
-    group.add_argument('--reweight-loss-based-on-position-frequency', action="store_true",
-                       help='Some objectives require us to sample loss_mask. This might introduce bias towards '
-                       'specific positions. This option tries to un-bias the loss by reweighting loss on specific '
-                       'positions based on how frequently we train on that position.'
-                       'This is mostly used for prefix_lm training')
-    group.add_argument("--noise-density", type=float, default=None, help="Span corruption noise density")
-    group.add_argument("--mean-noise-span-length", type=int, default=None, help="Span corruption mean noise span length")
-
-
-    return parser
-
-
-def _add_autoresume_args(parser):
-    group = parser.add_argument_group(title='autoresume')
-
-    group.add_argument('--adlr-autoresume', action='store_true',
-                       help='Enable autoresume on adlr cluster.')
-    group.add_argument('--adlr-autoresume-interval', type=int, default=1000,
-                       help='Intervals over which check for autoresume'
-                       'termination signal')
-
-    return parser
-
-
-def _add_biencoder_args(parser):
-    group = parser.add_argument_group(title='biencoder')
-
-    # network size
-    group.add_argument('--ict-head-size', type=int, default=None,
-                       help='Size of block embeddings to be used in ICT and '
-                        'REALM (paper default: 128)')
-    group.add_argument('--biencoder-projection-dim', type=int, default=0,
-                       help='Size of projection head used in biencoder (paper'
-                        ' default: 128)')
-    group.add_argument('--biencoder-shared-query-context-model', action='store_true',
-                        help='Whether to share the parameters of the query '
-                        'and context models or not')
-
-    # checkpointing
-    group.add_argument('--ict-load', type=str, default=None,
-                       help='Directory containing an ICTBertModel checkpoint')
-    group.add_argument('--bert-load', type=str, default=None,
-                       help='Directory containing an BertModel checkpoint '
-                       '(needed to start ICT and REALM)')
-
-    # data
-    group.add_argument('--titles-data-path', type=str, default=None,
-                       help='Path to titles dataset used for ICT')
-    group.add_argument('--query-in-block-prob', type=float, default=0.1,
-                       help='Probability of keeping query in block for '
-                       'ICT dataset')
-    group.add_argument('--use-one-sent-docs', action='store_true',
-                       help='Whether to use one sentence documents in ICT')
-    group.add_argument('--evidence-data-path', type=str, default=None,
-                       help='Path to Wikipedia Evidence frm DPR paper')
-
-    # training
-    group.add_argument('--retriever-report-topk-accuracies', nargs='+', type=int,
-                        default=[], help="Which top-k accuracies to report "
-                        "(e.g. '1 5 20')")
-    group.add_argument('--retriever-score-scaling', action='store_true',
-                       help='Whether to scale retriever scores by inverse '
-                        'square root of hidden size')
-
-    # faiss index
-    group.add_argument('--block-data-path', type=str, default=None,
-                       help='Where to save/load BlockData to/from')
-    group.add_argument('--embedding-path', type=str, default=None,
-                       help='Where to save/load Open-Retrieval Embedding'
-                        ' data to/from')
-
-    # indexer
-    group.add_argument('--indexer-batch-size', type=int, default=128,
-                       help='How large of batches to use when doing indexing '
-                       'jobs')
-    group.add_argument('--indexer-log-interval', type=int, default=1000,
-                       help='After how many batches should the indexer '
-                       'report progress')
-    return parser
-
-
-def _add_vit_args(parser):
-    group = parser.add_argument_group(title="vit")
-
-    group.add_argument('--num-classes', type=int, default=1000,
-                       help='num of classes in vision classificaiton task')
-    group.add_argument('--img-dim', type=int, default=224,
-                       help='Image size for vision classification task')
-    group.add_argument('--num-channels', type=int, default=3,
-                       help='Number of channels in input image data')
-    group.add_argument('--patch-dim', type=int, default=16,
-                       help='patch dimension used in vit')
-
-    return parser
-
-
-def _add_zero_args(parser):
-    """Text generate arguments."""
-
-    group = parser.add_argument_group('ZeRO configurations', 'configurations')
-    group.add_argument("--zero-stage", type=int, default=1.0)
-    group.add_argument('--zero-reduce-scatter', action='store_true',
-                       help='Use reduce scatter if specified')
-    group.add_argument('--zero-contigious-gradients', action='store_true',
-                       help='Use contigious memory optimizaiton if specified')
-    group.add_argument("--zero-reduce-bucket-size", type=int, default=0.0)
-    group.add_argument("--zero-allgather-bucket-size", type=int, default=0.0)
-    group.add_argument('--remote-device', type=str, default='none', choices=['none', 'cpu', 'nvme'],
-                      help='Remote device for ZeRO-3 initialized parameters.')
-    group.add_argument('--use-pin-memory', action='store_true',
-                     help='Use pinned CPU memory for ZeRO-3 initialized model parameters.')
-    return parser
-
-def _add_memoryopt_args(parser):
-    """Memory optimization arguments."""
-
-    group = parser.add_argument_group('Memory optimizations', 'configurations')
-    group.add_argument("--scattered-embeddings", action='store_true',
-                       help='Save memory by scattering embedding activations. '
-                            'Introduces dropout differences across MP configurations.')
-    group.add_argument("--split-transformers", action='store_true',
-                       help='Save memory by splitting transformer layers into two parts, '
-                       'allowing for more frequent activation checkpoint savings.')
-    group.add_argument("--memory-centric-tiled-linear", action="store_true",
-                       help='Save memory by tiling with deepspeed.zero.TiledLinear.')
-    group.add_argument("--tile-factor", type=int, default=1,
-                       help='Make all linear layers the same size of [hidden/tile_factor, hidden/tile_factor]. '
-                            'Must be enabled with --memory-centric-tiled-linear. '
-                            'Example A: if tile_factor=1, the qkv layer [hidden, 3* hidden] would be converted into [1,3] tiles of size [hidden,hidden]. '
-                            'Example B: if tile_factor=2, the intermediate layer [4*hidden, hidden] will be converted into [8, 2] tiles of size [hidden/2, hidden/2]. '
-                            'Default is 1.')
-
-    return parser
-
-def _add_activation_checkpoint_args(parser):
-    group = parser.add_argument_group('Activation Checkpointing',
-                                      'Checkpointing Configurations')
-    group.add_argument('--deepspeed-activation-checkpointing', action='store_true',
-                       help='uses activation checkpointing from deepspeed')
-    group.add_argument('--partition-activations', action='store_true',
-                       help='partition Activations across GPUs before checkpointing.')
-    group.add_argument('--contigious-checkpointing', action='store_true',
-                       help='Contigious memory checkpointing for activatoins.')
-    group.add_argument('--checkpoint-in-cpu', action='store_true',
-                       help='Move the activation checkpoints to CPU.')
-    group.add_argument('--synchronize-each-layer', action='store_true',
-                       help='does a synchronize at the beginning and end of each checkpointed layer.')
-    group.add_argument('--profile-backward', action='store_true',
-                       help='Enables backward pass profiling for checkpointed layers.')
-    return parser
--- a/megatron/arguments.py-one_node
+++ b/megatron/arguments.py-one_node
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Megatron arguments."""
-
-import argparse
-import collections
-import os
-import re
-import time
-
-import torch
-import deepspeed
-
-from megatron.enums import PositionEmbeddingType
-import megatron
-from megatron.logging import log_levels
-
-
-def parse_args(extra_args_provider=None, defaults={},
-               ignore_unknown_args=False):
-    """Parse all arguments."""
-    parser = argparse.ArgumentParser(description='Megatron-LM Arguments',
-                                     allow_abbrev=False)
-
-    # Standard arguments.
-    parser = _add_network_size_args(parser)
-    parser = _add_regularization_args(parser)
-    parser = _add_training_args(parser)
-    parser = _add_initialization_args(parser)
-    parser = _add_learning_rate_args(parser)
-    parser = _add_checkpointing_args(parser)
-    parser = _add_mixed_precision_args(parser)
-    parser = _add_distributed_args(parser)
-    parser = _add_validation_args(parser)
-    parser = _add_data_args(parser)
-    parser = _add_autoresume_args(parser)
-    parser = _add_biencoder_args(parser)
-    parser = _add_vit_args(parser)
-    parser = _add_logging_args(parser)
-    parser = _add_zero_args(parser)
-    parser = _add_memoryopt_args(parser)
-    parser = _add_activation_checkpoint_args(parser)
-
-    # Custom arguments.
-    if extra_args_provider is not None:
-        parser = extra_args_provider(parser)
-
-    parser = deepspeed.add_config_arguments(parser)
-
-    # Parse.
-    if ignore_unknown_args:
-        args, _ = parser.parse_known_args()
-    else:
-        args = parser.parse_args()
-
-    # Distributed args.
-    args.rank = int(os.getenv('RANK', '0'))
-    args.world_size = int(os.getenv("WORLD_SIZE", '1'))
-    # Tensor model parallel size.
-    args.tensor_model_parallel_size = min(
-        args.tensor_model_parallel_size, args.world_size)
-    assert args.world_size % args.tensor_model_parallel_size == 0, 'world size'\
-        ' ({}) is not divisible by tensor model parallel size ({})'.format(
-            args.world_size, args.tensor_model_parallel_size)
-    # Pipeline model parallel size.
-    args.pipeline_model_parallel_size = min(
-        args.pipeline_model_parallel_size,
-        (args.world_size // args.tensor_model_parallel_size))
-    # Checks.
-    model_parallel_size = args.pipeline_model_parallel_size * \
-                          args.tensor_model_parallel_size
-    assert args.world_size % model_parallel_size == 0, 'world size is not'\
-        ' divisible by tensor parallel size ({}) times pipeline parallel ' \
-        'size ({})'.format(args.world_size, args.tensor_model_parallel_size,
-                           args.pipeline_model_parallel_size)
-    args.data_parallel_size = args.world_size // model_parallel_size
-    if args.rank == 0:
-        print('using world size: {}, data-parallel-size: {}, '
-              'tensor-model-parallel size: {}, '
-              'pipeline-model-parallel size: {} '.format(
-                  args.world_size, args.data_parallel_size,
-                  args.tensor_model_parallel_size,
-                  args.pipeline_model_parallel_size), flush=True)
-
-    # --data-path and --train-weighted-splits-paths
-    message = "Data loading Mode 1: --data-path and --split "\
-            "and Mode 2: --(train|valid|test)-weighted-split-paths"\
-            "are mutually exclusive i.e. cannot be set together."
-
-    if args.data_path:
-        assert args.train_weighted_split_paths is None, message
-        setattr(args, "valid_weighted_split_names", None)
-        setattr(args, "valid_weighted_split_weights", None)
-        setattr(args, "valid_weighted_split_splits", None)
-
-        setattr(args, "test_weighted_split_names", None)
-        setattr(args, "test_weighted_split_weights", None)
-        setattr(args, "test_weighted_split_splits", None)
-
-        # args.split default value in the args is None it is set here in order
-        # to check that it does not to overlap with the 2nd mode of data loading
-        if args.split is None:
-            args.split = "969, 30, 1"
-
-    if args.train_weighted_split_paths or args.valid_weighted_split_paths or \
-                args.test_weighted_split_paths:
-        assert args.data_path is None and args.split is None, message
-
-
-
-    # Deprecated arguments
-    assert args.batch_size is None, '--batch-size argument is no longer ' \
-        'valid, use --micro-batch-size instead'
-    del args.batch_size
-    assert args.warmup is None, '--warmup argument is no longer valid, use ' \
-        '--lr-warmup-fraction instead'
-    del args.warmup
-    assert args.model_parallel_size is None, '--model-parallel-size is no ' \
-        'longer valid, use --tensor-model-parallel-size instead'
-    del args.model_parallel_size
-
-    # Set input defaults.
-    for key in defaults:
-        # For default to be valid, it should not be provided in the
-        # arguments that are passed to the program. We check this by
-        # ensuring the arg is set to None.
-        if getattr(args, key) is not None:
-            if args.rank == 0:
-                print('WARNING: overriding default arguments for {key}:{v} \
-                       with {key}:{v2}'.format(key=key, v=defaults[key],
-                                               v2=getattr(args, key)),
-                                               flush=True)
-        else:
-            setattr(args, key, defaults[key])
-
-    # Batch size.
-    assert args.micro_batch_size is not None
-    assert args.micro_batch_size > 0
-    if args.global_batch_size is None:
-        args.global_batch_size = args.micro_batch_size * args.data_parallel_size
-        if args.rank == 0:
-            print('setting global batch size to {}'.format(
-                args.global_batch_size), flush=True)
-    assert args.global_batch_size > 0
-    if args.num_layers_per_virtual_pipeline_stage is not None:
-        assert args.pipeline_model_parallel_size > 2, \
-            'pipeline-model-parallel size should be greater than 2 with ' \
-            'interleaved schedule'
-        assert args.num_layers % args.num_layers_per_virtual_pipeline_stage == 0, \
-            'number of layers is not divisible by number of layers per virtual ' \
-            'pipeline stage'
-        args.virtual_pipeline_model_parallel_size = \
-            (args.num_layers // args.pipeline_model_parallel_size) // \
-            args.num_layers_per_virtual_pipeline_stage
-    else:
-        args.virtual_pipeline_model_parallel_size = None
-
-    # Parameters dtype.
-    args.params_dtype = torch.float
-    if args.fp16:
-        assert not args.bf16
-        args.params_dtype = torch.half
-    if args.bf16:
-        assert not args.fp16
-        args.params_dtype = torch.bfloat16
-        # bfloat16 requires gradient accumulation and all-reduce to
-        # be done in fp32.
-        if not args.accumulate_allreduce_grads_in_fp32:
-            args.accumulate_allreduce_grads_in_fp32 = True
-            if args.rank == 0:
-                print('accumulate and all-reduce gradients in fp32 for '
-                      'bfloat16 data type.', flush=True)
-
-    if args.rank == 0:
-        print('using {} for parameters ...'.format(args.params_dtype),
-              flush=True)
-
-    # If we do accumulation and all-reduces in fp32, we need to have
-    # local DDP and we should set the use-contiguous-buffers-in-ddp.
-    if args.accumulate_allreduce_grads_in_fp32:
-        assert args.DDP_impl == 'local'
-        args.use_contiguous_buffers_in_ddp = True
-
-    if args.dataloader_type is None:
-        args.dataloader_type = 'single'
-
-    # Consumed tokens.
-    args.consumed_train_samples = 0
-    args.consumed_valid_samples = 0
-    args.consumed_train_tokens = 0
-    args.gigaflos_no_embeds = 0
-
-    # Iteration-based training.
-    if args.train_iters:
-        # If we use iteration-based training, make sure the
-        # sample-based options are off.
-        assert args.train_samples is None, \
-            'expected iteration-based training'
-        assert args.lr_decay_samples is None, \
-            'expected iteration-based learning rate decay'
-        assert args.lr_warmup_samples == 0, \
-            'expected iteration-based learning rate warmup'
-        assert args.rampup_batch_size is None, \
-            'expected no batch-size rampup for iteration-based training'
-        if args.lr_warmup_fraction is not None:
-            assert args.lr_warmup_iters == 0, \
-                'can only specify one of lr-warmup-fraction and lr-warmup-iters'
-
-    # Sample-based training.
-    if args.train_samples:
-        # If we use sample-based training, make sure the
-        # iteration-based options are off.
-        assert args.train_iters is None, \
-            'expected sample-based training'
-        assert args.lr_decay_iters is None, \
-            'expected sample-based learning rate decay'
-        assert args.lr_warmup_iters == 0, \
-            'expected sample-based learnig rate warmup'
-        if args.lr_warmup_fraction is not None:
-            assert args.lr_warmup_samples == 0, \
-                'can only specify one of lr-warmup-fraction ' \
-                'and lr-warmup-samples'
-
-    # Check required arguments.
-    required_args = ['num_layers', 'hidden_size', 'num_attention_heads']
-    for req_arg in required_args:
-        _check_arg_is_not_none(args, req_arg)
-
-    # Checks.
-    if args.ffn_hidden_size is None:
-        args.ffn_hidden_size = 4 * args.hidden_size
-
-    if args.kv_channels is None:
-        assert args.hidden_size % args.num_attention_heads == 0
-        args.kv_channels = args.hidden_size // args.num_attention_heads
-
-    if args.seq_length is not None:
-        assert args.encoder_seq_length is None
-        args.encoder_seq_length = args.seq_length
-    else:
-        assert args.encoder_seq_length is not None
-        args.seq_length = args.encoder_seq_length
-
-    if args.position_embedding_type == PositionEmbeddingType.absolute or args.position_embedding_type == PositionEmbeddingType.alibi:
-        assert args.max_position_embeddings is not None
-        if args.seq_length is not None:
-            assert args.max_position_embeddings >= args.seq_length
-        if args.decoder_seq_length is not None:
-            assert args.max_position_embeddings >= args.decoder_seq_length
-    else:
-        assert args.max_position_embeddings is None
-
-    if args.lr is not None:
-        assert args.min_lr <= args.lr
-    if args.save is not None:
-        assert args.save_interval is not None
-    # Mixed precision checks.
-    if args.fp16_lm_cross_entropy:
-        assert args.fp16, 'lm cross entropy in fp16 only support in fp16 mode.'
-    if args.fp32_residual_connection:
-        assert args.fp16 or args.bf16, \
-            'residual connection in fp32 only supported when using fp16 or bf16.'
-    # Activation checkpointing.
-    if args.distribute_checkpointed_activations:
-        assert args.checkpoint_activations, \
-            'for distribute-checkpointed-activations to work you '\
-            'need to enable checkpoint-activations'
-
-    args.curriculum_learning = False
-
-    # Activation function
-    if args.glu_activation is not None and args.bias_gelu_fusion:
-        raise ValueError("if glu-activation is used, please set --no-bias-gelu-fusion")
-
-    # Skip train iterations
-    if args.skip_train_iteration_range is not None:
-        args.skip_train_iteration_range = [
-            list(map(int, range_.split("-"))) for range_ in args.skip_train_iteration_range
-        ]
-        args.skip_train_iteration_range.sort()
-        skip_train_iteration_range = collections.deque()
-        for range_ in args.skip_train_iteration_range:
-            if len(range_) == 2:
-                start, end = range_
-                assert end >= start, \
-                "end of skip range cannot be smaller than start of skip range"
-                # merge overlapping intervals (e.g. 1-5 2-6 -> 1-6)
-                if not skip_train_iteration_range:
-                    skip_train_iteration_range.append([start, end])
-                elif skip_train_iteration_range[-1][1] >= start:
-                    skip_train_iteration_range[-1][1] = max(end, skip_train_iteration_range[-1][1])
-                else:
-                    skip_train_iteration_range.append([start, end])
-            else:
-                raise ValueError(
-                    "skip train iterations should be specified as two numbers, i.e. start-end"
-                )
-        args.skip_train_iteration_range = skip_train_iteration_range
-
-    if args.use_bnb_optimizer:
-        try:
-            import bitsandbytes as bnb
-        except ModuleNotFoundError:
-            raise ModuleNotFoundError("Please install bitsandbytes from https://github.com/facebookresearch/bitsandbytes.")
-
-    _print_args(args)
-    return args
-
-
-def _print_args(args):
-    """Print arguments."""
-    if args.rank == 0:
-        print('------------------------ arguments ------------------------',
-              flush=True)
-        str_list = []
-        for arg in vars(args):
-            dots = '.' * (48 - len(arg))
-            str_list.append('  {} {} {}'.format(arg, dots, getattr(args, arg)))
-
-        if args.log_path is not None:
-            with open(os.path.join(args.log_path,f'args_{time.strftime("%Y-%m-%dT%H:%M:%S")}.txt'), 'w') as f:
-                for arg in sorted(str_list, key=lambda x: x.lower()):
-                    f.write(arg+"\n")
-                    print(arg, flush=True)
-        else:
-            for arg in sorted(str_list, key=lambda x: x.lower()):
-                print(arg, flush=True)
-        print('-------------------- end of arguments ---------------------',
-              flush=True)
-
-
-def _check_arg_is_not_none(args, arg):
-    assert getattr(args, arg) is not None, '{} argument is None'.format(arg)
-
-
-def _add_network_size_args(parser):
-    group = parser.add_argument_group(title='network size')
-
-    group.add_argument('--num-layers', type=int, default=None,
-                       help='Number of transformer layers.')
-    group.add_argument('--hidden-size', type=int, default=None,
-                       help='Tansformer hidden size.')
-    group.add_argument('--ffn-hidden-size', type=int, default=None,
-                       help='Transformer Feed-Forward Network hidden size. '
-                       'This is set to 4*hidden-size if not provided')
-    group.add_argument('--num-attention-heads', type=int, default=None,
-                       help='Number of transformer attention heads.')
-    group.add_argument('--kv-channels', type=int, default=None,
-                       help='Projection weights dimension in multi-head '
-                       'attention. This is set to '
-                       '   args.hidden_size // args.num_attention_heads '
-                       'if not provided.')
-    group.add_argument('--max-position-embeddings', type=int, default=None,
-                       help='Maximum number of position embeddings to use. '
-                       'This is the size of position embedding.')
-    group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
-                       help='Pad the vocab size to be divisible by this value.'
-                       'This is added for computational efficieny reasons.')
-    group.add_argument('--pad-vocab-size-to', type=int, default=None,
-                       help='Pad the vocab size to this value.'
-                       'This value must be greater than the initial size of the tokenizer'
-                       ', needs to be divisible by TP size and `make-vocab-size-divisible-by`.')
-    group.add_argument('--layernorm-epsilon', type=float, default=1e-5,
-                       help='Layer norm epsilon.')
-    group.add_argument('--sync-tp-duplicated-parameters', action='store_true',
-                       help='Force syncing duplicated params across TP ranks in forward. '
-                       'This is a workaround for an unresolved bug leading to TP ranks '
-                       'getting out of sync with each other.')
-    group.add_argument('--apply-residual-connection-post-layernorm',
-                       action='store_true',
-                       help='If set, use original BERT residula connection '
-                       'ordering.')
-    group.add_argument('--embed-layernorm', action='store_true',
-                       help='use layernorm for embedding')
-    group.add_argument('--openai-gelu', action='store_true',
-                       help='Use OpenAIs GeLU implementation. This option'
-                       'should not be used unless for backward compatibility'
-                       'reasons.')
-    group.add_argument('--onnx-safe', type=bool, required=False,
-                       help='Use workarounds for known problems with '
-                       'Torch ONNX exporter')
-    group.add_argument('--bert-no-binary-head', action='store_false',
-                       help='Disable BERT binary head.',
-                       dest='bert_binary_head')
-    group.add_argument('--position-embedding-type', type=lambda x: PositionEmbeddingType[x],
-                       choices=list(PositionEmbeddingType),
-                       default=PositionEmbeddingType.absolute,
-                       help='Define position embedding type ("absolute" | "rotary" | "alibi"). "absolute" by default.'
-                       )
-    group.add_argument('--glu-activation', type=str,
-                       choices=megatron.model.glu_activations.GLU_ACTIVATIONS.keys(),
-                       help='GLU activations to use.'
-                       )
-
-    group.add_argument('--kill-switch-path', type=str,
-                       help='path to look for a kill switch, which if found will automatically exit the program'
-                       )
-
-
-    group.add_argument('--log-level', type=str, choices=list(log_levels.keys()),
-                       help="Logger log level to use on the main process. Possible choices are the log levels as strings: 'debug', "
-                       "'info', 'warning', 'error' and 'critical', plus a 'passive' level which doesn't set anything and lets the "
-                       "application set the level."
-                       )
-    group.add_argument('--log-level-replica', type=str, choices=list(log_levels.keys()),
-                       help="Logger log level to use on replicas. Same choices as ``log_level``"
-                       )
-    return parser
-
-
-def _add_logging_args(parser):
-    group = parser.add_argument_group(title='logging')
-
-    group.add_argument('--log-params-norm', action='store_true',
-                       help='If set, calculate and log parameters norm.')
-    group.add_argument('--log-num-zeros-in-grad', action='store_true',
-                       help='If set, calculate and log the number of zeros in gradient.')
-    group.add_argument('--tensorboard-log-interval', type=int, default=1,
-                       help='Report to tensorboard interval.')
-    group.add_argument('--tensorboard-queue-size', type=int, default=1000,
-                       help='Size of the tensorboard queue for pending events '
-                       'and summaries before one of the ‘add’ calls forces a '
-                       'flush to disk.')
-    group.add_argument('--log-timers-to-tensorboard', action='store_true',
-                       help='If set, write timers to tensorboard.')
-    group.add_argument('--log-batch-size-to-tensorboard', action='store_true',
-                       help='If set, write batch-size to tensorboard.')
-    group.add_argument('--no-log-learnig-rate-to-tensorboard',
-                       action='store_false',
-                       help='Disable learning rate logging to tensorboard.',
-                       dest='log_learning_rate_to_tensorboard')
-    group.add_argument('--no-log-loss-scale-to-tensorboard',
-                       action='store_false',
-                       help='Disable loss-scale logging to tensorboard.',
-                       dest='log_loss_scale_to_tensorboard')
-    group.add_argument('--log-validation-ppl-to-tensorboard',
-                       action='store_true',
-                       help='If set, write validation perplexity to '
-                       'tensorboard.')
-
-    return parser
-
-
-def _add_regularization_args(parser):
-    group = parser.add_argument_group(title='regularization')
-
-    group.add_argument('--attention-dropout', type=float, default=0.1,
-                       help='Post attention dropout probability.')
-    group.add_argument('--hidden-dropout', type=float, default=0.1,
-                       help='Dropout probability for hidden state transformer.')
-    group.add_argument('--weight-decay', type=float, default=0.01,
-                       help='Weight decay coefficient for L2 regularization.')
-    group.add_argument('--clip-grad', type=float, default=1.0,
-                       help='Gradient clipping based on global L2 norm.')
-    group.add_argument('--adam-beta1', type=float, default=0.9,
-                       help='First coefficient for computing running averages '
-                       'of gradient and its square')
-    group.add_argument('--adam-beta2', type=float, default=0.999,
-                       help='Second coefficient for computing running averages '
-                       'of gradient and its square')
-    group.add_argument('--adam-eps', type=float, default=1e-08,
-                       help='Term added to the denominator to improve'
-                       'numerical stability')
-    group.add_argument('--sgd-momentum', type=float, default=0.9,
-                       help='Momentum factor for sgd')
-
-    return parser
-
-
-def _add_training_args(parser):
-    group = parser.add_argument_group(title='training')
-
-    group.add_argument('--micro-batch-size', type=int, default=None,
-                       help='Batch size per model instance (local batch size). '
-                       'Global batch size is local batch size times data '
-                       'parallel size times number of micro batches.')
-    group.add_argument('--batch-size', type=int, default=None,
-                       help='Old batch size parameter, do not use. '
-                       'Use --micro-batch-size instead')
-    group.add_argument('--global-batch-size', type=int, default=None,
-                       help='Training batch size. If set, it should be a '
-                       'multiple of micro-batch-size times data-parallel-size. '
-                       'If this value is None, then '
-                       'use micro-batch-size * data-parallel-size as the '
-                       'global batch size. This choice will result in 1 for '
-                       'number of micro-batches.')
-    group.add_argument('--rampup-batch-size', nargs='*', default=None,
-                       help='Batch size ramp up with the following values:'
-                       '  --rampup-batch-size <start batch size> '
-                       '                      <batch size increment> '
-                       '                      <ramp-up samples> '
-                       'For example: '
-                       '   --rampup-batch-size 16 8 300000 '
-                       '   --global-batch-size 1024 '
-                       'will start with global batch size 16 and over '
-                       ' (1024 - 16) / 8 = 126 intervals will increase '
-                       'the batch size linearly to 1024. In each interval '
-                       'we will use approximately 300000 / 126 = 2380 samples.')
-    group.add_argument('--checkpoint-activations', action='store_true',
-                       help='Checkpoint activation to allow for training '
-                       'with larger models, sequences, and batch sizes.')
-    group.add_argument('--distribute-checkpointed-activations',
-                       action='store_true',
-                       help='If set, distribute checkpointed activations '
-                       'across model parallel group.')
-    group.add_argument('--checkpoint-num-layers', type=int, default=1,
-                       help='chunk size (number of layers) for checkpointing.')
-    group.add_argument('--train-iters', type=int, default=None,
-                       help='Total number of iterations to train over all '
-                       'training runs. Note that either train-iters or '
-                       'train-samples should be provided.')
-    group.add_argument('--train-samples', type=int, default=None,
-                       help='Total number of samples to train over all '
-                       'training runs. Note that either train-iters or '
-                       'train-samples should be provided.')
-    group.add_argument('--train-tokens', type=int, default=None,
-                       help='Total number of tokens to train over all '
-                       'training runs.')
-    group.add_argument('--log-interval', type=int, default=100,
-                       help='Report loss and timing interval.')
-    group.add_argument('--exit-interval', type=int, default=None,
-                       help='Exit the program after the iteration is divisible '
-                       'by this value.')
-    group.add_argument('--exit-duration-in-mins', type=int, default=None,
-                       help='Exit the program after this many minutes.')
-    group.add_argument('--tensorboard-dir', type=str, default=None,
-                       help='Write TensorBoard logs to this directory.')
-    group.add_argument('--no-masked-softmax-fusion',
-                       action='store_false',
-                       help='Disable fusion of query_key_value scaling, '
-                       'masking, and softmax.',
-                       dest='masked_softmax_fusion')
-    group.add_argument('--no-bias-gelu-fusion', action='store_false',
-                       help='Disable bias and gelu fusion.',
-                       dest='bias_gelu_fusion')
-    group.add_argument('--no-bias-dropout-fusion', action='store_false',
-                       help='Disable bias and dropout fusion.',
-                       dest='bias_dropout_fusion')
-    group.add_argument('--optimizer', type=str, default='adam',
-                       choices=['adam', 'sgd'],
-                       help='Optimizer function')
-    group.add_argument('--use-bnb-optimizer', action='store_true',
-                       help='Use bitsandbytes optimizer for efficient training,'
-                       'please refer https://github.com/facebookresearch/bitsandbytes.',
-                       dest='use_bnb_optimizer')
-    group.add_argument('--dataloader-type', type=str, default=None,
-                       choices=['single', 'cyclic'],
-                       help='Single pass vs multiple pass data loader')
-    group.add_argument('--cpu-optimizer', action='store_true',
-                       help='Run optimizer on CPU')
-    group.add_argument('--cpu_torch_adam', action='store_true',
-                       help='Use Torch Adam as optimizer on CPU.')
-    group.add_argument('--codecarbon-dir', type=str, default=None,
-                       help='Write CodeCarbon logs to this directory.')
-    group.add_argument('--eval-only', type=bool, required=False,
-                       help='If set to True, no train step will be performed.'
-                       'and only the evaluation on the `valid` and `test` sets '
-                       'will be performed' )
-    group.add_argument('--skip-train-iteration-range', type=str, nargs='+', default=None,
-                       help='Iteration ranges to skip. The values are one or more dash-separated ranges. e.g., 101-200 251-300.')
-    group.add_argument('--inference', action='store_true',
-                       help='Very basic inference mode: not allocating optim/lr - requires ZERO_STAGE=0')
-    group.add_argument('--abort-on-unmet-fused-kernel-constraints', action='store_true',
-                       help="If set to True, the program will abort if the constraints for loading a fused kernel aren't met")
-    group.add_argument('--pp-partition-method', type=str, default=None,
-                       help="Use to override the pipeline stages partitioning method. e.g., 'type:transformer|embedding'")
-
-    return parser
-
-
-def _add_initialization_args(parser):
-    group = parser.add_argument_group(title='initialization')
-
-    group.add_argument('--seed', type=int, default=1234,
-                       help='Random seed used for python, numpy, '
-                       'pytorch, and cuda.')
-    group.add_argument('--init-method-std', type=float, default=0.02,
-                       help='Standard deviation of the zero mean normal '
-                       'distribution used for weight initialization.')
-    group.add_argument('--init-method-xavier-uniform', action='store_true',
-                       help='Enable Xavier uniform parameter initialization')
-
-    return parser
-
-
-def _add_learning_rate_args(parser):
-    """Register learning-rate, decay and warmup options on ``parser``.
-
-    Returns the same parser so calls can be chained.
-    """
-    group = parser.add_argument_group(title='learning rate')
-
-    group.add_argument('--lr', type=float, default=None,
-                       help='Initial learning rate. Depending on decay style '
-                       'and initial warmup, the learning rate at each '
-                       'iteration would be different.')
-    group.add_argument('--lr-decay-style', type=str, default='linear',
-                       choices=['constant', 'linear', 'cosine'],
-                       help='Learning rate decay function.')
-    group.add_argument('--lr-decay-iters', type=int, default=None,
-                       help='number of iterations to decay learning rate over,'
-                       ' If None defaults to `--train-iters`')
-    group.add_argument('--lr-decay-samples', type=int, default=None,
-                       help='number of samples to decay learning rate over,'
-                       ' If None defaults to `--train-samples`')
-    group.add_argument('--lr-decay-tokens', type=int, default=None,
-                       help='number of tokens to decay learning rate over,'
-                       ' If not None will override iter/sample-based decay')
-    group.add_argument('--lr-warmup-fraction', type=float, default=None,
-                       help='fraction of lr-warmup-(iters/samples) to use '
-                       'for warmup (as a float)')
-    group.add_argument('--lr-warmup-iters', type=int, default=0,
-                       help='number of iterations to linearly warmup '
-                       'learning rate over.')
-    group.add_argument('--lr-warmup-samples', type=int, default=0,
-                       help='number of samples to linearly warmup '
-                       'learning rate over.')
-    # Adjacent string literals are concatenated verbatim, so every fragment
-    # below carries its own trailing space.
-    group.add_argument('--warmup', type=int, default=None,
-                       help='Old lr warmup argument, do not use. Use one of the '
-                       '--lr-warmup-* arguments above')
-    group.add_argument('--min-lr', type=float, default=0.0,
-                       help='Minimum value for learning rate. The scheduler '
-                       'clips values below this threshold.')
-    group.add_argument('--override-lr-scheduler', action='store_true',
-                       help='Reset the values of the scheduler (learning rate, '
-                       'warmup iterations, minimum learning rate, maximum '
-                       'number of iterations, and decay style) from input '
-                       'arguments and ignore values from checkpoints. Note '
-                       'that all the above values will be reset.')
-    group.add_argument('--use-checkpoint-lr-scheduler', action='store_true',
-                       help='Use checkpoint to set the values of the scheduler '
-                       '(learning rate, warmup iterations, minimum learning '
-                       'rate, maximum number of iterations, and decay style) '
-                       'from checkpoint and ignore input arguments.')
-    group.add_argument('--universal-checkpoint', action='store_true',
-                       help='Loading a universal format checkpoint.')
-
-    return parser
-
-
-def _add_checkpointing_args(parser):
-    """Register checkpoint save/load options on ``parser`` and return it."""
-    ckpt = parser.add_argument_group(title='checkpointing')
-
-    # Saving.
-    ckpt.add_argument(
-        '--save', type=str, default=None,
-        help='Output directory to save checkpoints to.')
-    ckpt.add_argument(
-        '--save-interval', type=int, default=None,
-        help='Number of iterations between checkpoint saves.')
-    ckpt.add_argument(
-        '--no-save-optim', action='store_true', default=None,
-        help='Do not save current optimizer.')
-    ckpt.add_argument(
-        '--no-save-rng', action='store_true', default=None,
-        help='Do not save current rng state.')
-
-    # Loading.
-    ckpt.add_argument(
-        '--load', type=str, default=None,
-        help='Directory containing a model checkpoint.')
-    ckpt.add_argument(
-        '--no-load-optim', action='store_true', default=None,
-        help='Do not load optimizer when loading checkpoint.')
-    ckpt.add_argument(
-        '--no-load-rng', action='store_true', default=None,
-        help='Do not load rng state when loading checkpoint.')
-    ckpt.add_argument(
-        '--finetune', action='store_true',
-        help=('Load model for finetuning. Do not load optimizer or rng state '
-              'from checkpoint and set iteration to 0. Assumed when loading '
-              'a release checkpoint.'))
-
-    return parser
-
-
-def _add_mixed_precision_args(parser):
-    """Register fp16/bf16 and loss-scaling options on ``parser``.
-
-    Returns the same parser so calls can be chained.
-    """
-    group = parser.add_argument_group(title='mixed precision')
-
-    group.add_argument('--fp16', action='store_true',
-                       help='Run model in fp16 mode.')
-    group.add_argument('--bf16', action='store_true',
-                       help='Run model in bfloat16 mode.')
-    group.add_argument('--loss-scale', type=float, default=None,
-                       help='Static loss scaling, positive power of 2 '
-                       'values can improve fp16 convergence. If None, dynamic '
-                       'loss scaling is used.')
-    group.add_argument('--initial-loss-scale', type=float, default=2**32,
-                       help='Initial loss-scale for dynamic loss scaling.')
-    group.add_argument('--min-loss-scale', type=float, default=1.0,
-                       help='Minimum loss scale for dynamic loss scale.')
-    group.add_argument('--loss-scale-window', type=float, default=1000,
-                       help='Window over which to raise/lower dynamic scale.')
-    group.add_argument('--hysteresis', type=int, default=2,
-                       help='hysteresis for dynamic loss scaling')
-    group.add_argument('--fp32-residual-connection', action='store_true',
-                       help='Move residual connections to fp32.')
-    # store_false: the attribute defaults to True and the flag turns it off.
-    group.add_argument('--no-query-key-layer-scaling', action='store_false',
-                       help='Do not scale Q * K^T by 1 / layer-number.',
-                       dest='apply_query_key_layer_scaling')
-    group.add_argument('--attention-softmax-in-fp32', action='store_true',
-                       help='Run attention masking and softmax in fp32. '
-                       'This flag is ignored unless '
-                       '--no-query-key-layer-scaling is specified.')
-    group.add_argument('--accumulate-allreduce-grads-in-fp32',
-                       action='store_true',
-                       help='Gradient accumulation and all-reduce in fp32.')
-    group.add_argument('--fp16-lm-cross-entropy', action='store_true',
-                       help='Move the cross entropy unreduced loss calculation '
-                       'for lm head to fp16.')
-
-    return parser
-
-
-def _add_distributed_args(parser):
-    """Register tensor/pipeline parallelism and DDP options on ``parser``.
-
-    Returns the same parser so calls can be chained.
-    """
-    group = parser.add_argument_group(title='distributed')
-
-    group.add_argument('--tensor-model-parallel-size', type=int, default=1,
-                       help='Degree of tensor model parallelism.')
-    group.add_argument('--pipeline-model-parallel-size', type=int, default=1,
-                       help='Degree of pipeline model parallelism.')
-    group.add_argument('--model-parallel-size', type=int, default=None,
-                       help='Old model parallel argument, do not use. Use '
-                       '--tensor-model-parallel-size instead.')
-    group.add_argument('--num-layers-per-virtual-pipeline-stage', type=int, default=None,
-                       help='Number of layers per virtual pipeline stage')
-    group.add_argument('--distributed-backend', default='nccl',
-                       choices=['nccl', 'gloo'],
-                       help='Which backend to use for distributed training.')
-    group.add_argument('--DDP-impl', default='local',
-                       choices=['local', 'torch'],
-                       help='which DistributedDataParallel implementation '
-                       'to use.')
-    group.add_argument('--use-contiguous-buffers-in-ddp', action='store_true',
-                       help='If set, use contiguous buffer in DDP. Note that '
-                       'this option only works with local DDP.' )
-    # store_false: scatter/gather stays enabled unless the flag is given.
-    group.add_argument('--no-scatter-gather-tensors-in-pipeline', action='store_false',
-                       help='Use scatter/gather to optimize communication of tensors in pipeline',
-                       dest='scatter_gather_tensors_in_pipeline')
-    group.add_argument('--local_rank', type=int, default=None,
-                       help='local rank passed from distributed launcher.')
-    # NOTE(review): argparse applies bool() to the raw string, so any
-    # non-empty value (including "False") parses as True. Left unchanged to
-    # keep existing command lines working; confirm whether callers rely on it.
-    group.add_argument('--lazy-mpu-init', type=bool, required=False,
-                       help='If set to True, initialize_megatron() '
-                       'skips DDP initialization and returns function to '
-                       'complete it instead. Also turns on '
-                       '--use-cpu-initialization flag. This is for '
-                       'external DDP manager.' )
-    group.add_argument('--use-cpu-initialization', action='store_true',
-                       default=None, help='If set, affine parallel weights '
-                       'initialization uses CPU' )
-    #group.add_argument('--rank', default=-1, type=int, help='node rank for distributed training')
-    #group.add_argument('--dist_url', type=str, default="env://127.0.0.1:23456")
-    #group.add_argument('--world_size', type=int, default=-1, help='number of nodes for distributed training')
-    #group.add_argument('--dist_backend', default='nccl', type=str, help='distributed backend')
-    return parser
-
-
-def _add_validation_args(parser):
-    """Register evaluation length/interval options on ``parser`` and return it."""
-    group = parser.add_argument_group(title='validation')
-
-    group.add_argument('--eval-iters', type=int, default=100,
-                       help='Number of iterations to run for evaluation '
-                       'validation/test for.')
-    group.add_argument('--eval-interval', type=int, default=1000,
-                       help='Interval between running evaluation on '
-                       'validation set.')
-
-    return parser
-
-
-def _add_data_args(parser):
-    """Register dataset, tokenizer and dataloader options on ``parser``.
-
-    Two ways of naming the data are offered and are documented as mutually
-    exclusive: ``--data-path``/``--split`` (option 1) and the
-    ``--{train,valid,test}-weighted-split-paths`` family (option 2), which can
-    also be read from a file through the ``*-paths-path`` variants.
-
-    Returns the same parser so calls can be chained.
-    """
-    group = parser.add_argument_group(title='data and dataloader')
-
-
-    # option 1 for data loading  (mutually exclusive with option2)
-    group.add_argument('--data-path', nargs='*', default=None,
-                       help='Path to the training dataset. Accepted format: '
-                       '1) a single data path, 2) multiple datasets in the '
-                       'form: dataset1-weight dataset1-path dataset2-weight '
-                       'dataset2-path ...')
-
-    group.add_argument('--split', type=str, default=None,
-                       help='Comma-separated list of proportions for training,'
-                       ' validation, and test split. For example the split '
-                       '`90,5,5` will use 90%% of data for training, 5%% for '
-                       'validation and 5%% for test.')
-
-    # option 2 for data loading (mutually exclusive with option1)
-
-    # helper class to parse the --xxx-weighted-split-paths
-    # note here two args are set: extra valid dataset paths and names
-    class parse_data_paths(argparse.Action):
-        def __call__(self, parser, args, values, option_string=None):
-
-            # The messages are parenthesised so that every fragment is part of
-            # the same string; bare continuation literals on following lines
-            # would be separate no-op statements and get dropped.
-            if option_string == "--train-weighted-split-paths":
-                assert len(values) == 1, (
-                    'Only 1 dataset group is allowed to '
-                    'be passed for the argument --train-weighted-split-paths')
-
-            # make sure string given in the correct format
-            err_message = (
-                'Each data group should be input on the following format '
-                '"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2" '
-                'where START < END')
-            for v in values:
-                # each prefix consists several datasets separated by commas
-                prefix = ":".join(v.split(":")[1:]) # remove GIVEN_NAME
-                datasets = prefix.split(",")
-                # check if each dataset is formatted like `WEIGHT START:END PATH`
-                for d in datasets:
-                    assert len(d.split()) == 3, err_message
-                    start, end = d.split()[1].split(":")
-                    assert float(start) < float(end), err_message
-
-            names = [v.split(":")[0] for v in values]
-
-            prefixes = [":".join(v.split(":")[1:]).strip() for v in values]
-            weights = [[d.split()[0] for d in p.split(",")] for p in prefixes]
-            splits = [[d.split()[1] for d in p.split(",")] for p in prefixes]
-            paths = [[d.split()[2] for d in p.split(",")] for p in prefixes]
-
-            # # to keep consistency with Option 1 of data loading (through --data-path)
-            # #  paths will contain strings on the following form
-            # # "WEIGHTS1 PATH1 WEIGHTS2 PATH2 WEIGHTS3 PATH3" for each dataset group
-            # # while data will be parsed in additional arguments below
-            # paths_option1_style = []
-            # for p, w in zip(paths, weights):
-            #   paths_option1_style.append(" ".join([f"{w_i} {p_i}" for p_i, w_i in zip(p,w)]))
-            # setattr(args, self.dest, paths_option1_style)
-            # Besides the option's own dest, three sibling attributes are set
-            # by swapping "paths" in the dest name for weights/splits/names.
-            setattr(args, self.dest, paths)
-            setattr(args, self.dest.replace("paths", "weights"), weights)
-            setattr(args, self.dest.replace("paths", "splits"), splits)
-            setattr(args, self.dest.replace("paths","names"), names)
-
-
-    group.add_argument('--train-weighted-split-paths', nargs='*', default=None,
-                    help='Weights, splits and paths to groups of datasets. '
-                    'Accepted format: ONE dataset group could be '
-                    'submitted in the following form between double quotes '
-                    '"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2" '
-                    'e.g.: "NAME_ABC: 0.6 0:0.6 A, 0.3 0:1 B, 0.1 0:1 C" '
-                    'WEIGHT is used to up and down sample each dataset A,B,C in the group. '
-                    'START:END indicates the split portion of the dataset',
-                    action=parse_data_paths)
-
-    group.add_argument('--valid-weighted-split-paths', nargs='*', default=None,
-                    help='Weights, splits and paths to groups of datasets. '
-                    'Accepted format: one or many dataset groups could be '
-                    'submitted in the following form each between double quotes '
-                    '"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2" '
-                    'e.g.: "NAME_ABC: 0.6 0.6:0.8 A, 0.3 0:1 B, 0.1 0:1 C" '
-                    '"NAME_CDE: 0.6 0.6:0.8 C, 0.3 0:1 D, 0.1 0:1 E" '
-                    'validation will be run on each of those groups independently',
-                    action=parse_data_paths)
-
-    group.add_argument('--test-weighted-split-paths', nargs='*', default=None,
-                    help='Weights, splits and paths to groups of datasets. '
-                    'Accepted format: one or many dataset groups could be '
-                    'submitted in the following form each between double quotes '
-                    '"GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2" '
-                    'e.g.: "NAME_ABC: 0.6 0.6:0.8 A, 0.3 0:1 B, 0.1 0:1 C" '
-                    '"NAME_CDE: 0.6 0.6:0.8 C, 0.3 0:1 D, 0.1 0:1 E" '
-                    'test will be run on each of those groups independently',
-                    action=parse_data_paths)
-
-    # Reads the quoted dataset groups from a one-line file and hands them to
-    # parse_data_paths as if they had been given on the command line.
-    class parse_data_paths_path(argparse.Action):
-        def __call__(self, parser, args, values, option_string=None):
-            expected_option_strings = ["--train-weighted-split-paths-path", "--valid-weighted-split-paths-path", "--test-weighted-split-paths-path"]
-            assert option_string in expected_option_strings, f"Expected {option_string} to be in {expected_option_strings}"
-
-            with open(values, "r") as fi:
-                lines = fi.readlines()
-                assert len(lines) == 1, f"Got multiple lines {len(lines)} instead of 1 expected"
-                # The single line must start with a double quote and end with
-                # a double quote followed by a newline.
-                assert lines[0][-2:] == "\"\n" and lines[0][0] == "\"", f"Invalid input format, got {lines}"
-                values = lines[0][1:-2].split("\" \"")
-                # Strip the trailing "-path"/"_path" to address the sibling option.
-                weighted_split_paths_dest = re.sub(r"_path$", "", self.dest)
-                weighted_split_paths_option = re.sub(r"-path$", "", self.option_strings[0])
-                setattr(args, weighted_split_paths_dest, values)
-                parse_data_paths(option_strings=[weighted_split_paths_option], dest=weighted_split_paths_dest)(parser, args, values, option_string=weighted_split_paths_option)
-
-
-    group.add_argument('--train-weighted-split-paths-path', type=str, action=parse_data_paths_path ,default=None)
-    group.add_argument('--valid-weighted-split-paths-path', type=str, action=parse_data_paths_path, default=None)
-    group.add_argument('--test-weighted-split-paths-path', type=str, action=parse_data_paths_path, default=None)
-
-    group.add_argument('--log-path', type=str, default=None,
-                       help='Path to the save arguments file.')
-    group.add_argument('--vocab-file', type=str, default=None,
-                       help='Path to the vocab file.')
-    group.add_argument('--merge-file', type=str, default=None,
-                       help='Path to the BPE merge file.')
-    group.add_argument('--vocab-extra-ids', type=int, default=0,
-                       help='Number of additional vocabulary tokens. '
-                            'They are used for span masking in the T5 model')
-    group.add_argument('--seq-length', type=int, default=None,
-                       help='Maximum sequence length to process.')
-    group.add_argument('--encoder-seq-length', type=int, default=None,
-                       help='Maximum encoder sequence length to process. '
-                       'This should be exclusive of --seq-length')
-    group.add_argument('--decoder-seq-length', type=int, default=None,
-                       help="Maximum decoder sequence length to process.")
-    group.add_argument('--retriever-seq-length', type=int, default=256,
-                       help='Maximum sequence length for the biencoder model '
-                        'for retriever')
-    group.add_argument('--sample-rate', type=float, default=1.0,
-                       help='sample rate for training data. Supposed to be '
-                            '0 < sample_rate < 1')
-    group.add_argument('--mask-prob', type=float, default=0.15,
-                       help='Probability of replacing a token with mask.')
-    group.add_argument('--short-seq-prob', type=float, default=0.1,
-                       help='Probability of producing a short sequence.')
-    group.add_argument('--mmap-warmup', action='store_true',
-                       help='Warm up mmap files.')
-    group.add_argument('--num-workers', type=int, default=2,
-                       help="Dataloader number of workers.")
-    group.add_argument('--valid-num-workers', type=int, default=2,
-                       help="Dataloader number of workers for validation.")
-    group.add_argument('--tokenizer-type', type=str,
-                       default=None,
-                       choices=['BertWordPieceLowerCase',
-                                'BertWordPieceCase',
-                                'GPT2BPETokenizer',
-                                'PretrainedFromHF'],
-                       help='What type of tokenizer to use.')
-    group.add_argument("--tokenizer-name-or-path", type=str, default=None,
-                       help="Name or path of the huggingface tokenizer.")
-    group.add_argument('--data-impl', type=str, default='infer',
-                       choices=['lazy', 'cached', 'mmap', 'infer'],
-                       help='Implementation of indexed datasets.')
-    group.add_argument('--reset-position-ids', action='store_true',
-                       help='Reset position ids after end-of-document token.')
-    group.add_argument('--reset-attention-mask', action='store_true',
-                       help='Reset self attention mask after '
-                       'end-of-document token. Attention between tokens from different documents is null.')
-    group.add_argument('--eod-mask-loss', action='store_true',
-                       help='Mask loss for the end of document tokens.')
-    group.add_argument('--loss-on-targets-only', action='store_true',
-                       help='Mask loss on input sequence.')
-    group.add_argument('--reweight-loss-based-on-position-frequency', action="store_true",
-                       help='Some objectives require us to sample loss_mask. This might introduce bias towards '
-                       'specific positions. This option tries to un-bias the loss by reweighting loss on specific '
-                       'positions based on how frequently we train on that position. '
-                       'This is mostly used for prefix_lm training')
-    group.add_argument("--noise-density", type=float, default=None, help="Span corruption noise density")
-    group.add_argument("--mean-noise-span-length", type=int, default=None, help="Span corruption mean noise span length")
-
-
-    return parser
-
-
-def _add_autoresume_args(parser):
-    """Register the ADLR autoresume options on ``parser`` and return it."""
-    group = parser.add_argument_group(title='autoresume')
-
-    group.add_argument('--adlr-autoresume', action='store_true',
-                       help='Enable autoresume on adlr cluster.')
-    group.add_argument('--adlr-autoresume-interval', type=int, default=1000,
-                       help='Intervals over which check for autoresume '
-                       'termination signal')
-
-    return parser
-
-
-def _add_biencoder_args(parser):
-    """Register biencoder / ICT / retriever options on ``parser`` and return it."""
-    group = parser.add_argument_group(title='biencoder')
-
-    # network size
-    group.add_argument('--ict-head-size', type=int, default=None,
-                       help='Size of block embeddings to be used in ICT and '
-                        'REALM (paper default: 128)')
-    group.add_argument('--biencoder-projection-dim', type=int, default=0,
-                       help='Size of projection head used in biencoder (paper'
-                        ' default: 128)')
-    group.add_argument('--biencoder-shared-query-context-model', action='store_true',
-                        help='Whether to share the parameters of the query '
-                        'and context models or not')
-
-    # checkpointing
-    group.add_argument('--ict-load', type=str, default=None,
-                       help='Directory containing an ICTBertModel checkpoint')
-    group.add_argument('--bert-load', type=str, default=None,
-                       help='Directory containing a BertModel checkpoint '
-                       '(needed to start ICT and REALM)')
-
-    # data
-    group.add_argument('--titles-data-path', type=str, default=None,
-                       help='Path to titles dataset used for ICT')
-    group.add_argument('--query-in-block-prob', type=float, default=0.1,
-                       help='Probability of keeping query in block for '
-                       'ICT dataset')
-    group.add_argument('--use-one-sent-docs', action='store_true',
-                       help='Whether to use one sentence documents in ICT')
-    group.add_argument('--evidence-data-path', type=str, default=None,
-                       help='Path to Wikipedia Evidence from DPR paper')
-
-    # training
-    group.add_argument('--retriever-report-topk-accuracies', nargs='+', type=int,
-                        default=[], help="Which top-k accuracies to report "
-                        "(e.g. '1 5 20')")
-    group.add_argument('--retriever-score-scaling', action='store_true',
-                       help='Whether to scale retriever scores by inverse '
-                        'square root of hidden size')
-
-    # faiss index
-    group.add_argument('--block-data-path', type=str, default=None,
-                       help='Where to save/load BlockData to/from')
-    group.add_argument('--embedding-path', type=str, default=None,
-                       help='Where to save/load Open-Retrieval Embedding'
-                        ' data to/from')
-
-    # indexer
-    group.add_argument('--indexer-batch-size', type=int, default=128,
-                       help='How large of batches to use when doing indexing '
-                       'jobs')
-    group.add_argument('--indexer-log-interval', type=int, default=1000,
-                       help='After how many batches should the indexer '
-                       'report progress')
-    return parser
-
-
-def _add_vit_args(parser):
-    """Register vision-classification (ViT) options on ``parser`` and return it."""
-    group = parser.add_argument_group(title="vit")
-
-    group.add_argument('--num-classes', type=int, default=1000,
-                       help='num of classes in vision classification task')
-    group.add_argument('--img-dim', type=int, default=224,
-                       help='Image size for vision classification task')
-    group.add_argument('--num-channels', type=int, default=3,
-                       help='Number of channels in input image data')
-    group.add_argument('--patch-dim', type=int, default=16,
-                       help='patch dimension used in vit')
-
-    return parser
-
-
-def _add_zero_args(parser):
-    """ZeRO configuration arguments.
-
-    Registers the ZeRO stage, bucket-size and remote-device options on
-    ``parser`` and returns it.
-    """
-
-    group = parser.add_argument_group('ZeRO configurations', 'configurations')
-    # Defaults are ints so that the default and a parsed value (type=int)
-    # have the same type.
-    group.add_argument("--zero-stage", type=int, default=1)
-    group.add_argument('--zero-reduce-scatter', action='store_true',
-                       help='Use reduce scatter if specified')
-    # The misspelt flag name is kept as is: renaming it would break callers.
-    group.add_argument('--zero-contigious-gradients', action='store_true',
-                       help='Use contiguous memory optimization if specified')
-    group.add_argument("--zero-reduce-bucket-size", type=int, default=0)
-    group.add_argument("--zero-allgather-bucket-size", type=int, default=0)
-    group.add_argument('--remote-device', type=str, default='none', choices=['none', 'cpu', 'nvme'],
-                      help='Remote device for ZeRO-3 initialized parameters.')
-    group.add_argument('--use-pin-memory', action='store_true',
-                     help='Use pinned CPU memory for ZeRO-3 initialized model parameters.')
-    return parser
-
-def _add_memoryopt_args(parser):
-    """Register the memory-saving switches on ``parser`` and return it."""
-
-    mem_opts = parser.add_argument_group('Memory optimizations', 'configurations')
-
-    # The tile-factor help is long; build it once so the call below stays short.
-    tile_factor_help = (
-        'Make all linear layers the same size of [hidden/tile_factor, hidden/tile_factor]. '
-        'Must be enabled with --memory-centric-tiled-linear. '
-        'Example A: if tile_factor=1, the qkv layer [hidden, 3* hidden] would be converted into [1,3] tiles of size [hidden,hidden]. '
-        'Example B: if tile_factor=2, the intermediate layer [4*hidden, hidden] will be converted into [8, 2] tiles of size [hidden/2, hidden/2]. '
-        'Default is 1.'
-    )
-
-    mem_opts.add_argument(
-        "--scattered-embeddings", action='store_true',
-        help=('Save memory by scattering embedding activations. '
-              'Introduces dropout differences across MP configurations.'))
-    mem_opts.add_argument(
-        "--split-transformers", action='store_true',
-        help=('Save memory by splitting transformer layers into two parts, '
-              'allowing for more frequent activation checkpoint savings.'))
-    mem_opts.add_argument(
-        "--memory-centric-tiled-linear", action="store_true",
-        help='Save memory by tiling with deepspeed.zero.TiledLinear.')
-    mem_opts.add_argument(
-        "--tile-factor", type=int, default=1, help=tile_factor_help)
-
-    return parser
-
-def _add_activation_checkpoint_args(parser):
-    """Register activation-checkpointing switches on ``parser`` and return it."""
-    group = parser.add_argument_group('Activation Checkpointing',
-                                      'Checkpointing Configurations')
-    group.add_argument('--deepspeed-activation-checkpointing', action='store_true',
-                       help='uses activation checkpointing from deepspeed')
-    group.add_argument('--partition-activations', action='store_true',
-                       help='partition Activations across GPUs before checkpointing.')
-    # The misspelt flag name is kept as is: renaming it would break callers.
-    group.add_argument('--contigious-checkpointing', action='store_true',
-                       help='Contiguous memory checkpointing for activations.')
-    group.add_argument('--checkpoint-in-cpu', action='store_true',
-                       help='Move the activation checkpoints to CPU.')
-    group.add_argument('--synchronize-each-layer', action='store_true',
-                       help='does a synchronize at the beginning and end of each checkpointed layer.')
-    group.add_argument('--profile-backward', action='store_true',
-                       help='Enables backward pass profiling for checkpointed layers.')
-    return parser
--- a/megatron/create-data.sh
+++ b/megatron/create-data.sh
+#!/bin/sh
+# Tokenize oscar-1GB.jsonl with the GPT-2 BPE vocabulary and write the result
+# under ./data with the my-gpt2 prefix. Run from the directory that holds the
+# downloaded jsonl, vocab and merges files.
+set -eu
+
+# Create the output directory up front in case the preprocessing tool does
+# not create it itself.
+mkdir -p ./data
+
+python tools/preprocess_data.py \
+    --input oscar-1GB.jsonl \
+    --output-prefix ./data/my-gpt2 \
+    --vocab gpt2-vocab.json \
+    --dataset-impl mmap \
+    --tokenizer-type GPT2BPETokenizer \
+    --merge-file gpt2-merges.txt \
+    --append-eod \
+    --workers 8
\ No newline at end of file
--- a/megatron/run-one-node.sh
+++ b/megatron/run-one-node.sh
+#!/bin/bash
+# Single-node GPT-2 pretraining launcher. Start one copy per rank under
+# Open MPI (e.g. `mpirun -np 4 run-one-node.sh`). Each copy reads its rank
+# from the OMPI_COMM_WORLD_* variables, writes the DeepSpeed config file and
+# runs pretrain_gpt.py under numactl, bound to the NUMA node and mlx5 device
+# that share its local rank number.
+
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export MIOPEN_FIND_MODE=3
+
+# Fail fast when started outside mpirun instead of silently doing nothing.
+lrank=${OMPI_COMM_WORLD_LOCAL_RANK:?must be launched through mpirun}
+RANK=${OMPI_COMM_WORLD_RANK:?must be launched through mpirun}
+WORLD_SIZE=${OMPI_COMM_WORLD_SIZE:?must be launched through mpirun}
+
+MODEL_NAME=gpt2-4tp
+DATA_OUTPUT_PATH=./
+LOGS_PATH=$DATA_OUTPUT_PATH/logs
+CHECKPOINT_PATH=checkpoint/$MODEL_NAME
+DATA_PATH=./data/my-gpt2_text_document
+
+TENSORBOARD_PATH=output_dir/tensorboard/$MODEL_NAME
+CODECARBON_PATH=output_dir/codecarbon/$MODEL_NAME
+
+N_GPUS=4
+
+TP_SIZE=4   # always fixed to the size of a single node
+PP_SIZE=1 #128 #96    # NLAYERS must be a multiple of PP_SIZE here
+
+
+MICRO_BATCH_SIZE=2
+GLOBAL_BATCH_SIZE=32 #256 #1536
+NLAYERS=24
+NHIDDEN=1024 #12480
+NHEADS=16
+SEQ_LEN=1024
+SAVE_INTERVAL=1000
+
+# Argument lists are bash arrays so each option reaches python as exactly one
+# word, without relying on unquoted word splitting.
+
+#rampup-batch-size 16 16 5859375
+OPTIMIZER_ARGS=(
+    --optimizer adam
+    --adam-beta1 0.9
+    --adam-beta2 0.95
+    --adam-eps 1e-8
+    --lr 6.0e-5
+    --min-lr 6.0e-6
+    --lr-decay-style cosine
+    --clip-grad 1.0
+    --weight-decay 1e-1
+)
+
+# --clip-grad is supplied once, through OPTIMIZER_ARGS.
+GPT_ARGS=(
+    --num-layers "$NLAYERS"
+    --hidden-size "$NHIDDEN"
+    --num-attention-heads "$NHEADS"
+    --seq-length "$SEQ_LEN"
+    --max-position-embeddings "$SEQ_LEN"
+    --micro-batch-size "$MICRO_BATCH_SIZE"
+    --global-batch-size "$GLOBAL_BATCH_SIZE"
+    --train-iters 50
+    --loss-scale 12
+    --vocab-file gpt2-vocab.json
+    --merge-file gpt2-merges.txt
+    --fp16
+    --checkpoint-activations
+    --seed 42
+    "${OPTIMIZER_ARGS[@]}"
+)
+
+OUTPUT_ARGS=(
+    --log-interval 1
+    --save-interval "$SAVE_INTERVAL"
+    --eval-interval 10
+    --eval-iters 40
+    --tensorboard-dir "$TENSORBOARD_PATH"
+    --tensorboard-queue-size 5
+    --log-timers-to-tensorboard
+    --log-batch-size-to-tensorboard
+    --log-validation-ppl-to-tensorboard
+)
+
+DATA_ARGS=(
+    --save "$CHECKPOINT_PATH"
+    --load "$CHECKPOINT_PATH"
+    --data-path "$DATA_PATH"
+)
+ZERO_STAGE=1
+config_json="./${MODEL_NAME}_ds_config.json"
+
+# Every rank rewrites the same DeepSpeed config with identical content.
+cat <<EOT > "$config_json"
+{
+  "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
+  "train_batch_size": $GLOBAL_BATCH_SIZE,
+  "gradient_clipping": 1.0,
+  "zero_optimization": {
+    "stage": $ZERO_STAGE
+  },
+  "fp16": {
+    "enabled": true,
+    "loss_scale": 0,
+    "loss_scale_window": 500,
+    "hysteresis": 2,
+    "min_loss_scale": 1,
+    "initial_scale_power": 12
+  },
+  "steps_per_print": 2000,
+  "wall_clock_breakdown": false
+}
+EOT
+
+DEEPSPEED_ARGS=(
+    --deepspeed
+    --deepspeed_config "${config_json}"
+    --zero-stage "${ZERO_STAGE}"
+    --deepspeed-activation-checkpointing
+)
+
+# NOTE(review): the dist_url below has a double colon before the port
+# ("127.0.0.1::34566"); kept verbatim - confirm whether a single ':' is meant.
+APP=(
+    python pretrain_gpt.py
+    --tensor-model-parallel-size "$TP_SIZE"
+    --pipeline-model-parallel-size "$PP_SIZE"
+    "${GPT_ARGS[@]}"
+    "${DATA_ARGS[@]}"
+    "${OUTPUT_ARGS[@]}"
+    --data-impl mmap
+    --split 949,50,1
+    --distributed-backend nccl
+    "${DEEPSPEED_ARGS[@]}"
+    --rank "${RANK}"
+    --world_size "${WORLD_SIZE}"
+    --dist_url env://127.0.0.1::34566
+)
+
+# Local ranks 0-3 differ only in the index used for the mlx5 device and the
+# NUMA node, so a single arm parameterised by ${lrank} covers all four.
+case "${lrank}" in
+[0-3])
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export UCX_NET_DEVICES="mlx5_${lrank}:1"
+  export UCX_IB_PCI_BW="mlx5_${lrank}:50Gbs"
+  NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind="${lrank}" --membind="${lrank}" "${APP[@]}"
+  ;;
+*)
+  # Without this arm an unexpected rank would exit 0 without training.
+  echo "run-one-node.sh: unsupported local rank '${lrank}' (expected 0-3)" >&2
+  exit 1
+  ;;
+esac
+
--- a/run-16B.sh
+++ b/run-16B.sh
+#!/bin/bash
+#SBATCH -p tydexclu01
+#SBATCH -N 16
+#SBATCH --cpus-per-task=1
+#SBATCH --ntasks-per-node=32
+#SBATCH --mem 0
+#SBATCH --gres=dcu:4
+#SBATCH -J gpt2
+#SBATCH -o logs/gpt2-16B-%j.out
+#SBATCH -e logs/gpt2-16B-%j.out
+ulimit -u 200000
+export NCCL_IB_HCA=mlx5
+export NCCL_SOCKET_IFNAME=ib0
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export OMP_NUM_THREADS=1
+echo "START TIME: $(date)"
+
+rm -f ./hostfile/*
+
+
+hostfile=./hostfile/$SLURM_JOB_ID
+scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
+for i in `cat $hostfile`
+do
+    echo ${i} slots=4 >> `pwd`/hostfile/hostfile-dl-$SLURM_JOB_ID
+done
+np=$(cat $hostfile|sort|uniq |wc -l)
+np=$(($np*4))
+nodename=$(cat $hostfile |sed -n "1p")
+dist_url=`echo $nodename | awk '{print $1}'`
+mpirun -np $np --allow-run-as-root --hostfile hostfile/hostfile-dl-$SLURM_JOB_ID --bind-to none `pwd`/single-16B.sh $dist_url
--- a/run-inf-gpus.sh
+++ b/run-inf-gpus.sh
+#!/bin/bash
+# Text-generation launcher: runs tools/generate_samples_gpt.py with
+# tensor-model-parallel size 4. Rank, world size and local rank are read
+# from the OMPI_COMM_WORLD_* environment variables, so the script is meant
+# to be started once per process by mpirun.
+
+RANK=$OMPI_COMM_WORLD_RANK
+WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
+lrank=$OMPI_COMM_WORLD_LOCAL_RANK
+
+# Tokenizer files and the checkpoint directory to load.
+MERGE_FILE=gpt2-merges.txt
+VOCAB_FILE=gpt2-vocab.json
+CHECKPOINT_PATH=./conver-4tp-model
+
+# Full command line, kept as one string and word-split at launch time.
+APP="python tools/generate_samples_gpt.py \
+       --tensor-model-parallel-size 4 \
+       --num-layers 40 \
+       --hidden-size 5760 \
+       --load $CHECKPOINT_PATH \
+       --num-attention-heads 24 \
+       --max-position-embeddings 2048 \
+       --tokenizer-type GPT2BPETokenizer \
+       --fp16 \
+       --micro-batch-size 2 \
+       --seq-length 2048 \
+       --out-seq-length 128 \
+       --temperature 1.0 \
+       --vocab-file $VOCAB_FILE \
+       --merge-file $MERGE_FILE \
+       --genfile gpt2-genfile.json \
+       --num-samples 4 \
+       --top_p 0.9 \
+       --recompute \
+       --rank ${RANK} \
+       --world_size ${WORLD_SIZE}"
+
+# Local ranks 0..3 each use the mlx5 device and NUMA node whose index equals
+# the local rank; any other local rank launches nothing.
+case ${lrank} in
+0 | 1 | 2 | 3)
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export UCX_NET_DEVICES=mlx5_${lrank}:1
+  export UCX_IB_PCI_BW=mlx5_${lrank}:50Gbs
+  # ${APP} is deliberately unquoted so it splits into separate arguments.
+  NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=${lrank} --membind=${lrank} ${APP}
+  ;;
+esac
--- a/run-inf.sh
+++ b/run-inf.sh
+#!/bin/bash
+# Text-generation launcher: runs tools/generate_samples_gpt.py with
+# tensor-model-parallel size 1. Rank, world size and local rank are read
+# from the OMPI_COMM_WORLD_* environment variables, so the script is meant
+# to be started by mpirun.
+
+RANK=$OMPI_COMM_WORLD_RANK
+WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
+lrank=$OMPI_COMM_WORLD_LOCAL_RANK
+
+# Tokenizer files and the checkpoint directory to load.
+MERGE_FILE=gpt2-merges.txt
+VOCAB_FILE=gpt2-vocab.json
+CHECKPOINT_PATH=./checkopints/megatron-1tp
+
+# Full command line, kept as one string and word-split at launch time.
+APP="python tools/generate_samples_gpt.py \
+       --tensor-model-parallel-size 1 \
+       --num-layers 24 \
+       --hidden-size 1024 \
+       --load $CHECKPOINT_PATH \
+       --num-attention-heads 16 \
+       --max-position-embeddings 1024 \
+       --tokenizer-type GPT2BPETokenizer \
+       --fp16 \
+       --micro-batch-size 2 \
+       --seq-length 1024 \
+       --out-seq-length 128 \
+       --temperature 1.0 \
+       --vocab-file $VOCAB_FILE \
+       --merge-file $MERGE_FILE \
+       --genfile gpt2-genfile.json \
+       --num-samples 4 \
+       --top_p 0.9 \
+       --recompute \
+       --rank ${RANK} \
+       --world_size ${WORLD_SIZE}"
+
+# Local ranks 0..3 each use the mlx5 device and NUMA node whose index equals
+# the local rank; any other local rank launches nothing.
+case ${lrank} in
+0 | 1 | 2 | 3)
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export UCX_NET_DEVICES=mlx5_${lrank}:1
+  export UCX_IB_PCI_BW=mlx5_${lrank}:50Gbs
+  # ${APP} is deliberately unquoted so it splits into separate arguments.
+  NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=${lrank} --membind=${lrank} ${APP}
+  ;;
+esac
--- a/run-one-node.sh
+++ b/run-one-node.sh
+#!/bin/bash
+
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export MIOPEN_FIND_MODE=3
+
+lrank=$OMPI_COMM_WORLD_LOCAL_RANK
+RANK=$OMPI_COMM_WORLD_RANK
+WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
+
+MODEL_NAME=gpt2-4tp
+DATA_OUTPUT_PATH=./
+LOGS_PATH=$DATA_OUTPUT_PATH/logs
+CHECKPOINT_PATH=checkpoint/$MODEL_NAME
+DATA_PATH=my-gpt2_text_document
+
+TENSORBOARD_PATH=output_dir/tensorboard/$MODEL_NAME
+CODECARBON_PATH=output_dir/codecarbon/$MODEL_NAME
+
+TP_SIZE=4   # always fixed to the size of a single node
+PP_SIZE=1   # NLAYERS must be a multiple of PP_SIZE here
+
+
+MICRO_BATCH_SIZE=2
+GLOBAL_BATCH_SIZE=32
+NLAYERS=24
+NHIDDEN=1024
+NHEADS=16
+SEQ_LEN=1024
+SAVE_INTERVAL=1000
+
+OPTIMIZER_ARGS=" \
+    --optimizer adam \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --adam-eps 1e-8 \
+    --lr 6.0e-5 \
+    --min-lr 6.0e-6 \
+    --lr-decay-style cosine \
+    --clip-grad 1.0 \
+    --weight-decay 1e-1 \
+    "
+
+
+GPT_ARGS=" \
+    --num-layers $NLAYERS \
+    --hidden-size $NHIDDEN \
+    --num-attention-heads $NHEADS \
+    --seq-length $SEQ_LEN \
+    --max-position-embeddings $SEQ_LEN \
+    --micro-batch-size $MICRO_BATCH_SIZE \
+    --global-batch-size $GLOBAL_BATCH_SIZE \
+    --train-iters  1000 \
+    --loss-scale 12 \
+    --vocab-file gpt2-vocab.json \
+    --merge-file gpt2-merges.txt \
+    --clip-grad 1.0 \
+    --fp16 \
+    --checkpoint-activations \
+    --seed 42
+    $OPTIMIZER_ARGS \
+    "
+
+OUTPUT_ARGS=" \
+    --log-interval 1 \
+    --save-interval $SAVE_INTERVAL \
+    --eval-interval 10 \
+    --eval-iters 40 \
+    --tensorboard-dir $TENSORBOARD_PATH \
+    --tensorboard-queue-size 5 \
+    --log-timers-to-tensorboard \
+    --log-batch-size-to-tensorboard \
+    --log-validation-ppl-to-tensorboard \
+    "
+
+DATA_ARGS=" \
+    --save $CHECKPOINT_PATH \
+    --load $CHECKPOINT_PATH \
+    --data-path $DATA_PATH \
+    "
+ZERO_STAGE=1
+config_json="./${MODEL_NAME}_ds_config.json"
+
+cat <<EOT > $config_json
+{
+  "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
+  "train_batch_size": $GLOBAL_BATCH_SIZE,
+  "gradient_clipping": 1.0,
+  "zero_optimization": {
+    "stage": $ZERO_STAGE
+  },
+  "fp16": {
+    "enabled": true,
+    "loss_scale": 0,
+    "loss_scale_window": 500,
+    "hysteresis": 2,
+    "min_loss_scale": 1,
+    "initial_scale_power": 12
+  },
+  "steps_per_print": 2000,
+  "wall_clock_breakdown": false
+}
+EOT
+
+DEEPSPEED_ARGS=" \
+    --deepspeed \
+    --deepspeed_config ${config_json} \
+    --zero-stage ${ZERO_STAGE} \
+    --deepspeed-activation-checkpointing \
+    "
+APP="python pretrain_gpt.py \
+    --tensor-model-parallel-size $TP_SIZE \
+    --pipeline-model-parallel-size $PP_SIZE \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --data-impl mmap \
+    --split 949,50,1 \
+    --distributed-backend nccl \
+    $DEEPSPEED_ARGS \
+    --rank ${RANK} \
+    --world_size ${WORLD_SIZE} \
+    --dist_url env://127.0.0.1::34566
+    "
+    
+case ${lrank} in
+[0])
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export UCX_NET_DEVICES=mlx5_0:1
+  export UCX_IB_PCI_BW=mlx5_0:50Gbs
+  NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
+  ;;
+[1])
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export UCX_NET_DEVICES=mlx5_1:1
+  export UCX_IB_PCI_BW=mlx5_1:50Gbs
+  NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=1 --membind=1 ${APP}
+  ;;
+[2])
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export UCX_NET_DEVICES=mlx5_2:1
+  export UCX_IB_PCI_BW=mlx5_2:50Gbs
+  NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=2 --membind=2 ${APP}
+  ;;
+[3])
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export UCX_NET_DEVICES=mlx5_3:1
+  export UCX_IB_PCI_BW=mlx5_3:50Gbs
+  NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+esac
+
--- a/single-16B.sh
+++ b/single-16B.sh
+#!/bin/bash
+export NCCL_SOCKET_IFNAME=ib0
+export NCCL_IB_HCA=mlx5
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export MIOPEN_FIND_MODE=3
+
+
+lrank=$OMPI_COMM_WORLD_LOCAL_RANK
+RANK=$OMPI_COMM_WORLD_RANK
+WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
+
+
+MODEL_NAME=gpt2-oscar_16B-4tp
+DATA_OUTPUT_PATH=./
+LOGS_PATH=$DATA_OUTPUT_PATH/logs
+CHECKPOINT_PATH=checkopints/$MODEL_NAME
+DATA_PATH=my-gpt2_text_document
+
+TENSORBOARD_PATH=output_dir/tensorboard/$MODEL_NAME
+CODECARBON_PATH=output_dir/codecarbon/$MODEL_NAME
+
+TP_SIZE=4   # always fixed to the size of a single node
+PP_SIZE=8   # NLAYERS must be a multiple of PP_SIZE here
+
+
+MICRO_BATCH_SIZE=1
+GLOBAL_BATCH_SIZE=128 
+NLAYERS=40
+NHIDDEN=5760
+NHEADS=24
+SEQ_LEN=2048
+SAVE_INTERVAL=1000
+
+
+OPTIMIZER_ARGS=" \
+    --optimizer adam \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --adam-eps 1e-8 \
+    --lr 6.0e-5 \
+    --min-lr 6.0e-6 \
+    --lr-decay-style cosine \
+    --clip-grad 1.0 \
+    --weight-decay 1e-1 \
+    "
+
+
+GPT_ARGS=" \
+    --num-layers $NLAYERS \
+    --hidden-size $NHIDDEN \
+    --num-attention-heads $NHEADS \
+    --seq-length $SEQ_LEN \
+    --max-position-embeddings $SEQ_LEN \
+    --micro-batch-size $MICRO_BATCH_SIZE \
+    --global-batch-size $GLOBAL_BATCH_SIZE \
+    --train_iters 7000 \
+    --loss-scale 12 \
+    --vocab-file gpt2-vocab.json \
+    --merge-file gpt2-merges.txt \
+    --clip-grad 1.0 \
+    --checkpoint-activations \
+    --seed 42
+    $OPTIMIZER_ARGS \
+    "
+
+OUTPUT_ARGS=" \
+    --log-interval 1 \
+    --save-interval $SAVE_INTERVAL \
+    --eval-interval 1000 \
+    --eval-iters 40 \
+    --tensorboard-dir $TENSORBOARD_PATH \
+    --tensorboard-queue-size 5 \
+    --log-timers-to-tensorboard \
+    --log-batch-size-to-tensorboard \
+    --log-validation-ppl-to-tensorboard \
+    "
+
+DATA_ARGS=" \
+    --save $CHECKPOINT_PATH \
+    --load $CHECKPOINT_PATH \
+    --data-path $DATA_PATH \
+    "
+ZERO_STAGE=1
+config_json="./${MODEL_NAME}_ds_config.json"
+
+cat <<EOT > $config_json
+{
+  "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
+  "train_batch_size": $GLOBAL_BATCH_SIZE,
+  "gradient_clipping": 1.0,
+  "zero_optimization": {
+    "stage": $ZERO_STAGE
+  },
+  "fp16": {
+    "enabled": false,
+    "loss_scale": 0,
+    "loss_scale_window": 500,
+    "hysteresis": 2,
+    "min_loss_scale": 1,
+    "initial_scale_power": 12
+  },
+  "steps_per_print": 2000,
+  "wall_clock_breakdown": false
+}
+EOT
+
+DEEPSPEED_ARGS=" \
+    --deepspeed \
+    --deepspeed_config ${config_json} \
+    --zero-stage ${ZERO_STAGE} \
+    --deepspeed-activation-checkpointing \
+    "
+export CMD=" \
+    --tensor-model-parallel-size $TP_SIZE \
+    --pipeline-model-parallel-size $PP_SIZE \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --data-impl mmap \
+    --split 949,50,1 \
+    --distributed-backend nccl \
+     $DEEPSPEED_ARGS \
+    "
+
+APP="python3 -u `pwd`/pretrain_gpt.py \
+    --rank ${RANK} \
+    --world_size ${WORLD_SIZE} \
+    --dist_url tcp://${1}:34566 \
+    --num-workers 2 \
+    ${CMD} \
+    "
+
+
+case ${lrank} in
+[0])
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export UCX_NET_DEVICES=mlx5_0:1
+  export UCX_IB_PCI_BW=mlx5_0:50Gbs
+  NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
+  ;;
+[1])
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export UCX_NET_DEVICES=mlx5_1:1
+  export UCX_IB_PCI_BW=mlx5_1:50Gbs
+  NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=1 --membind=1 ${APP}
+  ;;
+[2])
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export UCX_NET_DEVICES=mlx5_2:1
+  export UCX_IB_PCI_BW=mlx5_2:50Gbs
+  NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=2 --membind=2 ${APP}
+  ;;
+[3])
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export UCX_NET_DEVICES=mlx5_3:1
+  export UCX_IB_PCI_BW=mlx5_3:50Gbs
+  NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+esac