Initial commit

12415ddb · dongchy920 · 12415ddb · 12415ddb · 12415ddb · 12415ddb
Commit 12415ddb authored Jul 04, 2025 by dongchy920
20 changed files
--- a/assets/wechat_1.jpg
+++ b/assets/wechat_1.jpg
--- a/assets/wechat_2.jpg
+++ b/assets/wechat_2.jpg
--- a/docker/dockerfile
+++ b/docker/dockerfile
+FROM image.sourcefind.cn:5000/dcu/admin/base/custom:vllm0.8.5-ubuntu22.04-dtk25.04-rc7-das1.5-py3.10-20250521-fixpy-rocblas0521-beta2
+RUN source /opt/dtk/env.sh
\ No newline at end of file
--- a/finetune.py
+++ b/finetune.py
+# Code adapted from https://github.com/bigcode-project/starcoder2/blob/main/finetune.py
+import argparse
+import multiprocessing
+import os
+import torch
+import transformers
+from accelerate import PartialState
+from datasets import load_dataset
+from peft import LoraConfig
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    logging,
+    set_seed,
+)
+import numpy as np
+import random
+import warnings
+import sys, pdb
+from trl import SFTTrainer
+from trl.trainer import ConstantLengthDataset
+
+os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_id", type=str, default="aiXcoder/aixcoder-7b-base")
+    parser.add_argument("--dataset_name", type=str, default="bigcode/the-stack-smol")
+    parser.add_argument("--subset", type=str, default="data/rust")
+    parser.add_argument("--split", type=str, default="train")
+    parser.add_argument("--fim_rate", type=float, default=0.5)
+    parser.add_argument("--dataset_text_field", type=str, default="content")
+
+    parser.add_argument("--max_seq_length", type=int, default=1024)
+    parser.add_argument("--max_steps", type=int, default=100)
+    parser.add_argument("--micro_batch_size", type=int, default=1)
+    parser.add_argument("--gradient_accumulation_steps", type=int, default=8)
+    parser.add_argument("--weight_decay", type=float, default=0.01)
+    parser.add_argument("--bf16", type=bool, default=True)
+
+    parser.add_argument("--attention_dropout", type=float, default=0.1)
+    parser.add_argument("--learning_rate", type=float, default=5e-6)
+    parser.add_argument("--lr_scheduler_type", type=str, default="cosine")
+    parser.add_argument("--warmup_steps", type=int, default=20)
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--output_dir", type=str, default="finetune_aix_7b")
+    parser.add_argument("--num_proc", type=int, default=None)
+    parser.add_argument("--push_to_hub", type=bool, default=False)
+    return parser.parse_args()
+
+
+def print_rank_0(message):
+    if torch.distributed.is_initialized():
+        if torch.distributed.get_rank() == 0:
+            print(message, flush=True, file=sys.stderr)
+    else:
+        print(message, flush=True)
+
+def print_trainable_parameters(model):
+    """
+    Prints the number of trainable parameters in the model.
+    """
+    trainable_params = 0
+    all_param = 0
+    for _, param in model.named_parameters():
+        all_param += param.numel()
+        if param.requires_grad:
+            trainable_params += param.numel()
+    print_rank_0(
+        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
+    )
+
+
+class RandomFIMDataset(ConstantLengthDataset):
+    """
+        This class supports the random fill-in-the-middle (FIM) task. If `fim_rate` is greater than 0, 
+        it constructs data in the fill-in-the-middle format with a probability of `fim_rate`. 
+        The aiXcoder-7b-base model uses structured FIM during pre-training, 
+        where a complete code block is constructed as the MIDDLE. 
+        However, creating such training data involves syntactic parsing, 
+        and we currently do not plan to open source the processing code.
+    
+    """
+    def __init__(self, tokenizer, dataset, dataset_text_field=None, fim_rate=0, formatting_func=None, infinite=False, seq_length=1024, num_of_sequences=1024, chars_per_token=3.6, eos_token_id=0, shuffle=True, append_concat_token=True, add_special_tokens=True):
+        self.fim_rate = fim_rate
+        self.fim_spm_rate = 0.5
+        self.np_rand = np.random.RandomState(seed=3574)
+        if self.fim_rate > 0:
+            print_rank_0(f"constructing data wit FIM: fim_rate: {self.fim_rate}")
+        super().__init__(tokenizer, dataset, dataset_text_field, formatting_func, infinite, seq_length, num_of_sequences, chars_per_token, eos_token_id, shuffle, append_concat_token, add_special_tokens)
+    
+    def __iter__(self):
+        iterator = iter(self.dataset)
+        more_examples = True
+        while more_examples:
+            buffer, buffer_len = [], 0
+            while True:
+                if buffer_len >= self.max_buffer_size:
+                    break
+                try:
+                    if self.fim_rate > 0:
+                        if self.np_rand.binomial(1, self.fim_rate): # sample bernoulli dist
+ 
+                            contents = self.formatting_func(next(iterator))
+                            
+                            try:
+                                boundaries = list(self.np_rand.randint(low=0, high=len(contents) + 1, size=2))
+                                boundaries.sort()
+                            except ValueError as e:
+                                print(len(contents), contents)
+                                print(e)
+                                raise e
+
+                            prefix = contents[:boundaries[0]]
+                            middle = contents[boundaries[0]:boundaries[1]]
+                            suffix = contents[boundaries[1]:]
+                            if self.np_rand.binomial(1, self.fim_spm_rate):
+                                contents = f"<s>▁<AIX-SPAN-PRE>▁<AIX-SPAN-POST>{suffix}▁<AIX-SPAN-MIDDLE>{prefix}{middle}</s>"
+                            else:
+                                contents = f"<s>▁<AIX-SPAN-PRE>{prefix}▁<AIX-SPAN-POST>{suffix}▁<AIX-SPAN-MIDDLE>{middle}</s>"
+                        else:
+                            contents = f"<s>{self.formatting_func(next(iterator))}</s>"
+                    else:
+                        contents = f"<s>{self.formatting_func(next(iterator))}</s>"
+                            
+                    buffer.append(contents)
+                    buffer_len += len(buffer[-1])
+                except StopIteration:
+                    if self.infinite:
+                        iterator = iter(self.dataset)
+                        warnings.warn("The dataset reached end and the iterator is reset to the start.")
+                    else:
+                        more_examples = False
+                        break
+            tokenized_inputs = self.tokenizer(buffer, add_special_tokens=self.add_special_tokens, truncation=False)[
+                "input_ids"
+            ]
+            all_token_ids = []
+            for tokenized_input in tokenized_inputs:
+                all_token_ids.extend(tokenized_input)
+            examples = []
+            for i in range(0, len(all_token_ids), self.seq_length):
+                input_ids = all_token_ids[i : i + self.seq_length]
+                if len(input_ids) == self.seq_length:
+                    examples.append(input_ids)
+            if self.shuffle:
+                random.shuffle(examples)
+            for example in examples:
+                self.current_size += 1
+                yield {
+                    "input_ids": torch.LongTensor(example),
+                    "labels": torch.LongTensor(example),
+                }
+
+
+def main(args):
+    # config
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.bfloat16,
+    )
+    lora_config = LoraConfig(
+        r=8,
+        target_modules=[
+            "q_proj",
+            "o_proj",
+            "k_proj",
+            "v_proj",
+            "gate_proj",
+            "up_proj",
+            "down_proj",
+        ],
+        task_type="CAUSAL_LM",
+    )
+
+    # load model and dataset
+    token = os.environ.get("HF_TOKEN", None)
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model_id,
+        quantization_config=bnb_config,
+        device_map={"": PartialState().process_index},
+        attention_dropout=args.attention_dropout,
+        attn_implementation='flash_attention_2'
+    )
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model_id)
+    print_trainable_parameters(model)
+
+    # pdb.set_trace()
+    data = load_dataset(
+        args.dataset_name,
+        data_dir=args.subset,
+        split=args.split,
+        token=token,
+        num_proc=args.num_proc if args.num_proc else multiprocessing.cpu_count(),
+    )
+
+    train_data = RandomFIMDataset(
+        tokenizer=tokenizer, dataset=data, fim_rate=args.fim_rate, dataset_text_field=args.dataset_text_field,
+        infinite=True, seq_length=args.max_seq_length, eos_token_id=tokenizer.eos_token_id
+    )
+
+    # setup the trainer
+    trainer = SFTTrainer(
+        model=model,
+        train_dataset=train_data,
+        max_seq_length=args.max_seq_length,
+        args=transformers.TrainingArguments(
+            per_device_train_batch_size=args.micro_batch_size,
+            gradient_accumulation_steps=args.gradient_accumulation_steps,
+            warmup_steps=args.warmup_steps,
+            max_steps=args.max_steps,
+            learning_rate=args.learning_rate,
+            lr_scheduler_type=args.lr_scheduler_type,
+            weight_decay=args.weight_decay,
+            bf16=args.bf16,
+            logging_strategy="steps",
+            logging_steps=10,
+            output_dir=args.output_dir,
+            optim="paged_adamw_8bit",
+            seed=args.seed,
+            run_name=f"train-{args.model_id.split('/')[-1]}",
+            report_to="none",
+        ),
+        peft_config=lora_config,
+        dataset_text_field=args.dataset_text_field,
+    )
+
+    # launch
+    print_rank_0("Training...")
+    trainer.train()
+
+    print_rank_0("Saving the last checkpoint of the model")
+    model.save_pretrained(os.path.join(args.output_dir, "final_checkpoint/"))
+    if args.push_to_hub:
+        trainer.push_to_hub("Upload model")
+    print_rank_0("Training Done! ")
+
+
+if __name__ == "__main__":
+    args = get_args()
+    set_seed(args.seed)
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    logging.set_verbosity_error()
+
+    main(args)
\ No newline at end of file
--- a/hf_mini/filter.py
+++ b/hf_mini/filter.py
--- a/hf_mini/utils.py
+++ b/hf_mini/utils.py
--- a/icon.png
+++ b/icon.png
--- a/megatron_mini/__init__.py
+++ b/megatron_mini/__init__.py
+
+from .global_vars import get_args
+from .initialize  import initialize_megatron
+
--- a/megatron_mini/arguments.py
+++ b/megatron_mini/arguments.py
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+
+"""Megatron arguments."""
+
+import argparse
+import os
+import sys
+import torch
+from megatron_mini.model.module import PositionEmbeddingType
+
+
+def parse_args(extra_args_provider=None, ignore_unknown_args=False):
+    """Parse all arguments."""
+    parser = argparse.ArgumentParser(description='Megatron-LM Arguments',
+                                     allow_abbrev=False)
+
+    # Standard arguments.
+    parser = _add_network_size_args(parser)
+    parser = _add_training_args(parser)
+    parser = _add_mixed_precision_args(parser)
+    parser = _add_distributed_args(parser)
+    parser = _add_inference_args(parser)
+
+    # Custom arguments.
+    if extra_args_provider is not None:
+        parser = extra_args_provider(parser)
+
+    # Parse.
+    if ignore_unknown_args:
+        args, _ = parser.parse_known_args()
+    else:
+        args = parser.parse_args()
+
+    # Args from environment
+    args.rank = int(os.getenv('RANK', '0'))
+    args.world_size = int(os.getenv("WORLD_SIZE", '1'))
+        
+    return args
+
+def validate_args(args, defaults={}, aix_config=None):
+    # Tensor model parallel size.
+    args.tensor_model_parallel_size = min(
+        args.tensor_model_parallel_size, args.world_size)
+    assert args.world_size % args.tensor_model_parallel_size == 0, 'world size'\
+        ' ({}) is not divisible by tensor model parallel size ({})'.format(
+            args.world_size, args.tensor_model_parallel_size)
+    # Pipeline model parallel size.
+    args.pipeline_model_parallel_size = min(
+        args.pipeline_model_parallel_size,
+        (args.world_size // args.tensor_model_parallel_size))
+    args.transformer_pipeline_model_parallel_size = (args.pipeline_model_parallel_size
+    )
+    # Checks.
+    model_parallel_size = args.pipeline_model_parallel_size * \
+                          args.tensor_model_parallel_size
+    assert args.world_size % model_parallel_size == 0, 'world size is not'\
+        ' divisible by tensor parallel size ({}) times pipeline parallel ' \
+        'size ({})'.format(args.world_size, args.tensor_model_parallel_size,
+                           args.pipeline_model_parallel_size)
+    args.data_parallel_size = args.world_size // model_parallel_size
+    if args.rank == 0:
+        print('using world size: {}, data-parallel-size: {}, '
+              'tensor-model-parallel size: {}, '
+              'pipeline-model-parallel size: {} '.format(
+                  args.world_size, args.data_parallel_size,
+                  args.tensor_model_parallel_size,
+                  args.pipeline_model_parallel_size), flush=True)
+    if args.pipeline_model_parallel_size > 1:
+        if args.pipeline_model_parallel_split_rank is not None:
+            assert args.pipeline_model_parallel_split_rank < \
+                    args.pipeline_model_parallel_size, 'split rank needs'\
+                    ' to be less than pipeline model parallel size ({})'.format(
+                            args.pipeline_model_parallel_size)
+
+    # Deprecated arguments
+    assert args.model_parallel_size is None, '--model-parallel-size is no ' \
+        'longer valid, use --tensor-model-parallel-size instead'
+    del args.model_parallel_size
+
+    # Set input defaults.
+    for key in defaults:
+        # For default to be valid, it should not be provided in the
+        # arguments that are passed to the program. We check this by
+        # ensuring the arg is set to None.
+        if getattr(args, key) is not None:
+            if args.rank == 0:
+                print('WARNING: overriding default arguments for {key}:{v} \
+                       with {key}:{v2}'.format(key=key, v=defaults[key],
+                                               v2=getattr(args, key)),
+                                               flush=True)
+        else:
+            setattr(args, key, defaults[key])
+
+
+    if aix_config is not None:
+        # Set args for load Aix model
+        for key in aix_config:
+            if getattr(args, key) is not None:
+                if args.rank == 0:
+                    print(
+                            "Overriding arguments for \"{key}:{v2}\" with AixConfig \"{key}:{v}\"".format(
+                                key=key, v=aix_config[key], v2=getattr(args, key)
+                            ),
+                            flush=True, file=sys.stderr
+                            
+                        )
+                setattr(args, key, aix_config[key])
+            else:
+                if args.rank == 0:
+                    print(
+                            "Setting arguments with AixConfig \"{key}:{v}\"".format(
+                                key=key, v=aix_config[key]
+                            ),
+                            flush=True, file=sys.stderr
+                        )
+                setattr(args, key, aix_config[key])
+
+    # Batch size.
+    assert args.micro_batch_size is not None
+    assert args.micro_batch_size > 0
+
+    # Parameters dtype.
+    args.params_dtype = torch.float
+    if args.fp16:
+        assert not args.bf16
+        args.params_dtype = torch.half
+    if args.bf16:
+        assert not args.fp16
+        args.params_dtype = torch.bfloat16
+
+    if args.rank == 0:
+        print('using {} for parameters ...'.format(args.params_dtype),
+              flush=True)
+
+    # Consumed tokens.
+    args.consumed_train_samples = 0
+    args.consumed_valid_samples = 0
+
+    # Support for variable sequence lengths across batches/microbatches.
+    # set it if the dataloader supports generation of variable sequence lengths
+    # across batches/microbatches. Due to additional communication overhead
+    # during pipeline parallelism, it should not be set if sequence length
+    # is constant during training.
+    args.variable_seq_lengths = False
+
+
+    if args.num_layers is not None:
+        assert args.encoder_num_layers is None, \
+            'cannot have both num-layers and encoder-num-layers specified'
+        args.encoder_num_layers = args.num_layers
+    else:
+        assert args.encoder_num_layers is not None, \
+            'either num-layers or encoder-num-layers should be specified'
+        args.num_layers = args.encoder_num_layers
+
+    # Check required arguments.
+    required_args = ['num_layers', 'hidden_size', 'num_attention_heads']
+    for req_arg in required_args:
+        _check_arg_is_not_none(args, req_arg)
+
+    # Checks.
+    if args.ffn_hidden_size is None:
+        args.ffn_hidden_size = 4 * args.hidden_size
+
+    if args.kv_channels is None:
+        assert args.hidden_size % args.num_attention_heads == 0
+        args.kv_channels = args.hidden_size // args.num_attention_heads
+    
+    if args.seq_length is not None:
+        assert args.max_position_embeddings >= args.seq_length
+
+    TORCH_MAJOR = int(torch.__version__.split('.')[0])
+    TORCH_MINOR = int(torch.__version__.split('.')[1])
+    # Persistent fused layer norm.
+    if TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 11):
+        args.no_persist_layer_norm = True
+        if args.rank == 0:
+            print('Persistent fused layer norm kernel is supported from '
+                  'pytorch v1.11 (nvidia pytorch container paired with v1.11). '
+                  'Defaulting to no_persist_layer_norm=True')
+
+    if args.fp16:
+        assert args.transformer_impl == 'local', \
+            'transformer-engine not yet approved for fp16 training and inference'
+
+    # disable sequence parallelism when tp=1
+    # to avoid change in numerics when
+    # sequence_parallelism is enabled.
+    if args.tensor_model_parallel_size == 1:
+        args.sequence_parallel = False
+    
+    args.async_tensor_model_parallel_allreduce = False
+
+    # disable async_tensor_model_parallel_allreduce when
+    # model parallel memory optimization is enabled
+    if args.sequence_parallel:
+        args.async_tensor_model_parallel_allreduce = False
+    
+    args.no_gradient_accumulation_fusion = True
+
+
+    if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1":
+        if args.sequence_parallel:
+            raise RuntimeError(
+                "Using sequence parallelism requires setting the environment variable "
+                "CUDA_DEVICE_MAX_CONNECTIONS to 1")
+        if args.async_tensor_model_parallel_allreduce:
+            raise RuntimeError(
+                "Using async gradient all reduce requires setting the environment "
+                "variable CUDA_DEVICE_MAX_CONNECTIONS to 1")
+
+
+    _print_args(args)
+    return args
+
+
+def _print_args(args):
+    """Print arguments."""
+    if args.rank == 0:
+        print('------------------------ arguments ------------------------',
+              flush=True)
+        str_list = []
+        for arg in vars(args):
+            dots = '.' * (48 - len(arg))
+            str_list.append('  {} {} {}'.format(arg, dots, getattr(args, arg)))
+        for arg in sorted(str_list, key=lambda x: x.lower()):
+            print(arg, flush=True)
+        print('-------------------- end of arguments ---------------------',
+              flush=True)
+
+
+def _check_arg_is_not_none(args, arg):
+    assert getattr(args, arg) is not None, '{} argument is None'.format(arg)
+
+
+def _add_inference_args(parser):
+    group = parser.add_argument_group(title='inference')
+
+    group.add_argument('--inference-batch-times-seqlen-threshold',
+                       type=int, default=512,
+                       help='During inference, if batch-size times '
+                       'sequence-length is smaller than this threshold '
+                       'then we will not use pipelining, otherwise we will.')
+    
+    group.add_argument('--max-tokens-to-oom',
+                       type=int, default=12000,
+                       help='Maximum number of tokens during inference'
+                       'tokens here is # in prompt + # to generate'
+                       'Allows us to throw an error before OOM crashes server')
+    return parser
+
+    
+def _add_network_size_args(parser):
+    group = parser.add_argument_group(title='network size')
+
+    group.add_argument('--num-layers', type=int, default=None,
+                       help='Number of transformer layers.')
+    group.add_argument('--encoder-num-layers', type=int, default=None,
+                       help='Number of encoder transformer layers.')
+    group.add_argument('--decoder-num-layers', type=int, default=None,
+                       help='Number of decoder transformer layers.')
+    group.add_argument('--hidden-size', type=int, default=None,
+                       help='Tansformer hidden size.')
+    group.add_argument('--no-query-key-layer-scaling', action='store_false',
+                       help='Do not scale Q * K^T by 1 / layer-number.',
+                       dest='apply_query_key_layer_scaling')
+    group.add_argument('--ffn-hidden-size', type=int, default=None,
+                       help='Transformer Feed-Forward Network hidden size. '
+                       'This is set to 4*hidden-size if not provided')
+    group.add_argument('--num-attention-heads', type=int, default=None,
+                       help='Number of transformer attention heads.')
+    group.add_argument('--kv-channels', type=int, default=None,
+                       help='Projection weights dimension in multi-head '
+                       'attention. This is set to '
+                       '   args.hidden_size // args.num_attention_heads '
+                       'if not provided.')
+    group.add_argument('--num-kv-heads', type=int, default=8,
+                       help='Number of transformer attention heads in MultiQuery\' s Key&Value.')
+    group.add_argument('--position-embedding-type', type=lambda x: PositionEmbeddingType[x],
+                       choices=list(PositionEmbeddingType),
+                       default=PositionEmbeddingType.absolute,
+                       help='Define position embedding type ("absolute" | "rotary" | "alibi"). "absolute" by default.'
+                       )
+    group.add_argument('--kv-lru-capacity', type=int, default=0,
+                       help='Maximum number of lru cache.')
+    group.add_argument('--rope-theta', type=int, default=10000,
+                       help='Maximum number of theat in cos and sin. '
+                       'This is the size of position embedding.')
+    group.add_argument('--rope-linear-scaling-factor', type=int, default=1,
+                       help='Maximum number of theat in cos and sin. '
+                       'This is the size of position embedding.')
+    group.add_argument('--inner-hidden-dim', type=int, default=None,
+                       help='Projection weights dimension in Gated linear unit. This is set to '
+                       'int(2 * (args.hidden_size * ffn_expand_rate) / 3), '
+                       'if not provided.')
+    group.add_argument('--seq-length', type=int, default=1024,
+                       help='Maximum sequence length to process.')
+    group.add_argument(
+        "--dist-timeout",
+        type=int,
+        default=60*24*4,
+        help="Timeout for Pytorch Distributed backend (in minutes).",
+    )
+    parser.add_argument(
+        '--attention-head-type', type=str, default='multiquery', choices=['multihead', 'multiquery', 'groupedquery'])
+    group.add_argument('--max-position-embeddings', type=int, default=1024,
+                       help='Maximum number of position embeddings to use. '
+                       'This is the size of position embedding.')
+    group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
+                       help='Pad the vocab size to be divisible by this value.'
+                       'This is added for computational efficieny reasons.')
+    group.add_argument('--layernorm-epsilon', type=float, default=1e-5,
+                       help='Layer norm epsilon.')
+    group.add_argument('--apply-residual-connection-post-layernorm',
+                       action='store_true',
+                       help='If set, use original BERT residula connection '
+                       'ordering.')
+    group.add_argument('--openai-gelu', action='store_true',
+                       help='Use OpenAIs GeLU implementation. This option'
+                       'should not be used unless for backward compatibility'
+                       'reasons.')
+    group.add_argument('--onnx-safe', type=bool, required=False,
+                       help='Use workarounds for known problems with '
+                       'Torch ONNX exporter')
+    group.add_argument('--bert-no-binary-head', action='store_false',
+                       help='Disable BERT binary head.',
+                       dest='bert_binary_head')
+    group.add_argument('--num-experts', type=int, default=None,
+                       help='Number of Experts in Switch Transformer (None means no Switch)')
+    group.add_argument('--is-extend-seq', action='store_true',
+                       help='Enable sequence parallel optimization.')
+    return parser
+
+
+def _add_training_args(parser):
+    group = parser.add_argument_group(title='training')
+
+    group.add_argument('--micro-batch-size', type=int, default=None,
+                       help='Batch size per model instance (local batch size). '
+                       'Global batch size is local batch size times data '
+                       'parallel size times number of micro batches.')
+    group.add_argument('--no-masked-softmax-fusion',
+                       action='store_false',
+                       help='Disable fusion of query_key_value scaling, '
+                       'masking, and softmax.',
+                       dest='masked_softmax_fusion')
+    group.add_argument('--use-flash-attn', action='store_true',
+                       help='use FlashAttention implementation of attention. '
+                       'https://arxiv.org/abs/2205.14135')
+    return parser
+
+
+
+def _add_mixed_precision_args(parser):
+    group = parser.add_argument_group(title='mixed precision')
+
+    group.add_argument('--fp16', action='store_true',
+                       help='Run model in fp16 mode.')
+    group.add_argument('--bf16', action='store_true',
+                       help='Run model in bfloat16 mode.')
+
+    return parser
+
+
+def _add_distributed_args(parser):
+    group = parser.add_argument_group(title='distributed')
+
+    group.add_argument('--tensor-model-parallel-size', type=int, default=1,
+                       help='Degree of tensor model parallelism.')
+    group.add_argument('--pipeline-model-parallel-size', type=int, default=1,
+                       help='Degree of pipeline model parallelism.')
+    group.add_argument('--pipeline-model-parallel-split-rank',
+                       type=int, default=None,
+                       help='Rank where encoder and decoder should be split.')
+    group.add_argument('--model-parallel-size', type=int, default=None,
+                       help='Old model parallel argument, do not use. Use '
+                       '--tensor-model-parallel-size instead.')
+    group.add_argument('--distributed-backend', default='nccl',
+                       choices=['nccl', 'gloo'],
+                       help='Which backend to use for distributed training.')
+    group.add_argument('--local_rank', type=int, default=None,
+                       help='local rank passed from distributed launcher.')
+    group.add_argument('--use-cpu-initialization', action='store_true',
+                       default=None, help='If set, affine parallel weights '
+                       'initialization uses CPU' )
+
+    return parser
+
--- a/megatron_mini/core/__init__.py
+++ b/megatron_mini/core/__init__.py
+from megatron_mini.core import parallel_state
+import megatron_mini.core.tensor_parallel
+import megatron_mini.core.utils
+
+# Alias parallel_state as mpu, its legacy name
+mpu = parallel_state
+
+__all__ = [
+    "parallel_state",
+    "tensor_parallel",
+    "utils",
+]
--- a/megatron_mini/core/parallel_state.py
+++ b/megatron_mini/core/parallel_state.py
--- a/megatron_mini/core/tensor_parallel/__init__.py
+++ b/megatron_mini/core/tensor_parallel/__init__.py
+from .layers import (
+    ColumnParallelLinear,
+    RowParallelLinear,
+    VocabParallelEmbedding,
+    set_tensor_model_parallel_attributes,
+    set_defaults_if_not_set_tensor_model_parallel_attributes,
+    copy_tensor_model_parallel_attributes,
+    param_is_not_tensor_parallel_duplicate,
+    linear_with_grad_accumulation_and_async_allreduce
+
+)
+
+from .mappings import (
+    copy_to_tensor_model_parallel_region,
+    gather_from_tensor_model_parallel_region,
+    gather_from_sequence_parallel_region,
+    scatter_to_tensor_model_parallel_region,
+    scatter_to_sequence_parallel_region,
+    reduce_from_tensor_model_parallel_region,
+)
+
+from .random import (
+    checkpoint,
+    get_cuda_rng_tracker,
+    model_parallel_cuda_manual_seed,
+)
+
+from .utils import (
+    split_tensor_along_last_dim,
+    split_tensor_into_1d_equal_chunks,
+    gather_split_1d_tensor,
+)
+
+__all__ = [
+    #layers.py
+    "ColumnParallelLinear",
+    "RowParallelLinear",
+    "VocabParallelEmbedding",
+    "set_tensor_model_parallel_attributes",
+    "set_defaults_if_not_set_tensor_model_parallel_attributes",
+    "copy_tensor_model_parallel_attributes",
+    "param_is_not_tensor_parallel_duplicate",
+    "linear_with_grad_accumulation_and_async_allreduce",
+    # mappings.py
+    "copy_to_tensor_model_parallel_region",
+    "gather_from_tensor_model_parallel_region",
+    "gather_from_sequence_parallel_region",
+   "reduce_from_tensor_model_parallel_region",
+    "scatter_to_tensor_model_parallel_region",
+    "scatter_to_sequence_parallel_region",
+    # random.py
+    "checkpoint",
+    "get_cuda_rng_tracker",
+    "model_parallel_cuda_manual_seed",
+    # utils.py
+    "split_tensor_along_last_dim",
+    "split_tensor_into_1d_equal_chunks",
+    "gather_split_1d_tensor",
+]
--- a/megatron_mini/core/tensor_parallel/layers.py
+++ b/megatron_mini/core/tensor_parallel/layers.py
--- a/megatron_mini/core/tensor_parallel/mappings.py
+++ b/megatron_mini/core/tensor_parallel/mappings.py
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+
+import torch
+
+from megatron_mini.core.parallel_state import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    get_tensor_model_parallel_group,
+)
+from .utils import split_tensor_along_last_dim
+
+
+def _reduce(input_):
+    """All-reduce the input tensor across model parallel group."""
+
+    # Bypass the function if we are using only 1 GPU.
+    if get_tensor_model_parallel_world_size()==1:
+        return input_
+
+    # All-reduce.
+    torch.distributed.all_reduce(input_, group=get_tensor_model_parallel_group())
+
+    return input_
+
+
+def _split_along_last_dim(input_):
+    """Split the tensor along its last dimension and keep the
+    corresponding slice."""
+
+    world_size = get_tensor_model_parallel_world_size()
+    # Bypass the function if we are using only 1 GPU.
+    if world_size == 1:
+        return input_
+
+    # Split along last dimension.
+    input_list = split_tensor_along_last_dim(input_, world_size)
+
+    # Note: torch.split does not create contiguous tensors by default.
+    rank = get_tensor_model_parallel_rank()
+    output = input_list[rank].contiguous()
+
+    return output
+
+
+def _split_along_first_dim(input_):
+    """Split the tensor along its first dimension and keep the
+    corresponding slice."""
+
+    world_size = get_tensor_model_parallel_world_size()
+    # Bypass the function if we are using only 1 GPU.
+    if world_size == 1:
+        return input_
+
+    # Split along first dimension.
+    dim_size = input_.size()[0]
+    assert dim_size % world_size == 0, \
+        "First dimension of the tensor should be divisible by tensor parallel size"
+    local_dim_size = dim_size // world_size
+    rank = get_tensor_model_parallel_rank()
+    dim_offset = rank * local_dim_size
+
+    output = input_[dim_offset:dim_offset+local_dim_size].contiguous()
+
+    return output
+
+
+def _gather_along_last_dim(input_):
+    """Gather tensors and concatinate along the last dimension."""
+
+    world_size = get_tensor_model_parallel_world_size()
+    # Bypass the function if we are using only 1 GPU.
+    if world_size == 1:
+        return input_
+
+    # Size and dimension.
+    last_dim = input_.dim() - 1
+    rank = get_tensor_model_parallel_rank()
+
+    tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
+    tensor_list[rank] = input_
+    torch.distributed.all_gather(tensor_list, input_, group=get_tensor_model_parallel_group())
+
+    # Note: torch.cat already creates a contiguous tensor.
+    output = torch.cat(tensor_list, dim=last_dim).contiguous()
+
+    return output
+
+
+def _gather_along_first_dim(input_):
+    """Gather tensors and concatinate along the first dimension."""
+
+    world_size = get_tensor_model_parallel_world_size()
+    # Bypass the function if we are using only 1 GPU.
+    if world_size == 1:
+        return input_
+
+    dim_size = list(input_.size())
+    dim_size[0] = dim_size[0] * world_size
+
+    output = torch.empty(dim_size, dtype=input_.dtype,
+                         device=torch.cuda.current_device())
+    torch.distributed._all_gather_base(output, input_.contiguous(),
+                                       group=get_tensor_model_parallel_group())
+
+    return output
+
+def _reduce_scatter_along_first_dim(input_):
+    """Reduce-scatter the input tensor across model parallel group."""
+    world_size = get_tensor_model_parallel_world_size()
+    # Bypass the function if we are using only 1 GPU.
+    if world_size == 1:
+        return input_
+
+    dim_size = list(input_.size())
+    assert dim_size[0] % world_size == 0, \
+        "First dimension of the tensor should be divisible by tensor parallel size"
+    
+    dim_size[0] = dim_size[0] // world_size
+   
+    output = torch.empty(dim_size, dtype=input_.dtype,
+                         device=torch.cuda.current_device())
+    torch.distributed._reduce_scatter_base(output, input_.contiguous(), 
+                                           group=get_tensor_model_parallel_group())
+    return output
+
+
+class _CopyToModelParallelRegion(torch.autograd.Function):
+    """Pass the input to the model parallel region."""
+
+    @staticmethod
+    def symbolic(graph, input_):
+        return input_
+    
+    @staticmethod
+    def forward(ctx, input_):
+        return input_
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return _reduce(grad_output)
+
+
+class _ReduceFromModelParallelRegion(torch.autograd.Function):
+    """All-reduce the input from the model parallel region."""
+
+    @staticmethod
+    def symbolic(graph, input_):
+        return _reduce(input_)
+    
+    @staticmethod
+    def forward(ctx, input_):
+        return _reduce(input_)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return grad_output
+
+
+class _ScatterToModelParallelRegion(torch.autograd.Function):
+    """Split the input and keep only the corresponding chuck to the rank."""
+
+    @staticmethod
+    def symbolic(graph, input_):
+        return _split_along_last_dim(input_)
+
+    @staticmethod
+    def forward(ctx, input_):
+        return _split_along_last_dim(input_)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return _gather_along_last_dim(grad_output)
+
+
+class _GatherFromModelParallelRegion(torch.autograd.Function):
+    """Gather the input from model parallel region and concatinate."""
+
+    @staticmethod
+    def symbolic(graph, input_):
+        return _gather_along_last_dim(input_)
+    
+    @staticmethod
+    def forward(ctx, input_):
+        return _gather_along_last_dim(input_)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return _split_along_last_dim(grad_output)
+
+
+class _ScatterToSequenceParallelRegion(torch.autograd.Function):
+    """Split the input and keep only the corresponding chuck to the rank."""
+
+    @staticmethod
+    def symbolic(graph, input_):
+        return _split_along_first_dim(input_)
+
+    @staticmethod
+    def forward(ctx, input_):
+        return _split_along_first_dim(input_)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return _gather_along_first_dim(grad_output)
+
+
+class _GatherFromSequenceParallelRegion(torch.autograd.Function):
+    """Gather the input from sequence parallel region and concatinate.""" 
+
+    @staticmethod
+    def symbolic(graph, input_, tensor_parallel_output_grad=True):
+        return _gather_along_first_dim(input_)
+    
+    @staticmethod
+    def forward(ctx, input_, tensor_parallel_output_grad=True):
+        ctx.tensor_parallel_output_grad = tensor_parallel_output_grad
+        return _gather_along_first_dim(input_)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        tensor_parallel_output_grad = ctx.tensor_parallel_output_grad
+
+        # If the computation graph after the gather operation is
+        # in the tensor parallel mode, output gradients need to reduce 
+        # scattered and whereas if the computation is duplicated, 
+        # output gradients need to be scattered.
+        if tensor_parallel_output_grad:
+            return _reduce_scatter_along_first_dim(grad_output), None
+        else:
+            return _split_along_first_dim(grad_output), None
+
+
+class _ReduceScatterToSequenceParallelRegion(torch.autograd.Function):
+    """Reduce scatter the input from the model parallel region."""
+
+    @staticmethod
+    def symbolic(graph, input_):
+        return _reduce_scatter_along_first_dim(input_)
+    
+    @staticmethod
+    def forward(ctx, input_):
+        return _reduce_scatter_along_first_dim(input_)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return _gather_along_first_dim(grad_output)
+
+
+# -----------------
+# Helper functions.
+# -----------------
+
+def copy_to_tensor_model_parallel_region(input_):
+    return _CopyToModelParallelRegion.apply(input_)
+
+
+def reduce_from_tensor_model_parallel_region(input_):
+    return _ReduceFromModelParallelRegion.apply(input_)
+
+
+def scatter_to_tensor_model_parallel_region(input_):
+    return _ScatterToModelParallelRegion.apply(input_)
+
+
+def gather_from_tensor_model_parallel_region(input_):
+    return _GatherFromModelParallelRegion.apply(input_)
+
+
+def scatter_to_sequence_parallel_region(input_):
+    return _ScatterToSequenceParallelRegion.apply(input_)
+
+
+def gather_from_sequence_parallel_region(input_, tensor_parallel_output_grad=True):
+    return _GatherFromSequenceParallelRegion.apply(input_, tensor_parallel_output_grad)
+
+
+def reduce_scatter_to_sequence_parallel_region(input_):
+    return _ReduceScatterToSequenceParallelRegion.apply(input_)
+
--- a/megatron_mini/core/tensor_parallel/random.py
+++ b/megatron_mini/core/tensor_parallel/random.py
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+
+# Parts of the code here are adapted from PyTorch
+# repo: https://github.com/pytorch/pytorch
+
+import contextlib
+
+import torch
+from torch import _C
+from torch.cuda import _lazy_call, device as device_ctx_manager
+from torch.utils.checkpoint import detach_variable
+
+from megatron_mini.core.parallel_state import (
+    get_data_parallel_rank,
+    get_tensor_model_parallel_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+
+from .utils import (
+    split_tensor_into_1d_equal_chunks,
+    gather_split_1d_tensor,
+)
+
+from megatron_mini.core.utils import safely_set_viewless_tensor_data
+
+# Default name for the model parallel rng tracker.
+_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng'
+
+
+def _set_cuda_rng_state(new_state, device=-1):
+    """Sets the random number generator state of the current GPU.
+
+    Argumentss:
+        new_state (torch.ByteTensor): The desired state
+    This function is adapted from PyTorch repo (torch.cuda.set_rng_state)
+    with a single change: the input state is not cloned. Cloning caused
+    major performance issues for +4 GPU cases.
+    """
+    if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState):
+        # older PyTorch
+        def cb():
+            with device_ctx_manager(device):
+                _C._cuda_setRNGState(new_state)
+    else:
+        # newer PyTorch
+        if device == -1:
+            device = torch.device('cuda')
+        elif isinstance(device, str):
+            device = torch.device(device)
+        elif isinstance(device, int):
+            device = torch.device('cuda', device)
+
+        def cb():
+            idx = device.index
+            if idx is None:
+                idx = torch.cuda.current_device()
+            default_generator = torch.cuda.default_generators[idx]
+            default_generator.set_state(new_state)
+
+    _lazy_call(cb)
+
+
+
+class CudaRNGStatesTracker:
+    """Tracker for the cuda RNG states.
+
+    Using the `add` method, a cuda rng state is initialized based on
+    the input `seed` and is assigned to `name`. Later, by forking the
+    rng state, we can perform operations and return to our starting
+    cuda state.
+    """
+
+    def __init__(self):
+        # Map from a string name to the cuda rng state.
+        self.states_ = {}
+        # Seeds are just for book keeping and ensure no seed is set twice.
+        self.seeds_ = set()
+
+    def reset(self):
+        """Set to the initial state (no tracker)."""
+        self.states_ = {}
+        self.seeds_ = set()
+
+    def get_states(self):
+        """Get rng states. Copy the dictionary so we have direct
+        pointers to the states, not just a pointer to the dictionary."""
+        states = {}
+        for name in self.states_:
+            states[name] = self.states_[name]
+        return states
+
+    def set_states(self, states):
+        """Set the rng states. For efficiency purposes, we do not check
+        the size of seed for compatibility."""
+        self.states_ = states
+
+    def add(self, name, seed):
+        """Track the rng state."""
+        # Check seed is not already used.
+        if seed in self.seeds_:
+            raise Exception('seed {} already exists'.format(seed))
+        self.seeds_.add(seed)
+        # Check that state is not already defined.
+        if name in self.states_:
+            raise Exception('cuda rng state {} already exists'.format(name))
+        # Get the current rng state.
+        orig_rng_state = torch.cuda.get_rng_state()
+        # Set the new state and store it.
+        torch.cuda.manual_seed(seed)
+        self.states_[name] = torch.cuda.get_rng_state()
+        # Reset rng state to what it was.
+        _set_cuda_rng_state(orig_rng_state)
+
+    @contextlib.contextmanager
+    def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME):
+        """Fork the cuda rng state, perform operations, and exit with
+        the original state."""
+        # Check if we have added the state
+        if name not in self.states_:
+            raise Exception('cuda rng state {} is not added'.format(name))
+        # Store current rng state.
+        orig_cuda_rng_state = torch.cuda.get_rng_state()
+        # Set rng state to the desired one
+        _set_cuda_rng_state(self.states_[name])
+        # Do the stuff we wanted to do.
+        try:
+            yield
+        finally:
+            # Update the current rng state for later use.
+            self.states_[name] = torch.cuda.get_rng_state()
+            # And set the state to the original state we started with.
+            _set_cuda_rng_state(orig_cuda_rng_state)
+
+
+# RNG tracker object.
+_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker()
+
+
+def get_cuda_rng_tracker():
+    """Get cuda rng tracker."""
+    return _CUDA_RNG_STATE_TRACKER
+
+
+def model_parallel_cuda_manual_seed(seed):
+    """Initialize model parallel cuda seed.
+
+    This function should be called after the model parallel is
+    initialized. Also, no torch.cuda.manual_seed should be called
+    after this function. Basically, this is replacement for that
+    function.
+    Two set of RNG states are tracked:
+        default state: This is for data parallelism and is the same among a
+                       set of model parallel GPUs but different across
+                       different model paralle groups. This is used for
+                       example for dropout in the non-tensor-model-parallel regions.
+        tensor-model-parallel state: This state is different among a set of model
+                              parallel GPUs, but the same across data parallel
+                              groups. This is used for example for dropout in
+                              model parallel regions.
+    """
+    # 2718 is just for fun and any POSITIVE value will work.
+    offset = seed + 2718
+    tensor_model_parallel_seed = offset + get_tensor_model_parallel_rank()
+    # Data parallel gets the original seed.
+    data_parallel_seed = seed
+
+    _CUDA_RNG_STATE_TRACKER.reset()
+    # Set the default state.
+    torch.cuda.manual_seed(data_parallel_seed)
+    # and model parallel state.
+    _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME,
+                                tensor_model_parallel_seed)
+
+
+class CheckpointFunction(torch.autograd.Function):
+    """This function is adapted from torch.utils.checkpoint with
+       two main changes:
+           1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state`
+           2) the states in the model parallel tracker are also properly
+              tracked/set/reset.
+    """
+    @staticmethod
+    def forward(ctx, run_function, distribute_saved_activations, *args):
+        ctx.run_function = run_function
+        ctx.distribute_saved_activations \
+            = distribute_saved_activations
+
+        # Copy the rng states.
+        ctx.fwd_cpu_rng_state = torch.get_rng_state()
+        ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state()
+        ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
+
+        with torch.no_grad():
+            outputs = run_function(*args)
+
+        # Divide hidden states across model parallel group and only keep
+        # the chunk corresponding to the current rank.
+        if distribute_saved_activations:
+            ctx.input_0_shape = args[0].data.shape
+            safely_set_viewless_tensor_data(
+                args[0],
+                split_tensor_into_1d_equal_chunks(args[0].data, new_buffer=True))
+
+        # Store everything.
+        ctx.save_for_backward(*args)
+
+        return outputs
+
+    @staticmethod
+    def backward(ctx, *args):
+        if not torch.autograd._is_checkpoint_valid():
+            raise RuntimeError("Checkpointing is not compatible with .grad(), "
+                               "please use .backward() if possible")
+        inputs = ctx.saved_tensors
+        if ctx.distribute_saved_activations:
+            safely_set_viewless_tensor_data(
+                inputs[0],
+                gather_split_1d_tensor(inputs[0].data).view(ctx.input_0_shape))
+
+        # Store the current states.
+        bwd_cpu_rng_state = torch.get_rng_state()
+        bwd_cuda_rng_state = torch.cuda.get_rng_state()
+        bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
+
+        # Set the states to what it used to be before the forward pass.
+        torch.set_rng_state(ctx.fwd_cpu_rng_state)
+        _set_cuda_rng_state(ctx.fwd_cuda_rng_state)
+        get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker)
+
+        # Compute the forward pass.
+        detached_inputs = detach_variable(inputs)
+        with torch.enable_grad():
+            outputs = ctx.run_function(*detached_inputs)
+
+        # Set the states back to what it was at the start of this function.
+        torch.set_rng_state(bwd_cpu_rng_state)
+        _set_cuda_rng_state(bwd_cuda_rng_state)
+        get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker)
+
+        if isinstance(outputs, torch.Tensor):
+            outputs = (outputs,)
+        torch.autograd.backward(outputs, args)
+        grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else inp
+                      for inp in detached_inputs)
+        return (None, None) + grads
+
+
+def checkpoint(function, distribute_saved_activations, *args):
+    """Checkpoint a model or part of the model.
+    This has been directly copied from torch.utils.checkpoint."""
+    return CheckpointFunction.apply(function,
+                                    distribute_saved_activations, *args)
--- a/megatron_mini/core/tensor_parallel/utils.py
+++ b/megatron_mini/core/tensor_parallel/utils.py
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+
+import torch
+from typing import List, Sequence, Tuple
+
+from megatron_mini.core.utils import divide
+from megatron_mini.core import parallel_state
+
+def split_tensor_along_last_dim(
+    tensor: torch.Tensor,
+    num_partitions: int,
+    contiguous_split_chunks: bool = False,
+) -> List[torch.Tensor]:
+    """ Split a tensor along its last dimension.
+
+        Arguments:
+            tensor: input tensor.
+            num_partitions: number of partitions to split the tensor
+            contiguous_split_chunks: If True, make each chunk contiguous
+                                     in memory.
+
+        Returns:
+            A list of Tensors
+    """
+    # Get the size and dimension.
+    last_dim = tensor.dim() - 1
+    last_dim_size = divide(tensor.size()[last_dim], num_partitions)
+    # Split.
+    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
+    # Note: torch.split does not create contiguous tensors by default.
+    if contiguous_split_chunks:
+        return [chunk.contiguous() for chunk in tensor_list]
+
+    return list(tensor_list)
+
+def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False):
+    """ Break a tensor into equal 1D chunks across tensor parallel ranks.
+
+        Returns a Tensor or View with this rank's portion of the data.
+
+        Arguments:
+            tensor: The tensor to split
+
+        Keyword Arguments:
+            new_buffer (bool): If True, returns a new Tensor.
+                               If False, returns a view into the existing Tensor.
+                               Default is False
+
+    """
+    partition_size = torch.numel(tensor) // \
+        parallel_state.get_tensor_model_parallel_world_size()
+    start_index = partition_size * parallel_state.get_tensor_model_parallel_rank()
+    end_index = start_index + partition_size
+    if new_buffer:
+        data = torch.empty(partition_size, dtype=tensor.dtype,
+                           device=torch.cuda.current_device(),
+                           requires_grad=False)
+        data.copy_(tensor.view(-1)[start_index:end_index])
+    else:
+        data = tensor.view(-1)[start_index:end_index]
+    return data
+
+
+def gather_split_1d_tensor(tensor):
+    """ Opposite of split_tensor_into_1d_equal_chunks. Gather values from tensor
+        model parallel ranks.
+
+        Returns a new Tensor with the gathered data.
+
+        Arguments:
+            tensor: A Tensor or view of this rank's portion of the data.
+    """
+    numel_gathered = torch.numel(tensor) * \
+        parallel_state.get_tensor_model_parallel_world_size()
+    gathered = torch.empty(numel_gathered, dtype=tensor.dtype,
+                           device=torch.cuda.current_device(),
+                           requires_grad=False)
+    # TODO: This API is experimental in pytorch (as of Feb 2022) and
+    # this might break in future pytorch releases. We chose this API
+    # as opposed to torch.distributed.all_gather for efficiency reasons.
+    # This API calls directly NCCL all-gather versus the former does
+    # internal copies and can potentially cause slow down.
+    torch.distributed._all_gather_base(gathered, tensor,
+                                       group=parallel_state.get_tensor_model_parallel_group())
+    return gathered
+
+
+class VocabUtility:
+    """ Split the vocabulary into `world_size` chunks and return the first
+        and last index of the vocabulary belonging to the `rank`
+        partition: Note that indices in [fist, last)
+
+    """
+
+    @staticmethod
+    def vocab_range_from_per_partition_vocab_size(
+        per_partition_vocab_size: int, rank, world_size: int
+    ) -> Sequence[int]:
+        index_f = rank * per_partition_vocab_size
+        index_l = index_f + per_partition_vocab_size
+        return index_f, index_l
+
+    @staticmethod
+    def vocab_range_from_global_vocab_size(global_vocab_size: int, rank: int, world_size: int) -> Sequence[int]:
+        per_partition_vocab_size = divide(global_vocab_size, world_size)
+        return VocabUtility.vocab_range_from_per_partition_vocab_size(
+            per_partition_vocab_size, rank, world_size
+        )
--- a/megatron_mini/core/utils.py
+++ b/megatron_mini/core/utils.py
--- a/megatron_mini/filter.py
+++ b/megatron_mini/filter.py
--- a/megatron_mini/global_vars.py
+++ b/megatron_mini/global_vars.py
--- a/megatron_mini/initialize.py
+++ b/megatron_mini/initialize.py