GPT2 base on megatron-deepspeed

8ec5d678 · hepj987 · 8ec5d678 · 8ec5d678 · 8ec5d678 · 8ec5d678
Commit 8ec5d678 authored Apr 03, 2023 by hepj987
20 changed files
--- a/megatron-deepspeed_dtk22.10/gpt2-merges.txt
+++ b/megatron-deepspeed_dtk22.10/gpt2-merges.txt
--- a/megatron-deepspeed_dtk22.10/gpt2-vocab.json
+++ b/megatron-deepspeed_dtk22.10/gpt2-vocab.json
--- a/megatron-deepspeed_dtk22.10/images/cases_april2021.png
+++ b/megatron-deepspeed_dtk22.10/images/cases_april2021.png
--- a/megatron-deepspeed_dtk22.10/megatron/__init__.py
+++ b/megatron-deepspeed_dtk22.10/megatron/__init__.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+from .package_info import (
+    __description__,
+    __contact_names__,
+    __url__,
+    __download_url__,
+    __keywords__,
+    __license__,
+    __package_name__,
+    __version__,
+)
+from .global_vars import get_args
+from .global_vars import get_current_global_batch_size
+from .global_vars import get_num_microbatches
+from .global_vars import update_num_microbatches
+from .global_vars import get_tokenizer
+from .global_vars import get_tensorboard_writer
+from .global_vars import get_adlr_autoresume
+from .global_vars import get_timers
+from .initialize  import initialize_megatron
+def print_rank_0(message):
+    """Print ``message`` once per job.
+    When torch.distributed is initialized only rank 0 prints; when it is not
+    initialized the message is printed unconditionally. Output is flushed."""
+    distributed = torch.distributed
+    # Guard clause: in a distributed run every rank other than 0 stays silent.
+    if distributed.is_initialized() and distributed.get_rank() != 0:
+        return
+    print(message, flush=True)
+def is_last_rank():
+    """Return True when this process holds the highest global rank, i.e. its
+    torch.distributed rank equals the world size minus one."""
+    rank = torch.distributed.get_rank()
+    world_size = torch.distributed.get_world_size()
+    return rank == world_size - 1
+def print_rank_last(message):
+    """Print ``message`` once per job.
+    When torch.distributed is initialized only the last rank prints; when it
+    is not initialized the message is printed unconditionally."""
+    # Short-circuit keeps is_last_rank() from running without a process group.
+    should_print = (not torch.distributed.is_initialized()) or is_last_rank()
+    if should_print:
+        print(message, flush=True)
--- a/megatron-deepspeed_dtk22.10/megatron/arguments.py
+++ b/megatron-deepspeed_dtk22.10/megatron/arguments.py
--- a/megatron-deepspeed_dtk22.10/megatron/arguments.py-nodes
+++ b/megatron-deepspeed_dtk22.10/megatron/arguments.py-nodes
--- a/megatron-deepspeed_dtk22.10/megatron/arguments.py-one_node
+++ b/megatron-deepspeed_dtk22.10/megatron/arguments.py-one_node
--- a/megatron-deepspeed_dtk22.10/megatron/checkpointing.py
+++ b/megatron-deepspeed_dtk22.10/megatron/checkpointing.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Input/output checkpointing."""
+import os
+import random
+import sys
+import numpy as np
+import torch
+from megatron.global_vars import codecarbon_tracker_flush
+from megatron import (get_args,
+                      mpu,
+                      print_rank_0,
+                      update_num_microbatches,
+                      utils,
+                      get_tokenizer)
+from megatron.enums import PositionEmbeddingType
+_CHECKPOINT_VERSION = None
+def set_checkpoint_version(value):
+    """Record ``value`` as the module-wide checkpoint version.
+    Once a version has been recorded, any later call must pass an equal
+    value; a different one fails the assertion."""
+    global _CHECKPOINT_VERSION
+    assert _CHECKPOINT_VERSION is None or _CHECKPOINT_VERSION == value, \
+        "checkpoint versions do not match"
+    _CHECKPOINT_VERSION = value
+def get_checkpoint_version():
+    """Return the checkpoint version recorded by set_checkpoint_version(),
+    or None when none has been recorded yet."""
+    # Read-only access to the module global; no ``global`` statement needed.
+    return _CHECKPOINT_VERSION
+def check_checkpoint_args(checkpoint_args):
+    """Assert that the model-defining arguments stored in a checkpoint equal
+    those of the current run.
+    ``checkpoint_args`` is the argument object read from the checkpoint; the
+    current values come from ``get_args()``. The first mismatch raises
+    AssertionError with a message naming the argument and both values."""
+    args = get_args()
+    def _compare(arg_name, old_arg_name=None):
+        # ``old_arg_name`` is the attribute to read on the checkpoint side
+        # when it differs from the current argument name.
+        lookup_name = arg_name if old_arg_name is None else old_arg_name
+        checkpoint_value = getattr(checkpoint_args, lookup_name)
+        args_value = getattr(args, arg_name)
+        error_message = '{} value from checkpoint ({}) is not equal to the ' \
+                        'input argument value ({}).'.format(
+                            arg_name, checkpoint_value, args_value)
+        assert checkpoint_value == args_value, error_message
+    for always_checked in ('num_layers', 'hidden_size',
+                           'num_attention_heads', 'position_embedding_type'):
+        _compare(always_checked)
+    # with alibi we can change `max_position_embeddings`
+    if args.position_embedding_type != PositionEmbeddingType.alibi:
+        _compare('max_position_embeddings')
+    # Vocabulary-related arguments are only compared when a vocab file is set.
+    if args.vocab_file:
+        for vocab_related in ('make_vocab_size_divisible_by',
+                              'padded_vocab_size', 'tokenizer_type'):
+            _compare(vocab_related)
+    version = get_checkpoint_version()
+    if version < 3.0:
+        # Checkpoints before 3.0 store the size under 'model_parallel_size'.
+        _compare('tensor_model_parallel_size',
+                 old_arg_name='model_parallel_size')
+    if version >= 3.0:
+        for parallel_size in ('tensor_model_parallel_size',
+                              'pipeline_model_parallel_size'):
+            _compare(parallel_size)
+def ensure_directory_exists(filename):
+    """Create the parent directory of ``filename`` if it does not exist yet.
+    filename: path of a file about to be written; only its directory part is
+        created, never the file itself.
+    Returns None. Calling it again, or from several processes at once, is
+    safe because an already existing directory is accepted.
+    """
+    dirname = os.path.dirname(filename)
+    # A bare file name has an empty directory part; os.makedirs('') would
+    # raise FileNotFoundError, and the current directory already exists.
+    if not dirname:
+        return
+    # exist_ok=True removes the check-then-create race of the former
+    # ``if not os.path.exists(...)`` test when ranks save concurrently.
+    os.makedirs(dirname, exist_ok=True)
+def get_checkpoint_name(checkpoints_path, iteration,
+                        release=False):
+    """Build the checkpoint file path for the calling model-parallel rank.
+    The result is
+    ``<checkpoints_path>/<release | iter_NNNNNNN>/<rank dir>/model_optim_rng.pt``.
+    The rank directory is ``mp_rank_TT`` when the pipeline-parallel world
+    size is 1 and ``mp_rank_TT_PPP`` otherwise (T = tensor rank,
+    P = pipeline rank)."""
+    directory = 'release' if release else 'iter_{:07d}'.format(iteration)
+    # Use both the tensor and pipeline MP rank.
+    if mpu.get_pipeline_model_parallel_world_size() == 1:
+        rank_directory = 'mp_rank_{:02d}'.format(
+            mpu.get_tensor_model_parallel_rank())
+    else:
+        rank_directory = 'mp_rank_{:02d}_{:03d}'.format(
+            mpu.get_tensor_model_parallel_rank(),
+            mpu.get_pipeline_model_parallel_rank())
+    return os.path.join(checkpoints_path, directory, rank_directory,
+                        'model_optim_rng.pt')
+def get_checkpoint_tracker_filename(checkpoints_path):
+    """Return the path of the tracker file inside ``checkpoints_path``.
+    The tracker file records the latest checkpoint written during training,
+    i.e. the one to restart from."""
+    tracker_basename = 'latest_checkpointed_iteration.txt'
+    return os.path.join(checkpoints_path, tracker_basename)
+def save_checkpoint(iteration, model, optimizer, lr_scheduler):
+    """Save a model checkpoint.
+    iteration: training iteration, stored in the checkpoint and used to
+        build the checkpoint path under ``args.save``.
+    model: list of model chunks; with DeepSpeed, ``model[0].save_checkpoint``
+        does the actual writing.
+    optimizer, lr_scheduler: their state dicts are stored on the
+        non-DeepSpeed path unless ``args.no_save_optim`` is set; either may
+        be None.
+    Without DeepSpeed the state is written with ``torch.save`` and the
+    tracker file is updated by global rank 0. Returns None.
+    """
+    args = get_args()
+    # Only rank zero of the data parallel writes to the disk.
+    if not args.deepspeed:
+        model = utils.unwrap_model(model)
+    print_rank_0('saving checkpoint at iteration {:7d} to {}'.format(
+        iteration, args.save))
+    # The state dict is built when torch.distributed is not initialized, on
+    # data-parallel rank 0, or always under DeepSpeed (where it is handed
+    # over as client_state further down).
+    if not torch.distributed.is_initialized() or mpu.get_data_parallel_rank() == 0 \
+        or args.deepspeed:
+        # Arguments, iteration, and model.
+        state_dict = {}
+        state_dict['args'] = args
+        state_dict['checkpoint_version'] = 3.0
+        state_dict['iteration'] = iteration
+        state_dict['tokens'] = args.consumed_train_tokens
+        state_dict['checkpoint_info'] = _checkpoint_info()
+        # DeepSpeed saves the model/optimizer/scheduler
+        if not args.deepspeed:
+            if len(model) == 1:
+                state_dict['model'] = model[0].state_dict_for_save_checkpoint()
+            else:
+                # Several model chunks: each one is stored under 'model<i>'
+                # after selecting the matching virtual pipeline rank.
+                for i in range(len(model)):
+                    mpu.set_virtual_pipeline_model_parallel_rank(i)
+                    state_dict['model%d' % i] = model[i].state_dict_for_save_checkpoint()
+            # Optimizer stuff.
+            if not args.no_save_optim:
+                if optimizer is not None:
+                    state_dict['optimizer'] = optimizer.state_dict()
+                if lr_scheduler is not None:
+                    state_dict['lr_scheduler'] = lr_scheduler.state_dict()
+        # RNG states.
+        if not args.no_save_rng:
+            state_dict['random_rng_state'] = random.getstate()
+            state_dict['np_rng_state'] = np.random.get_state()
+            state_dict['torch_rng_state'] = torch.get_rng_state()
+            state_dict['cuda_rng_state'] = torch.cuda.get_rng_state()
+            state_dict['rng_tracker_states'] \
+                = mpu.get_cuda_rng_tracker().get_states()
+        # Save.
+        checkpoint_name = get_checkpoint_name(args.save, iteration)
+        if not args.deepspeed:
+            ensure_directory_exists(checkpoint_name)
+            torch.save(state_dict, checkpoint_name)
+    if args.deepspeed:
+        # Saving is a collective communication
+        checkpoint_name = get_checkpoint_name(args.save, iteration)
+        # Trim off the filename and mp_rank_* directory.
+        # (three dirname() calls in total, so the iteration directory is
+        # removed as well)
+        for _ in range(3):
+            checkpoint_name = os.path.dirname(checkpoint_name)
+        model[0].save_checkpoint(checkpoint_name, client_state=state_dict)
+    # Wait so everyone is done (necessary)
+    if torch.distributed.is_initialized():
+        torch.distributed.barrier()
+    print_rank_0('  successfully saved checkpoint at iteration {:7d} to {}'.format(
+        iteration, args.save))
+    # And update the latest iteration
+    # (skipped under DeepSpeed; otherwise written by global rank 0, or by the
+    # only process when torch.distributed is not initialized)
+    if (not args.deepspeed
+        and (not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0)):
+        tracker_filename = get_checkpoint_tracker_filename(args.save)
+        with open(tracker_filename, 'w') as f:
+            f.write(str(iteration))
+    # Wait so everyone is done (not necessary)
+    if torch.distributed.is_initialized():
+        torch.distributed.barrier()
+    # since the code can be exited or aborted in various places we use the checkpoint saving as
+    # a safe saving point for the codecarbon tracker. If the program doesn't run to its normal
+    # end, then only the data since the last saved checkpoint will be lost.
+    codecarbon_tracker_flush()
+def _transpose_first_dim(t, num_splits, num_splits_first, model):
+    """Reorder the first dimension of ``t`` into [np, num_splits, hn] order.
+    t: tensor whose dim 0 has num_splits * np * hn entries; trailing
+        dimensions are left as they are.
+    num_splits: number of fused projections along dim 0.
+    num_splits_first: True when dim 0 is laid out as [num_splits, np, hn],
+        False when it is laid out as [np, hn, num_splits].
+    model: (possibly wrapped) model; np and hn are read from the self
+        attention module of its first encoder layer.
+    Returns a tensor with the same shape as ``t``.
+    """
+    original_shape = t.size()
+    trailing_dims = original_shape[1:]
+    # We use a self_attention module but the values extracted aren't
+    # specific to self attention so should work for cross attention as well
+    unwrapped = model
+    while hasattr(unwrapped, 'module'):
+        unwrapped = unwrapped.module
+    attention = unwrapped.language_model.encoder.layers[0].self_attention
+    head_dim = attention.hidden_size_per_attention_head
+    heads_per_partition = attention.num_attention_heads_per_partition
+    if num_splits_first:
+        # [num_splits * np * hn, h]
+        # -->(view) [num_splits, np, hn, h]
+        # -->(transpose) [np, num_splits, hn, h]
+        # -->(view) [np * num_splits * hn, h]
+        split_shape = (num_splits, heads_per_partition, head_dim) + trailing_dims
+        swapped_dims = (0, 1)
+    else:
+        # [np * hn * num_splits, h]
+        # -->(view) [np, hn, num_splits, h]
+        # -->(transpose) [np, num_splits, hn, h]
+        # -->(view) [np * num_splits * hn, h]
+        split_shape = (heads_per_partition, head_dim, num_splits) + trailing_dims
+        swapped_dims = (1, 2)
+    reordered = t.view(*split_shape).transpose(*swapped_dims).contiguous()
+    return reordered.view(*original_shape)
+def fix_query_key_value_ordering(model, checkpoint_version):
+    """Rewrite fused attention parameters loaded from old checkpoints.
+    Does nothing unless ``checkpoint_version`` is below 2.0. Otherwise every
+    parameter named ``*.query_key_value.{weight,bias}`` (3 fused projections)
+    or ``*.key_value.{weight,bias}`` (2 fused projections) is reordered in
+    place through _transpose_first_dim(). Versions below 2.0 other than 0 and
+    1.0 print an error on rank 0 and exit the process.
+    """
+    if not (checkpoint_version < 2.0):
+        return
+    if isinstance(model, list):
+        assert len(model)==1
+        model = model[0]
+    # (parameter-name suffixes, number of fused projections)
+    fused_layouts = (
+        (('.query_key_value.weight', '.query_key_value.bias'), 3),
+        (('.key_value.weight', '.key_value.bias'), 2),
+    )
+    for name, param in model.named_parameters():
+        for suffixes, num_splits in fused_layouts:
+            if not name.endswith(suffixes):
+                continue
+            # Version 0 stores the splits first; version 1.0 stores them last.
+            if checkpoint_version == 0:
+                splits_first = True
+            elif checkpoint_version == 1.0:
+                splits_first = False
+            else:
+                print_rank_0(f"Invalid checkpoint version {checkpoint_version}.")
+                sys.exit()
+            fixed_param = _transpose_first_dim(
+                param.data, num_splits, splits_first, model)
+            param.data.copy_(fixed_param)
+    print_rank_0(" succesfully fixed query-key-values ordering for"
+                " checkpoint version {}".format(checkpoint_version))
+def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True):
+    """Load a model checkpoint and return the iteration.
+    model: list of model chunks; with DeepSpeed, ``model[0].load_checkpoint``
+        does the actual loading.
+    optimizer, lr_scheduler: restored on the non-DeepSpeed path unless a
+        release checkpoint is loaded or finetune / no_load_optim is set;
+        either may be None.
+    load_arg (str): name of the attribute of the global args holding the
+        directory to load from.
+    strict (bool): whether to strictly enforce that the keys in
+        :attr:`state_dict` of the checkpoint match the names of
+        parameters and buffers in model.
+    Returns 0 when no checkpoint is found, when finetuning, or when a release
+    checkpoint is loaded; otherwise the iteration stored in the checkpoint.
+    Unrecoverable problems print a message on rank 0 and call ``sys.exit()``.
+    """
+    args = get_args()
+    load_dir = getattr(args, load_arg)
+    if args.deepspeed:
+        load_optimizer_states = not args.no_load_optim
+        loaded_dir, state_dict = model[0].load_checkpoint(load_dir, load_optimizer_states=load_optimizer_states)
+        if loaded_dir is None:
+            print_rank_0('WARNING: could not find the metadata file {} '.format(
+                load_dir))
+            print_rank_0('    will not load any checkpoints and will start from '
+                        'random')
+            return 0
+        release = False
+        # The error messages further down interpolate ``checkpoint_name``;
+        # without this binding the DeepSpeed path raised NameError instead of
+        # reporting a missing iteration or RNG state.
+        checkpoint_name = loaded_dir
+    else:
+        model = utils.unwrap_model(model)
+        # Read the tracker file and set the iteration.
+        tracker_filename = get_checkpoint_tracker_filename(load_dir)
+        # If no tracker file, return iteration zero.
+        if not os.path.isfile(tracker_filename):
+            print_rank_0('WARNING: could not find the metadata file {} '.format(
+                tracker_filename))
+            print_rank_0('    will not load any checkpoints and will start from '
+                        'random')
+            return 0
+        # Otherwise, read the tracker file and either set the iteration or
+        # mark it as a release checkpoint.
+        iteration = 0
+        release = False
+        with open(tracker_filename, 'r') as f:
+            metastring = f.read().strip()
+            try:
+                iteration = int(metastring)
+            except ValueError:
+                release = metastring == 'release'
+                if not release:
+                    print_rank_0('ERROR: Invalid metadata file {}. Exiting'.format(
+                        tracker_filename))
+                    sys.exit()
+        assert iteration > 0 or release, 'error parsing metadata file {}'.format(
+            tracker_filename)
+        # Checkpoint.
+        checkpoint_name = get_checkpoint_name(load_dir, iteration, release)
+        # Report the directory actually used: ``load_arg`` may select an
+        # attribute other than ``args.load``.
+        print_rank_0(f' loading checkpoint from {load_dir} at iteration {iteration}')
+        # Load the checkpoint.
+        try:
+            state_dict = torch.load(checkpoint_name, map_location='cpu')
+        except ModuleNotFoundError:
+            from megatron.fp16_deprecated import loss_scaler
+            # For backward compatibility.
+            # Old checkpoints reference the loss scaler under former module
+            # paths; alias them while unpickling, then drop the aliases.
+            print_rank_0(' > deserializing using the old code structure ...')
+            sys.modules['fp16.loss_scaler'] = sys.modules[
+                'megatron.fp16_deprecated.loss_scaler']
+            sys.modules['megatron.fp16.loss_scaler'] = sys.modules[
+                'megatron.fp16_deprecated.loss_scaler']
+            state_dict = torch.load(checkpoint_name, map_location='cpu')
+            sys.modules.pop('fp16.loss_scaler', None)
+            sys.modules.pop('megatron.fp16.loss_scaler', None)
+        except BaseException as e:
+            # NOTE(review): BaseException also covers KeyboardInterrupt and
+            # SystemExit; kept so every load failure ends in sys.exit().
+            print_rank_0('could not load the checkpoint')
+            print_rank_0(e)
+            sys.exit()
+    # set checkpoint version
+    set_checkpoint_version(state_dict.get('checkpoint_version', 0))
+    # Set iteration.
+    if args.finetune or release:
+        iteration = 0
+    else:
+        try:
+            iteration = state_dict['iteration']
+            if 'tokens' in state_dict:
+                args.consumed_train_tokens = state_dict['tokens']
+        except KeyError:
+            try:  # Backward compatible with older checkpoints
+                iteration = state_dict['total_iters']
+            except KeyError:
+                print_rank_0('A metadata file exists but unable to load '
+                             'iteration from checkpoint {}, exiting'.format(
+                                 checkpoint_name))
+                sys.exit()
+    # Check arguments.
+    assert args.consumed_train_samples == 0
+    assert args.consumed_valid_samples == 0
+    if 'args' in state_dict:
+        checkpoint_args = state_dict['args']
+        if not args.universal_checkpoint:
+            check_checkpoint_args(checkpoint_args)
+        # Carry the progress counters over from the checkpointed arguments.
+        args.consumed_train_samples = getattr(checkpoint_args,
+                                              'consumed_train_samples', 0)
+        update_num_microbatches(consumed_samples=args.consumed_train_samples)
+        args.consumed_valid_samples = getattr(checkpoint_args,
+                                              'consumed_valid_samples', 0)
+        args.gigaflos_no_embeds = getattr(checkpoint_args,
+                                          'gigaflos_no_embeds', 0)
+    else:
+        print_rank_0('could not find arguments in the checkpoint ...')
+    # Model.
+    if not args.deepspeed:
+        if len(model) == 1:
+            model[0].load_state_dict(state_dict['model'], strict=strict)
+        else:
+            for i in range(len(model)):
+                mpu.set_virtual_pipeline_model_parallel_rank(i)
+                model[i].load_state_dict(state_dict['model%d' % i], strict=strict)
+    # Fix up query/key/value matrix ordering if needed
+    checkpoint_version = get_checkpoint_version()
+    print_rank_0(f' checkpoint version {checkpoint_version}')
+    fix_query_key_value_ordering(model, checkpoint_version)
+    # Optimizer.
+    if not args.deepspeed:
+        if not release and not args.finetune and not args.no_load_optim:
+            try:
+                if optimizer is not None:
+                    optimizer.load_state_dict(state_dict['optimizer'])
+                if lr_scheduler is not None:
+                    lr_scheduler.load_state_dict(state_dict['lr_scheduler'])
+            except KeyError:
+                print_rank_0('Unable to load optimizer from checkpoint {}. '
+                            'Specify --no-load-optim or --finetune to prevent '
+                            'attempting to load the optimizer state, '
+                            'exiting ...'.format(checkpoint_name))
+                sys.exit()
+    # rng states.
+    if not release and not args.finetune and not args.no_load_rng:
+        try:
+            random.setstate(state_dict['random_rng_state'])
+            np.random.set_state(state_dict['np_rng_state'])
+            torch.set_rng_state(state_dict['torch_rng_state'])
+            torch.cuda.set_rng_state(state_dict['cuda_rng_state'])
+            # Check for empty states array
+            if not state_dict['rng_tracker_states']:
+                raise KeyError
+            mpu.get_cuda_rng_tracker().set_states(
+                state_dict['rng_tracker_states'])
+        except KeyError:
+            print_rank_0('Unable to load rng state from checkpoint {}. '
+                         'Specify --no-load-rng or --finetune to prevent '
+                         'attempting to load the rng state, '
+                         'exiting ...'.format(checkpoint_name))
+            sys.exit()
+    # Some utilities want to load a checkpoint without distributed being initialized
+    if torch.distributed.is_initialized():
+        torch.distributed.barrier()
+    print_rank_0(f'  successfully loaded checkpoint from {load_dir} '
+                 f'at iteration {iteration}')
+    return iteration
+def load_biencoder_checkpoint(model, only_query_model=False,
+        only_context_model=False, custom_load_path=None):
+    """Load the retrieval (biencoder) weights used for indexing/retrieving.
+    model: list holding exactly one (possibly wrapped) model.
+    only_query_model / only_context_model: drop the 'context_model' /
+        'query_model' entry of the saved state before loading.
+    custom_load_path: directory to load from; ``args.load`` is used when None.
+    Returns the unwrapped model list. The iteration to load is read from the
+    tracker file in the load directory.
+    """
+    args = get_args()
+    model = utils.unwrap_model(model)
+    if custom_load_path is None:
+        load_path = args.load
+    else:
+        load_path = custom_load_path
+    with open(get_checkpoint_tracker_filename(load_path), 'r') as tracker:
+        iteration = int(tracker.read().strip())
+    checkpoint_name = get_checkpoint_name(load_path, iteration, False)
+    if mpu.get_data_parallel_rank() == 0:
+        print('global rank {} is loading checkpoint {}'.format(
+            torch.distributed.get_rank(), checkpoint_name))
+    ret_state_dict = torch.load(checkpoint_name, map_location='cpu')['model']
+    # Remove the sub-model the caller did not ask for.
+    for only_other_requested, unwanted_key in (
+            (only_query_model, 'context_model'),
+            (only_context_model, 'query_model')):
+        if only_other_requested:
+            ret_state_dict.pop(unwanted_key)
+    assert len(model) == 1
+    model[0].load_state_dict(ret_state_dict)
+    torch.distributed.barrier()
+    if mpu.get_data_parallel_rank() == 0:
+        print(' successfully loaded {}'.format(checkpoint_name))
+    return model
+def _checkpoint_info():
+    """Return the vocabulary sizes stored alongside a checkpoint: the padded
+    size from the global args and the tokenizer's own vocab size."""
+    args = get_args()
+    tokenizer = get_tokenizer()
+    info = {}
+    info["padded_vocab_size"] = args.padded_vocab_size
+    info["original_vocab_size"] = tokenizer.vocab_size
+    return info
\ No newline at end of file
--- a/megatron-deepspeed_dtk22.10/megatron/data/Makefile
+++ b/megatron-deepspeed_dtk22.10/megatron/data/Makefile
+# Builds the `helpers` Python extension module from helpers.cpp.
+# Locate python3-config; the build aborts below when it cannot be found.
+PYTHON3CONFIG := $(shell command -v python3-config 2> /dev/null)
+ifndef PYTHON3CONFIG
+    $(error "python3-config is not available. Please install it. It may be in a python-dev or another package")
+endif
+# Optimized, position-independent shared object compiled as C++11.
+CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color
+# Include paths reported by pybind11.
+CPPFLAGS += $(shell python3 -m pybind11 --includes)
+LIBNAME = helpers
+# Extension-module file suffix reported by python3-config.
+LIBEXT = $(shell python3-config --extension-suffix)
+default: $(LIBNAME)$(LIBEXT)
+# Pattern rule: build <name>$(LIBEXT) from <name>.cpp.
+%$(LIBEXT): %.cpp
+	$(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
--- a/megatron-deepspeed_dtk22.10/megatron/data/__init__.py
+++ b/megatron-deepspeed_dtk22.10/megatron/data/__init__.py
+from . import indexed_dataset
--- a/megatron-deepspeed_dtk22.10/megatron/data/autoaugment.py
+++ b/megatron-deepspeed_dtk22.10/megatron/data/autoaugment.py
+"""AutoAugment data augmentation policy for ImageNet.
+-- Begin license text.
+MIT License
+Copyright (c) 2018 Philip Popien
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+-- End license text.
+Code adapted from https://github.com/DeepVoltaire/AutoAugment.
+This module implements the fixed AutoAugment data augmentation policy for ImageNet provided in
+Appendix A, Table 9 of reference [1]. It does not include any of the search code for augmentation
+policies.
+Reference:
+[1] https://arxiv.org/abs/1805.09501
+"""
+import random
+import numpy as np
+from PIL import Image
+from PIL import ImageEnhance
+from PIL import ImageOps
+_MAX_LEVEL = 10  # Maximum integer strength of an augmentation, if applicable.
+class ImageNetPolicy:
+    """Fixed AutoAugment policy for ImageNet training.
+    Each call picks one of the 25 pre-defined data augmentation sub-policies
+    provided in Reference [1] at random and applies it to the image.
+    Usage example as a Pytorch Transform:
+    >>> transform=transforms.Compose([transforms.Resize(256),
+    >>>                               ImageNetPolicy(),
+    >>>                               transforms.ToTensor()])
+    """
+    def __init__(self, fillcolor=(128, 128, 128)):
+        """Build the list of sub-policies.
+        Args:
+            fillcolor (tuple): RGB color components of the color to be used for
+            filling when needed (default: (128, 128, 128), which
+            corresponds to gray).
+        """
+        # One row per sub-policy:
+        # (operation1, probability1, magnitude_idx1,
+        #  operation2, probability2, magnitude_idx2).
+        # The two operations of a row are applied sequentially upon call.
+        sub_policy_specs = (
+            ("posterize", 0.4, 8, "rotate", 0.6, 9),
+            ("solarize", 0.6, 5, "autocontrast", 0.6, 5),
+            ("equalize", 0.8, 8, "equalize", 0.6, 3),
+            ("posterize", 0.6, 7, "posterize", 0.6, 6),
+            ("equalize", 0.4, 7, "solarize", 0.2, 4),
+            ("equalize", 0.4, 4, "rotate", 0.8, 8),
+            ("solarize", 0.6, 3, "equalize", 0.6, 7),
+            ("posterize", 0.8, 5, "equalize", 1.0, 2),
+            ("rotate", 0.2, 3, "solarize", 0.6, 8),
+            ("equalize", 0.6, 8, "posterize", 0.4, 6),
+            ("rotate", 0.8, 8, "color", 0.4, 0),
+            ("rotate", 0.4, 9, "equalize", 0.6, 2),
+            ("equalize", 0.0, 7, "equalize", 0.8, 8),
+            ("invert", 0.6, 4, "equalize", 1.0, 8),
+            ("color", 0.6, 4, "contrast", 1.0, 8),
+            ("rotate", 0.8, 8, "color", 1.0, 2),
+            ("color", 0.8, 8, "solarize", 0.8, 7),
+            ("sharpness", 0.4, 7, "invert", 0.6, 8),
+            ("shearX", 0.6, 5, "equalize", 1.0, 9),
+            ("color", 0.4, 0, "equalize", 0.6, 3),
+            ("equalize", 0.4, 7, "solarize", 0.2, 4),
+            ("solarize", 0.6, 5, "autocontrast", 0.6, 5),
+            ("invert", 0.6, 4, "equalize", 1.0, 8),
+            ("color", 0.6, 4, "contrast", 1.0, 8),
+            ("equalize", 0.8, 8, "equalize", 0.6, 3),
+        )
+        self.policies = [
+            SubPolicy(*spec, fillcolor) for spec in sub_policy_specs
+        ]
+    def __call__(self, img):
+        """Apply one randomly chosen sub-policy to ``img`` and return the result."""
+        chosen = self.policies[random.randrange(len(self.policies))]
+        return chosen(img)
+    def __repr__(self):
+        """Return the fixed name of this transform."""
+        return "ImageNetPolicy"
+class SubPolicy:
+    """Definition of a SubPolicy.
+    A SubPolicy consists of two augmentation operations,
+    each of those parametrized as operation, probability, magnitude.
+    The two operations are applied sequentially on the image upon call.
+    """
+    def __init__(
+        self,
+        operation1,
+        probability1,
+        magnitude_idx1,
+        operation2,
+        probability2,
+        magnitude_idx2,
+        fillcolor,
+    ):
+        """Initialize a SubPolicy.
+        Args:
+            operation1 (str): Key specifying the first augmentation operation.
+            There are fourteen key values altogether (see supported_ops below
+            listing supported operations). probability1 (float): Probability
+            within [0., 1.] of applying the first augmentation operation.
+            magnitude_idx1 (int): Integer specifiying the strength of the first
+            operation as an index further used to derive the magnitude from a
+            range of possible values.
+            operation2 (str): Key specifying the second augmentation operation.
+            probability2 (float): Probability within [0., 1.] of applying the
+            second augmentation operation.
+            magnitude_idx2 (int): Integer specifiying the strength of the
+            second operation as an index further used to derive the magnitude
+            from a range of possible values.
+            fillcolor (tuple): RGB color components of the color to be used for
+            filling.
+        Returns:
+        """
+        # List of supported operations for operation1 and operation2.
+        supported_ops = [
+            "shearX",
+            "shearY",
+            "translateX",
+            "translateY",
+            "rotate",
+            "color",
+            "posterize",
+            "solarize",
+            "contrast",
+            "sharpness",
+            "brightness",
+            "autocontrast",
+            "equalize",
+            "invert",
+        ]
+        assert (operation1 in supported_ops) and (
+            operation2 in supported_ops
+        ), "SubPolicy:one of oper1 or oper2 refers to an unsupported operation."
+        assert (
+            0.0 <= probability1 <= 1.0 and 0.0 <= probability2 <= 1.0
+        ), "SubPolicy: prob1 and prob2 should be within [0., 1.]."
+        assert (
+            isinstance(magnitude_idx1, int) and 0 <= magnitude_idx1 <= 10
+        ), "SubPolicy: idx1 should be specified as an integer within [0, 10]."
+        assert (
+            isinstance(magnitude_idx2, int) and 0 <= magnitude_idx2 <= 10
+        ), "SubPolicy: idx2 should be specified as an integer within [0, 10]."
+        # Define a dictionary where each key refers to a specific type of
+        # augmentation and the corresponding value is a range of ten possible
+        # magnitude values for that augmentation.
+        num_levels = _MAX_LEVEL + 1
+        ranges = {
+            "shearX": np.linspace(0, 0.3, num_levels),
+            "shearY": np.linspace(0, 0.3, num_levels),
+            "translateX": np.linspace(0, 150 / 331, num_levels),
+            "translateY": np.linspace(0, 150 / 331, num_levels),
+            "rotate": np.linspace(0, 30, num_levels),
+            "color": np.linspace(0.0, 0.9, num_levels),
+            "posterize": np.round(np.linspace(8, 4, num_levels), 0).astype(
+                np.int
+            ),
+            "solarize": np.linspace(256, 0, num_levels),  # range [0, 256]
+            "contrast": np.linspace(0.0, 0.9, num_levels),
+            "sharpness": np.linspace(0.0, 0.9, num_levels),
+            "brightness": np.linspace(0.0, 0.9, num_levels),
+            "autocontrast": [0]
+            * num_levels,  # This augmentation doesn't use magnitude parameter.
+            "equalize": [0]
+            * num_levels,  # This augmentation doesn't use magnitude parameter.
+            "invert": [0]
+            * num_levels,  # This augmentation doesn't use magnitude parameter.
+        }
+        def rotate_with_fill(img, magnitude):
+            """Define rotation transformation with fill.
+            The input image is first rotated, then it is blended together with
+            a gray mask of the same size. Note that fillcolor as defined
+            elsewhere in this module doesn't apply here.
+            Args:
+                magnitude (float): rotation angle in degrees.
+            Returns:
+                rotated_filled (PIL Image): rotated image with gray filling for
+                disoccluded areas unveiled by the rotation.
+            """
+            rotated = img.convert("RGBA").rotate(magnitude)
+            rotated_filled = Image.composite(
+                rotated, Image.new("RGBA", rotated.size, (128,) * 4), rotated
+            )
+            return rotated_filled.convert(img.mode)
+        # Define a dictionary of augmentation functions where each key refers
+        # to a specific type of augmentation and the corresponding value defines
+        # the augmentation itself using a lambda function.
+        # pylint: disable=unnecessary-lambda
+        func_dict = {
+            "shearX": lambda img, magnitude: img.transform(
+                img.size,
+                Image.AFFINE,
+                (1, magnitude * random.choice([-1, 1]), 0, 0, 1, 0),
+                Image.BICUBIC,
+                fillcolor=fillcolor,
+            ),
+            "shearY": lambda img, magnitude: img.transform(
+                img.size,
+                Image.AFFINE,
+                (1, 0, 0, magnitude * random.choice([-1, 1]), 1, 0),
+                Image.BICUBIC,
+                fillcolor=fillcolor,
+            ),
+            "translateX": lambda img, magnitude: img.transform(
+                img.size,
+                Image.AFFINE,
+                (
+                    1,
+                    0,
+                    magnitude * img.size[0] * random.choice([-1, 1]),
+                    0,
+                    1,
+                    0,
+                ),
+                fillcolor=fillcolor,
+            ),
+            "translateY": lambda img, magnitude: img.transform(
+                img.size,
+                Image.AFFINE,
+                (
+                    1,
+                    0,
+                    0,
+                    0,
+                    1,
+                    magnitude * img.size[1] * random.choice([-1, 1]),
+                ),
+                fillcolor=fillcolor,
+            ),
+            "rotate": lambda img, magnitude: rotate_with_fill(img, magnitude),
+            "color": lambda img, magnitude: ImageEnhance.Color(img).enhance(
+                1 + magnitude * random.choice([-1, 1])
+            ),
+            "posterize": lambda img, magnitude: ImageOps.posterize(
+                img, magnitude
+            ),
+            "solarize": lambda img, magnitude: ImageOps.solarize(
+                img, magnitude
+            ),
+            "contrast": lambda img, magnitude: ImageEnhance.Contrast(
+                img
+            ).enhance(1 + magnitude * random.choice([-1, 1])),
+            "sharpness": lambda img, magnitude: ImageEnhance.Sharpness(
+                img
+            ).enhance(1 + magnitude * random.choice([-1, 1])),
+            "brightness": lambda img, magnitude: ImageEnhance.Brightness(
+                img
+            ).enhance(1 + magnitude * random.choice([-1, 1])),
+            "autocontrast": lambda img, magnitude: ImageOps.autocontrast(img),
+            "equalize": lambda img, magnitude: ImageOps.equalize(img),
+            "invert": lambda img, magnitude: ImageOps.invert(img),
+        }
+        # Store probability, function and magnitude of the first augmentation
+        # for the sub-policy.
+        self.probability1 = probability1
+        self.operation1 = func_dict[operation1]
+        self.magnitude1 = ranges[operation1][magnitude_idx1]
+        # Store probability, function and magnitude of the second augmentation
+        # for the sub-policy.
+        self.probability2 = probability2
+        self.operation2 = func_dict[operation2]
+        self.magnitude2 = ranges[operation2][magnitude_idx2]
+    def __call__(self, img):
+        """Define call method for SubPolicy class."""
+        # Randomly apply operation 1.
+        if random.random() < self.probability1:
+            img = self.operation1(img, self.magnitude1)
+        # Randomly apply operation 2.
+        if random.random() < self.probability2:
+            img = self.operation2(img, self.magnitude2)
+        return img
--- a/megatron-deepspeed_dtk22.10/megatron/data/bert_dataset.py
+++ b/megatron-deepspeed_dtk22.10/megatron/data/bert_dataset.py
--- a/megatron-deepspeed_dtk22.10/megatron/data/biencoder_dataset_utils.py
+++ b/megatron-deepspeed_dtk22.10/megatron/data/biencoder_dataset_utils.py
--- a/megatron-deepspeed_dtk22.10/megatron/data/blendable_dataset.py
+++ b/megatron-deepspeed_dtk22.10/megatron/data/blendable_dataset.py
--- a/megatron-deepspeed_dtk22.10/megatron/data/data_samplers.py
+++ b/megatron-deepspeed_dtk22.10/megatron/data/data_samplers.py
--- a/megatron-deepspeed_dtk22.10/megatron/data/dataset_utils.py
+++ b/megatron-deepspeed_dtk22.10/megatron/data/dataset_utils.py
--- a/megatron-deepspeed_dtk22.10/megatron/data/decoder_packed_mtf_dataset.py
+++ b/megatron-deepspeed_dtk22.10/megatron/data/decoder_packed_mtf_dataset.py
--- a/megatron-deepspeed_dtk22.10/megatron/data/distdata.py
+++ b/megatron-deepspeed_dtk22.10/megatron/data/distdata.py
--- a/megatron-deepspeed_dtk22.10/megatron/data/gpt_dataset.py
+++ b/megatron-deepspeed_dtk22.10/megatron/data/gpt_dataset.py
--- a/megatron-deepspeed_dtk22.10/megatron/data/helpers.cpp
+++ b/megatron-deepspeed_dtk22.10/megatron/data/helpers.cpp