Commit 996aca68 authored by Jared Casper

Merge branch 'vision_transformer' into 'main'

Vision transformer

See merge request ADLR/megatron-lm!204
parents c0400306 85589322
...@@ -40,6 +40,7 @@ def parse_args(extra_args_provider=None, defaults={}, ...@@ -40,6 +40,7 @@ def parse_args(extra_args_provider=None, defaults={},
parser = _add_data_args(parser) parser = _add_data_args(parser)
parser = _add_autoresume_args(parser) parser = _add_autoresume_args(parser)
parser = _add_realm_args(parser) parser = _add_realm_args(parser)
parser = _add_vit_args(parser)
# Custom arguments. # Custom arguments.
if extra_args_provider is not None: if extra_args_provider is not None:
...@@ -123,6 +124,9 @@ def parse_args(extra_args_provider=None, defaults={}, ...@@ -123,6 +124,9 @@ def parse_args(extra_args_provider=None, defaults={},
print('using {} for parameters ...'.format(args.params_dtype), print('using {} for parameters ...'.format(args.params_dtype),
flush=True) flush=True)
if args.dataloader_type is None:
args.dataloader_type = 'single'
# Consumed tokens. # Consumed tokens.
args.consumed_train_samples = 0 args.consumed_train_samples = 0
args.consumed_valid_samples = 0 args.consumed_valid_samples = 0
...@@ -289,6 +293,8 @@ def _add_regularization_args(parser): ...@@ -289,6 +293,8 @@ def _add_regularization_args(parser):
group.add_argument('--adam-eps', type=float, default=1e-08, group.add_argument('--adam-eps', type=float, default=1e-08,
help='Term added to the denominator to improve' help='Term added to the denominator to improve'
'numerical stability') 'numerical stability')
group.add_argument('--sgd-momentum', type=float, default=0.9,
help='Momentum factor for sgd')
return parser return parser
...@@ -359,7 +365,12 @@ def _add_training_args(parser): ...@@ -359,7 +365,12 @@ def _add_training_args(parser):
group.add_argument('--no-bias-dropout-fusion', action='store_false', group.add_argument('--no-bias-dropout-fusion', action='store_false',
help='Disable bias and dropout fusion.', help='Disable bias and dropout fusion.',
dest='bias_dropout_fusion') dest='bias_dropout_fusion')
group.add_argument('--optimizer', type=str, default='adam',
choices=['adam', 'sgd'],
help='Optimizer function')
group.add_argument('--dataloader-type', type=str, default=None,
choices=['single', 'cyclic'],
help='Single pass vs multiple pass data loader')
return parser return parser
...@@ -372,6 +383,8 @@ def _add_initialization_args(parser): ...@@ -372,6 +383,8 @@ def _add_initialization_args(parser):
group.add_argument('--init-method-std', type=float, default=0.02, group.add_argument('--init-method-std', type=float, default=0.02,
help='Standard deviation of the zero mean normal ' help='Standard deviation of the zero mean normal '
'distribution used for weight initialization.') 'distribution used for weight initialization.')
group.add_argument('--init-method-xavier-uniform', action='store_true',
help='Enable Xavier uniform parameter initialization')
return parser return parser
...@@ -625,3 +638,18 @@ def _add_realm_args(parser): ...@@ -625,3 +638,18 @@ def _add_realm_args(parser):
group.add_argument('--indexer-log-interval', type=int, default=1000, group.add_argument('--indexer-log-interval', type=int, default=1000,
help='After how many batches should the indexer report progress') help='After how many batches should the indexer report progress')
return parser return parser
def _add_vit_args(parser):
group = parser.add_argument_group(title="vit")
group.add_argument('--num-classes', type=int, default=1000,
                   help='Number of classes in the vision classification task')
group.add_argument('--img-dim', type=int, default=224,
                   help='Image size for vision classification task')
group.add_argument('--num-channels', type=int, default=3,
                   help='Number of channels in input image data')
group.add_argument('--patch-dim', type=int, default=16,
                   help='Patch dimension used in ViT')
return parser
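The new `--num-classes`, `--img-dim`, `--num-channels`, and `--patch-dim` flags set the ViT geometry used throughout this change. A minimal standalone sketch of the same argument group with a bare argparse parser (not the real `parse_args` entry point) shows how the default 224/16 geometry translates into patches:

```python
# Minimal sketch mirroring _add_vit_args with a bare argparse parser.
import argparse

parser = argparse.ArgumentParser()
group = parser.add_argument_group(title="vit")
group.add_argument('--num-classes', type=int, default=1000)
group.add_argument('--img-dim', type=int, default=224)
group.add_argument('--num-channels', type=int, default=3)
group.add_argument('--patch-dim', type=int, default=16)

args = parser.parse_args([])            # take the defaults
patches_per_dim = args.img_dim // args.patch_dim
print(patches_per_dim ** 2 + 1)         # 14*14 patches + 1 [CLS] token = 197 positions
```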
...@@ -60,9 +60,10 @@ def check_checkpoint_args(checkpoint_args): ...@@ -60,9 +60,10 @@ def check_checkpoint_args(checkpoint_args):
_compare('hidden_size') _compare('hidden_size')
_compare('num_attention_heads') _compare('num_attention_heads')
_compare('max_position_embeddings') _compare('max_position_embeddings')
if args.vocab_file:
    _compare('make_vocab_size_divisible_by')
    _compare('padded_vocab_size')
    _compare('tokenizer_type')
if get_checkpoint_version() < 3.0: if get_checkpoint_version() < 3.0:
_compare('tensor_model_parallel_size', _compare('tensor_model_parallel_size',
old_arg_name='model_parallel_size') old_arg_name='model_parallel_size')
...@@ -163,8 +164,12 @@ def save_checkpoint(iteration, model, optimizer, lr_scheduler): ...@@ -163,8 +164,12 @@ def save_checkpoint(iteration, model, optimizer, lr_scheduler):
torch.distributed.barrier() torch.distributed.barrier()
def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True):
    """Load a model checkpoint and return the iteration.

    strict (bool): whether to strictly enforce that the keys in
        :attr:`state_dict` of the checkpoint match the names of
        parameters and buffers in model.
    """
args = get_args() args = get_args()
load_dir = getattr(args, load_arg) load_dir = getattr(args, load_arg)
...@@ -254,7 +259,7 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load'): ...@@ -254,7 +259,7 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load'):
print_rank_0('could not find arguments in the checkpoint ...') print_rank_0('could not find arguments in the checkpoint ...')
# Model. # Model.
model.load_state_dict(state_dict['model']) model.load_state_dict(state_dict['model'], strict=strict)
# Optimizer. # Optimizer.
if not release and not args.finetune and not args.no_load_optim: if not release and not args.finetune and not args.no_load_optim:
......
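The new `strict` parameter is what the vision finetuning path relies on: `finetune_utils.py` below calls `load_checkpoint(model, None, None, strict=False)` so a pretraining checkpoint can be loaded into a model whose classification head differs. A toy sketch of the underlying `load_state_dict` behavior (toy modules only; real Megatron checkpoints also carry optimizer, RNG, and args state):

```python
# Toy sketch of strict=False: matching keys load, mismatches are reported.
import torch

backbone_ckpt = torch.nn.Linear(8, 8).state_dict()

model = torch.nn.ModuleDict({
    "backbone": torch.nn.Linear(8, 8),
    "class_head": torch.nn.Linear(8, 3),   # new head, absent from the checkpoint
})

missing, unexpected = model.load_state_dict(
    {"backbone." + k: v for k, v in backbone_ckpt.items()}, strict=False)
print(missing)      # ['class_head.weight', 'class_head.bias']
print(unexpected)   # []
```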
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
"""AutoAugment data augmentation policy for ImageNet.
Implements the fixed AutoAugment data augmentation policy for ImageNet
provided in Appendix A, Table 9 in reference [1]. Does not include any
of the search code.
Reference:
[1] https://arxiv.org/abs/1805.09501
Code adapted from:
https://github.com/DeepVoltaire/AutoAugment
"""
import random
import numpy as np
from PIL import Image
from PIL import ImageEnhance
from PIL import ImageOps
_MAX_LEVEL = 10 # Maximum integer strength of an augmentation, if applicable.
class ImageNetPolicy:
"""Definition of an ImageNetPolicy.
Implements a fixed AutoAugment data augmentation policy targeted at
ImageNet training by randomly applying at runtime one of the 25 pre-defined
data augmentation sub-policies provided in Reference [1].
Usage example as a PyTorch Transform:
>>> transform = transforms.Compose([transforms.Resize(256),
...                                 ImageNetPolicy(),
...                                 transforms.ToTensor()])
"""
def __init__(self, fillcolor=(128, 128, 128)):
"""Initialize an ImageNetPolicy.
Args:
fillcolor (tuple): RGB color components of the color to be used for
filling when needed (default: (128, 128, 128), which
corresponds to gray).
"""
# Instantiate a list of sub-policies.
# Each entry of the list is a SubPolicy which consists of
# two augmentation operations,
# each of those parametrized as operation, probability, magnitude.
# Those two operations are applied sequentially on the image upon call.
self.policies = [
SubPolicy("posterize", 0.4, 8, "rotate", 0.6, 9, fillcolor),
SubPolicy("solarize", 0.6, 5, "autocontrast", 0.6, 5, fillcolor),
SubPolicy("equalize", 0.8, 8, "equalize", 0.6, 3, fillcolor),
SubPolicy("posterize", 0.6, 7, "posterize", 0.6, 6, fillcolor),
SubPolicy("equalize", 0.4, 7, "solarize", 0.2, 4, fillcolor),
SubPolicy("equalize", 0.4, 4, "rotate", 0.8, 8, fillcolor),
SubPolicy("solarize", 0.6, 3, "equalize", 0.6, 7, fillcolor),
SubPolicy("posterize", 0.8, 5, "equalize", 1.0, 2, fillcolor),
SubPolicy("rotate", 0.2, 3, "solarize", 0.6, 8, fillcolor),
SubPolicy("equalize", 0.6, 8, "posterize", 0.4, 6, fillcolor),
SubPolicy("rotate", 0.8, 8, "color", 0.4, 0, fillcolor),
SubPolicy("rotate", 0.4, 9, "equalize", 0.6, 2, fillcolor),
SubPolicy("equalize", 0.0, 7, "equalize", 0.8, 8, fillcolor),
SubPolicy("invert", 0.6, 4, "equalize", 1.0, 8, fillcolor),
SubPolicy("color", 0.6, 4, "contrast", 1.0, 8, fillcolor),
SubPolicy("rotate", 0.8, 8, "color", 1.0, 2, fillcolor),
SubPolicy("color", 0.8, 8, "solarize", 0.8, 7, fillcolor),
SubPolicy("sharpness", 0.4, 7, "invert", 0.6, 8, fillcolor),
SubPolicy("shearX", 0.6, 5, "equalize", 1.0, 9, fillcolor),
SubPolicy("color", 0.4, 0, "equalize", 0.6, 3, fillcolor),
SubPolicy("equalize", 0.4, 7, "solarize", 0.2, 4, fillcolor),
SubPolicy("solarize", 0.6, 5, "autocontrast", 0.6, 5, fillcolor),
SubPolicy("invert", 0.6, 4, "equalize", 1.0, 8, fillcolor),
SubPolicy("color", 0.6, 4, "contrast", 1.0, 8, fillcolor),
SubPolicy("equalize", 0.8, 8, "equalize", 0.6, 3, fillcolor),
]
def __call__(self, img):
"""Define call method for ImageNetPolicy class."""
policy_idx = random.randint(0, len(self.policies) - 1)
return self.policies[policy_idx](img)
def __repr__(self):
"""Define repr method for ImageNetPolicy class."""
return "ImageNetPolicy"
class SubPolicy:
"""Definition of a SubPolicy.
A SubPolicy consists of two augmentation operations,
each of those parametrized as operation, probability, magnitude.
The two operations are applied sequentially on the image upon call.
"""
def __init__(
self,
operation1,
probability1,
magnitude_idx1,
operation2,
probability2,
magnitude_idx2,
fillcolor,
):
"""Initialize a SubPolicy.
Args:
operation1 (str): Key specifying the first augmentation operation.
    There are fourteen supported keys (see supported_ops below for
    the list of supported operations).
probability1 (float): Probability within [0., 1.] of applying the
    first augmentation operation.
magnitude_idx1 (int): Integer specifying the strength of the first
    operation as an index used to derive the magnitude from a range
    of possible values.
operation2 (str): Key specifying the second augmentation operation.
probability2 (float): Probability within [0., 1.] of applying the
    second augmentation operation.
magnitude_idx2 (int): Integer specifying the strength of the second
    operation as an index used to derive the magnitude from a range
    of possible values.
fillcolor (tuple): RGB color components of the color to be used for
    filling.
"""
# List of supported operations for operation1 and operation2.
supported_ops = [
"shearX",
"shearY",
"translateX",
"translateY",
"rotate",
"color",
"posterize",
"solarize",
"contrast",
"sharpness",
"brightness",
"autocontrast",
"equalize",
"invert",
]
assert (operation1 in supported_ops) and (
operation2 in supported_ops
), "SubPolicy:one of oper1 or oper2 refers to an unsupported operation."
assert (
0.0 <= probability1 <= 1.0 and 0.0 <= probability2 <= 1.0
), "SubPolicy: prob1 and prob2 should be within [0., 1.]."
assert (
isinstance(magnitude_idx1, int) and 0 <= magnitude_idx1 <= 10
), "SubPolicy: idx1 should be specified as an integer within [0, 10]."
assert (
isinstance(magnitude_idx2, int) and 0 <= magnitude_idx2 <= 10
), "SubPolicy: idx2 should be specified as an integer within [0, 10]."
# Define a dictionary where each key refers to a specific type of
# augmentation and the corresponding value is a range of ten possible
# magnitude values for that augmentation.
num_levels = _MAX_LEVEL + 1
ranges = {
"shearX": np.linspace(0, 0.3, num_levels),
"shearY": np.linspace(0, 0.3, num_levels),
"translateX": np.linspace(0, 150 / 331, num_levels),
"translateY": np.linspace(0, 150 / 331, num_levels),
"rotate": np.linspace(0, 30, num_levels),
"color": np.linspace(0.0, 0.9, num_levels),
"posterize": np.round(np.linspace(8, 4, num_levels), 0).astype(
np.int
),
"solarize": np.linspace(256, 0, num_levels), # range [0, 256]
"contrast": np.linspace(0.0, 0.9, num_levels),
"sharpness": np.linspace(0.0, 0.9, num_levels),
"brightness": np.linspace(0.0, 0.9, num_levels),
"autocontrast": [0]
* num_levels, # This augmentation doesn't use magnitude parameter.
"equalize": [0]
* num_levels, # This augmentation doesn't use magnitude parameter.
"invert": [0]
* num_levels, # This augmentation doesn't use magnitude parameter.
}
def rotate_with_fill(img, magnitude):
"""Define rotation transformation with fill.
The input image is first rotated, then it is blended together with
a gray mask of the same size. Note that fillcolor as defined
elsewhere in this module doesn't apply here.
Args:
magnitude (float): rotation angle in degrees.
Returns:
rotated_filled (PIL Image): rotated image with gray filling for
disoccluded areas unveiled by the rotation.
"""
rotated = img.convert("RGBA").rotate(magnitude)
rotated_filled = Image.composite(
rotated, Image.new("RGBA", rotated.size, (128,) * 4), rotated
)
return rotated_filled.convert(img.mode)
# Define a dictionary of augmentation functions where each key refers
# to a specific type of augmentation and the corresponding value defines
# the augmentation itself using a lambda function.
# pylint: disable=unnecessary-lambda
func_dict = {
"shearX": lambda img, magnitude: img.transform(
img.size,
Image.AFFINE,
(1, magnitude * random.choice([-1, 1]), 0, 0, 1, 0),
Image.BICUBIC,
fillcolor=fillcolor,
),
"shearY": lambda img, magnitude: img.transform(
img.size,
Image.AFFINE,
(1, 0, 0, magnitude * random.choice([-1, 1]), 1, 0),
Image.BICUBIC,
fillcolor=fillcolor,
),
"translateX": lambda img, magnitude: img.transform(
img.size,
Image.AFFINE,
(
1,
0,
magnitude * img.size[0] * random.choice([-1, 1]),
0,
1,
0,
),
fillcolor=fillcolor,
),
"translateY": lambda img, magnitude: img.transform(
img.size,
Image.AFFINE,
(
1,
0,
0,
0,
1,
magnitude * img.size[1] * random.choice([-1, 1]),
),
fillcolor=fillcolor,
),
"rotate": lambda img, magnitude: rotate_with_fill(img, magnitude),
"color": lambda img, magnitude: ImageEnhance.Color(img).enhance(
1 + magnitude * random.choice([-1, 1])
),
"posterize": lambda img, magnitude: ImageOps.posterize(
img, magnitude
),
"solarize": lambda img, magnitude: ImageOps.solarize(
img, magnitude
),
"contrast": lambda img, magnitude: ImageEnhance.Contrast(
img
).enhance(1 + magnitude * random.choice([-1, 1])),
"sharpness": lambda img, magnitude: ImageEnhance.Sharpness(
img
).enhance(1 + magnitude * random.choice([-1, 1])),
"brightness": lambda img, magnitude: ImageEnhance.Brightness(
img
).enhance(1 + magnitude * random.choice([-1, 1])),
"autocontrast": lambda img, magnitude: ImageOps.autocontrast(img),
"equalize": lambda img, magnitude: ImageOps.equalize(img),
"invert": lambda img, magnitude: ImageOps.invert(img),
}
# Store probability, function and magnitude of the first augmentation
# for the sub-policy.
self.probability1 = probability1
self.operation1 = func_dict[operation1]
self.magnitude1 = ranges[operation1][magnitude_idx1]
# Store probability, function and magnitude of the second augmentation
# for the sub-policy.
self.probability2 = probability2
self.operation2 = func_dict[operation2]
self.magnitude2 = ranges[operation2][magnitude_idx2]
def __call__(self, img):
"""Define call method for SubPolicy class."""
# Randomly apply operation 1.
if random.random() < self.probability1:
img = self.operation1(img, self.magnitude1)
# Randomly apply operation 2.
if random.random() < self.probability2:
img = self.operation2(img, self.magnitude2)
return img
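`ImageNetPolicy` is a plain callable over PIL images, so it can sit anywhere in a torchvision pipeline before `ToTensor` (as `vit_dataset.py` below does). A small usage sketch on a synthetic image:

```python
# Usage sketch: apply the fixed AutoAugment policy to a synthetic PIL image.
from PIL import Image
from torchvision import transforms

policy = ImageNetPolicy()                   # one of the 25 sub-policies is drawn per call
img = Image.new("RGB", (256, 256), (90, 120, 200))

augmented = policy(img)                     # still a PIL image
tensor = transforms.ToTensor()(augmented)   # shape [3, 256, 256], values in [0, 1]
print(repr(policy), tensor.shape)
```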
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
import torch import torch
import random
from megatron import get_args from megatron import get_args
from megatron import mpu from megatron import mpu
...@@ -30,12 +30,23 @@ def build_pretraining_data_loader(dataset, consumed_samples): ...@@ -30,12 +30,23 @@ def build_pretraining_data_loader(dataset, consumed_samples):
args = get_args() args = get_args()
# Megatron sampler # Megatron sampler
if args.dataloader_type == 'single':
    batch_sampler = MegatronPretrainingSampler(
        total_samples=len(dataset),
        consumed_samples=consumed_samples,
        micro_batch_size=args.micro_batch_size,
        data_parallel_rank=mpu.get_data_parallel_rank(),
        data_parallel_size=mpu.get_data_parallel_world_size())
elif args.dataloader_type == 'cyclic':
    batch_sampler = MegatronPretrainingRandomSampler(
        total_samples=len(dataset),
        consumed_samples=consumed_samples,
        micro_batch_size=args.micro_batch_size,
        data_parallel_rank=mpu.get_data_parallel_rank(),
        data_parallel_size=mpu.get_data_parallel_world_size())
else:
    raise Exception('{} dataloader type is not supported.'.format(
        args.dataloader_type))
# Torch dataloader. # Torch dataloader.
return torch.utils.data.DataLoader(dataset, return torch.utils.data.DataLoader(dataset,
...@@ -43,10 +54,8 @@ def build_pretraining_data_loader(dataset, consumed_samples): ...@@ -43,10 +54,8 @@ def build_pretraining_data_loader(dataset, consumed_samples):
num_workers=args.num_workers, num_workers=args.num_workers,
pin_memory=True) pin_memory=True)
class MegatronPretrainingSampler: class MegatronPretrainingSampler:
def __init__(self, total_samples, consumed_samples, micro_batch_size, def __init__(self, total_samples, consumed_samples, micro_batch_size,
data_parallel_rank, data_parallel_size): data_parallel_rank, data_parallel_size):
# Keep a copy of input params for later use. # Keep a copy of input params for later use.
...@@ -54,8 +63,8 @@ class MegatronPretrainingSampler: ...@@ -54,8 +63,8 @@ class MegatronPretrainingSampler:
self.consumed_samples = consumed_samples self.consumed_samples = consumed_samples
self.micro_batch_size = micro_batch_size self.micro_batch_size = micro_batch_size
self.data_parallel_rank = data_parallel_rank self.data_parallel_rank = data_parallel_rank
self.micro_batch_times_data_parallel_size = self.micro_batch_size * \ self.micro_batch_times_data_parallel_size = \
data_parallel_size self.micro_batch_size * data_parallel_size
# Sanity checks. # Sanity checks.
assert self.total_samples > 0, \ assert self.total_samples > 0, \
...@@ -69,11 +78,9 @@ class MegatronPretrainingSampler: ...@@ -69,11 +78,9 @@ class MegatronPretrainingSampler:
'data_parallel_rank should be smaller than data size: {}, ' \ 'data_parallel_rank should be smaller than data size: {}, ' \
'{}'.format(self.data_parallel_rank, data_parallel_size) '{}'.format(self.data_parallel_rank, data_parallel_size)
def __len__(self): def __len__(self):
return self.total_samples return self.total_samples
def __iter__(self): def __iter__(self):
batch = [] batch = []
# Last batch if not complete will be dropped. # Last batch if not complete will be dropped.
...@@ -84,3 +91,57 @@ class MegatronPretrainingSampler: ...@@ -84,3 +91,57 @@ class MegatronPretrainingSampler:
end_idx = start_idx + self.micro_batch_size end_idx = start_idx + self.micro_batch_size
yield batch[start_idx:end_idx] yield batch[start_idx:end_idx]
batch = [] batch = []
class MegatronPretrainingRandomSampler:
def __init__(self, total_samples, consumed_samples, micro_batch_size,
data_parallel_rank, data_parallel_size):
# Keep a copy of input params for later use.
self.total_samples = total_samples
self.consumed_samples = consumed_samples
self.micro_batch_size = micro_batch_size
self.data_parallel_rank = data_parallel_rank
self.data_parallel_size = data_parallel_size
self.micro_batch_times_data_parallel_size = \
self.micro_batch_size * data_parallel_size
self.last_batch_size = \
self.total_samples % self.micro_batch_times_data_parallel_size
# Sanity checks.
assert self.total_samples > 0, \
'no sample to consume: {}'.format(self.total_samples)
assert self.micro_batch_size > 0
assert data_parallel_size > 0
assert self.data_parallel_rank < data_parallel_size, \
'data_parallel_rank should be smaller than data size: {}, ' \
'{}'.format(self.data_parallel_rank, data_parallel_size)
def __len__(self):
return self.total_samples
def __iter__(self):
active_total_samples = self.total_samples - self.last_batch_size
self.epoch = self.consumed_samples // active_total_samples
current_epoch_samples = self.consumed_samples % active_total_samples
assert current_epoch_samples % self.micro_batch_times_data_parallel_size == 0
# data sharding and random sampling
bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) \
* self.micro_batch_size
bucket_offset = current_epoch_samples // self.data_parallel_size
start_idx = self.data_parallel_rank * bucket_size
g = torch.Generator()
g.manual_seed(self.epoch)
random_idx = torch.randperm(bucket_size, generator=g).tolist()
idx_range = [start_idx + x for x in random_idx[bucket_offset:]]
batch = []
# Last batch if not complete will be dropped.
for idx in idx_range:
batch.append(idx)
if len(batch) == self.micro_batch_size:
self.consumed_samples += self.micro_batch_times_data_parallel_size
yield batch
batch = []
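`MegatronPretrainingRandomSampler` yields one micro-batch of indices per step, so it is passed to `DataLoader` as `batch_sampler`, exactly as `build_pretraining_data_loader` does above. A standalone sketch with a single data-parallel rank (so `mpu` is not needed):

```python
# Sketch: drive a DataLoader with the random sampler, data-parallel size 1.
import torch

dataset = torch.utils.data.TensorDataset(torch.arange(10).float())

batch_sampler = MegatronPretrainingRandomSampler(
    total_samples=len(dataset),
    consumed_samples=0,
    micro_batch_size=4,
    data_parallel_rank=0,
    data_parallel_size=1)

loader = torch.utils.data.DataLoader(dataset, batch_sampler=batch_sampler)
for (batch,) in loader:
    print(batch)   # two shuffled micro-batches of 4; the trailing 2 samples are dropped
```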
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import torch
from torchvision import datasets, transforms
from megatron.data.autoaugment import ImageNetPolicy
def build_train_valid_datasets(data_path, crop_size=224, color_jitter=True):
# training dataset
train_data_path = os.path.join(data_path[0], "train")
normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
process = [
transforms.RandomResizedCrop(crop_size),
transforms.RandomHorizontalFlip(),
]
if color_jitter:
process += [
transforms.ColorJitter(
brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1
)
]
fp16_t = transforms.ConvertImageDtype(torch.half)
process += [ImageNetPolicy(), transforms.ToTensor(), normalize, fp16_t]
transform_train = transforms.Compose(process)
train_data = datasets.ImageFolder(
root=train_data_path, transform=transform_train
)
# validation dataset
val_data_path = os.path.join(data_path[0], "val")
transform_val = transforms.Compose(
[
transforms.Resize(crop_size),
transforms.CenterCrop(crop_size),
transforms.ToTensor(),
normalize,
fp16_t
]
)
val_data = datasets.ImageFolder(
root=val_data_path, transform=transform_val
)
return train_data, val_data
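`build_train_valid_datasets` returns ordinary `ImageFolder` datasets, so individual samples can be inspected directly. A hedged usage sketch, assuming an ImageNet-style layout (`<root>/train/<class>/*.jpg`, `<root>/val/<class>/*.jpg`); note that the trailing `ConvertImageDtype(torch.half)` is what satisfies the `assert x.dtype == torch.half` in `VitModel.forward` further down:

```python
# Usage sketch; the path below is hypothetical and must point at an
# ImageNet-style directory tree.
train_ds, val_ds = build_train_valid_datasets(
    data_path=["/path/to/imagenet"],
    crop_size=224,
    color_jitter=True)

image, label = train_ds[0]
print(image.shape, image.dtype, label)   # torch.Size([3, 224, 224]) torch.float16 <class index>
```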
...@@ -83,7 +83,8 @@ def set_global_variables(extra_args_provider=None, args_defaults={}, ...@@ -83,7 +83,8 @@ def set_global_variables(extra_args_provider=None, args_defaults={},
defaults=args_defaults, defaults=args_defaults,
ignore_unknown_args=ignore_unknown_args) ignore_unknown_args=ignore_unknown_args)
_build_num_microbatches_calculator(args) _build_num_microbatches_calculator(args)
if args.vocab_file:
    _ = _build_tokenizer(args)
_set_tensorboard_writer(args) _set_tensorboard_writer(args)
_set_adlr_autoresume(args) _set_adlr_autoresume(args)
_set_timers() _set_timers()
...@@ -131,7 +132,7 @@ def _set_tensorboard_writer(args): ...@@ -131,7 +132,7 @@ def _set_tensorboard_writer(args):
'tensorboard writer') 'tensorboard writer')
if hasattr(args, 'tensorboard_dir') and \ if hasattr(args, 'tensorboard_dir') and \
args.tensorboard_dir and args.rank == (args.world_size -1): args.tensorboard_dir and args.rank == (args.world_size - 1):
try: try:
from torch.utils.tensorboard import SummaryWriter from torch.utils.tensorboard import SummaryWriter
print('> setting tensorboard ...') print('> setting tensorboard ...')
......
...@@ -30,6 +30,7 @@ def import_layernorm(fp32_residual_connection): ...@@ -30,6 +30,7 @@ def import_layernorm(fp32_residual_connection):
from .distributed import * from .distributed import *
from .vit_model import VitModel
from .bert_model import (BertModel, from .bert_model import (BertModel,
BertModelFirstStage, BertModelFirstStage,
BertModelIntermediateStage, BertModelIntermediateStage,
......
...@@ -19,76 +19,89 @@ from megatron.model.enums import AttnMaskType ...@@ -19,76 +19,89 @@ from megatron.model.enums import AttnMaskType
class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function): class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
""" """
Fused operation which performs following three operations in sequence Fused operation which performs following three operations in sequence
1. Scale the tensor. 1. Scale the tensor.
2. Apply upper triangular mask (typically used in gpt models). 2. Apply upper triangular mask (typically used in gpt models).
3. Perform softmax. 3. Perform softmax.
""" """
@staticmethod @staticmethod
def forward(ctx, inputs, scale): def forward(ctx, inputs, scale):
import scaled_upper_triang_masked_softmax_cuda import scaled_upper_triang_masked_softmax_cuda
scale_t = torch.tensor([scale]) scale_t = torch.tensor([scale])
softmax_results = scaled_upper_triang_masked_softmax_cuda.forward(
    inputs, scale_t[0]
)
ctx.save_for_backward(softmax_results, scale_t) ctx.save_for_backward(softmax_results, scale_t)
return softmax_results return softmax_results
@staticmethod @staticmethod
def backward(ctx, output_grads): def backward(ctx, output_grads):
import scaled_upper_triang_masked_softmax_cuda import scaled_upper_triang_masked_softmax_cuda
softmax_results, scale_t = ctx.saved_tensors softmax_results, scale_t = ctx.saved_tensors
input_grads = scaled_upper_triang_masked_softmax_cuda.backward(
    output_grads, softmax_results, scale_t[0]
)
return input_grads, None return input_grads, None
class ScaledMaskedSoftmax(torch.autograd.Function): class ScaledMaskedSoftmax(torch.autograd.Function):
""" """
Fused operation which performs following three operations in sequence Fused operation which performs following three operations in sequence
1. Scale the tensor. 1. Scale the tensor.
2. Apply the mask. 2. Apply the mask.
3. Perform softmax. 3. Perform softmax.
""" """
@staticmethod @staticmethod
def forward(ctx, inputs, mask, scale): def forward(ctx, inputs, mask, scale):
import scaled_masked_softmax_cuda import scaled_masked_softmax_cuda
scale_t = torch.tensor([scale]) scale_t = torch.tensor([scale])
softmax_results = scaled_masked_softmax_cuda.forward(
    inputs, mask, scale_t[0]
)
ctx.save_for_backward(softmax_results, scale_t) ctx.save_for_backward(softmax_results, scale_t)
return softmax_results return softmax_results
@staticmethod @staticmethod
def backward(ctx, output_grads): def backward(ctx, output_grads):
import scaled_masked_softmax_cuda import scaled_masked_softmax_cuda
softmax_results, scale_t = ctx.saved_tensors softmax_results, scale_t = ctx.saved_tensors
input_grads = scaled_masked_softmax_cuda.backward(
    output_grads, softmax_results, scale_t[0]
)
return input_grads, None, None return input_grads, None, None
class FusedScaleMaskSoftmax(torch.nn.Module): class FusedScaleMaskSoftmax(torch.nn.Module):
""" """
fused operation: scaling + mask + softmax fused operation: scaling + mask + softmax
Arguments: Arguments:
input_in_fp16: flag to indicate if input in fp16 data format. input_in_fp16: flag to indicate if input in fp16 data format.
attn_mask_type: attention mask type (pad or causal) attn_mask_type: attention mask type (pad or causal)
mask_func: mask function to be applied. mask_func: mask function to be applied.
softmax_in_fp32: if true, softmax in performed at fp32 precision. softmax_in_fp32: if true, softmax in performed at fp32 precision.
scale: scaling factor used in input tensor scaling. scale: scaling factor used in input tensor scaling.
""" """
def __init__(
    self,
    input_in_fp16,
    attn_mask_type,
    scaled_masked_softmax_fusion,
    mask_func,
    softmax_in_fp32,
    scale,
):
super(FusedScaleMaskSoftmax, self).__init__() super(FusedScaleMaskSoftmax, self).__init__()
self.input_in_fp16 = input_in_fp16 self.input_in_fp16 = input_in_fp16
self.attn_mask_type = attn_mask_type self.attn_mask_type = attn_mask_type
...@@ -97,8 +110,9 @@ class FusedScaleMaskSoftmax(torch.nn.Module): ...@@ -97,8 +110,9 @@ class FusedScaleMaskSoftmax(torch.nn.Module):
self.softmax_in_fp32 = softmax_in_fp32 self.softmax_in_fp32 = softmax_in_fp32
self.scale = scale self.scale = scale
assert (
    self.scale is None or softmax_in_fp32
), "softmax should be in fp32 when scaled"
def forward(self, input, mask): def forward(self, input, mask):
# [b, np, sq, sk] # [b, np, sq, sk]
...@@ -108,7 +122,7 @@ class FusedScaleMaskSoftmax(torch.nn.Module): ...@@ -108,7 +122,7 @@ class FusedScaleMaskSoftmax(torch.nn.Module):
assert input.dim() == 4 assert input.dim() == 4
# invoke custom kernel # invoke custom kernel
if self.input_in_fp16 and key_seq_len <= 2048 and \ if self.input_in_fp16 and key_seq_len <= 2048 and mask is not None and \
query_seq_len % 4 == 0 and self.scaled_masked_softmax_fusion: query_seq_len % 4 == 0 and self.scaled_masked_softmax_fusion:
scale = self.scale if self.scale is not None else 1.0 scale = self.scale if self.scale is not None else 1.0
...@@ -128,7 +142,7 @@ class FusedScaleMaskSoftmax(torch.nn.Module): ...@@ -128,7 +142,7 @@ class FusedScaleMaskSoftmax(torch.nn.Module):
if self.scale is not None: if self.scale is not None:
input = input * self.scale input = input * self.scale
mask_output = self.mask_func(input, mask) mask_output = self.mask_func(input, mask) if mask is not None else input
probs = torch.nn.Softmax(dim=-1)(mask_output) probs = torch.nn.Softmax(dim=-1)(mask_output)
if self.input_in_fp16 and self.softmax_in_fp32: if self.input_in_fp16 and self.softmax_in_fp32:
......
...@@ -332,7 +332,7 @@ class TransformerLanguageModelBase(MegatronModule): ...@@ -332,7 +332,7 @@ class TransformerLanguageModelBase(MegatronModule):
def forward(self, enc_language_model_input, enc_attn_mask, def forward(self, enc_language_model_input, enc_attn_mask,
dec_language_model_input=None, dec_attn_mask=None, dec_language_model_input=None, dec_attn_mask=None,
enc_dec_attn_mask=None, tokentype_ids=None, layer_past=None, enc_dec_attn_mask=None, tokentype_ids=None, layer_past=None,
get_key_value=False, pooling_sequence_index=0, get_key_value=False, pooling_sequence_index=0,
enc_hidden_states=None, output_enc_hidden=False): enc_hidden_states=None, output_enc_hidden=False):
# Embeddings. # Embeddings.
......
...@@ -85,6 +85,7 @@ class ParallelMLP(MegatronModule): ...@@ -85,6 +85,7 @@ class ParallelMLP(MegatronModule):
init_method=output_layer_init_method, init_method=output_layer_init_method,
skip_bias_add=True) skip_bias_add=True)
def forward(self, hidden_states): def forward(self, hidden_states):
# [s, b, 4hp] # [s, b, 4hp]
...@@ -401,7 +402,7 @@ class ParallelAttention(MegatronModule): ...@@ -401,7 +402,7 @@ class ParallelAttention(MegatronModule):
return output, bias return output, bias
def bias_dropout_add(x, bias, residual, prob, training) : def bias_dropout_add(x, bias, residual, prob, training):
# type: (Tensor, Tensor, Tensor, float, bool) -> Tensor # type: (Tensor, Tensor, Tensor, float, bool) -> Tensor
out = torch.nn.functional.dropout(x + bias, p=prob, training=training) out = torch.nn.functional.dropout(x + bias, p=prob, training=training)
out = residual + out out = residual + out
...@@ -415,13 +416,13 @@ def get_bias_dropout_add(training): ...@@ -415,13 +416,13 @@ def get_bias_dropout_add(training):
@torch.jit.script @torch.jit.script
def bias_dropout_add_fused_train(x, bias, residual, prob) : def bias_dropout_add_fused_train(x, bias, residual, prob):
# type: (Tensor, Tensor, Tensor, float) -> Tensor # type: (Tensor, Tensor, Tensor, float) -> Tensor
return bias_dropout_add(x, bias, residual, prob, True) return bias_dropout_add(x, bias, residual, prob, True)
@torch.jit.script @torch.jit.script
def bias_dropout_add_fused_inference(x, bias, residual, prob) : def bias_dropout_add_fused_inference(x, bias, residual, prob):
# type: (Tensor, Tensor, Tensor, float) -> Tensor # type: (Tensor, Tensor, Tensor, float) -> Tensor
return bias_dropout_add(x, bias, residual, prob, False) return bias_dropout_add(x, bias, residual, prob, False)
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Vision Transformer(VIT) model."""
import math
import einops
import torch
import torch.nn.functional as F
from megatron import get_args
from megatron.model.transformer import ParallelTransformer
from megatron.model.utils import (
get_linear_layer,
init_method_normal,
scaled_init_method_normal,
)
from .module import MegatronModule
class VitMlpHead(MegatronModule):
"""Pooler layer.
Pool hidden states of a specific token (for example start of the
sequence) and add a linear transformation followed by a tanh.
Arguments:
hidden_size: hidden size
init_method: weight initialization method for the linear layer.
bias is set to zero.
"""
def __init__(self, hidden_size, num_classes):
super(VitMlpHead, self).__init__()
self.dense_in = torch.nn.Linear(hidden_size, hidden_size)
self.dense_out = torch.nn.Linear(hidden_size, num_classes)
torch.nn.init.constant_(self.dense_out.bias, -10)
def forward(self, hidden_states, sequence_index=0):
# hidden_states: [b, s, h]
# sequence_index: index of the token to pool.
x = hidden_states[:, sequence_index, :]
x = self.dense_in(x)
x = torch.tanh(x)
x = self.dense_out(x)
return x
def twod_interpolate_position_embeddings_hook(
state_dict,
prefix,
local_metadata,
strict,
missing_keys,
unexpected_keys,
error_msgs,
):
args = get_args()
num_patches_per_dim = args.img_dim // args.patch_dim
num_patches = num_patches_per_dim ** 2
seq_length = num_patches + 1
hidden_size = args.hidden_size
key = prefix + "weight"
assert key in state_dict
if key in state_dict:
input_param = state_dict[key]
assert input_param.shape[1] == hidden_size
if input_param.shape[0] != seq_length:
# update input_param and load it to state_dict[key]
num_tok_input = input_param.shape[0] - 1
num_tok_new = seq_length - 1
input_param_tok, input_param_grid = (
input_param[:1, :],
input_param[1:, :],
)
gs_input = int(math.sqrt(num_tok_input))
gs_new = int(math.sqrt(num_tok_new))
input_param_grid = input_param_grid.transpose(0, 1).contiguous()
input_param_grid = input_param_grid.reshape(
(1, -1, gs_input, gs_input)
)
input_param_grid = input_param_grid.float()
scale_factor = gs_new / gs_input
input_param_grid = F.interpolate(
input_param_grid, scale_factor=scale_factor, mode="bilinear"
)
input_param_grid = input_param_grid.half()
input_param_grid = input_param_grid.reshape((-1, gs_new * gs_new))
input_param_grid = input_param_grid.transpose(0, 1).contiguous()
assert input_param_grid.shape[1] == hidden_size
input_param = torch.cat((input_param_tok, input_param_grid), dim=0)
assert (
input_param.shape[0] == seq_length
and input_param.shape[1] == hidden_size
)
state_dict[key] = input_param
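The hook above resizes a pretrained position-embedding table when the finetuning `img_dim`/`patch_dim` differ from pretraining: the [CLS] row is kept as-is, and the remaining rows are reshaped into a 2-D grid and bilinearly interpolated. A standalone sketch of the same resizing step on a random table (shapes are illustrative; an explicit output size is used instead of `scale_factor`):

```python
# Standalone sketch of the grid interpolation done by the hook above:
# a 197 x h table (14 x 14 patches + [CLS]) becomes 577 x h (24 x 24 + [CLS]).
import torch
import torch.nn.functional as F

hidden = 64
old_table = torch.randn(14 * 14 + 1, hidden)

cls_row, grid = old_table[:1], old_table[1:]
gs_in, gs_out = 14, 24
grid = grid.transpose(0, 1).reshape(1, hidden, gs_in, gs_in)
grid = F.interpolate(grid, size=(gs_out, gs_out), mode="bilinear")
grid = grid.reshape(hidden, gs_out * gs_out).transpose(0, 1)

new_table = torch.cat((cls_row, grid), dim=0)
print(new_table.shape)   # torch.Size([577, 64])
```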
class VitModel(MegatronModule):
"""Vision Transformer Model."""
def __init__(self, num_classes, finetune=False):
super(VitModel, self).__init__()
args = get_args()
self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
if args.init_method_xavier_uniform:
self.init_method = torch.nn.init.xavier_uniform_
self.scaled_init_method = torch.nn.init.xavier_uniform_
else:
self.init_method = init_method_normal(args.init_method_std)
self.scaled_init_method = scaled_init_method_normal(
args.init_method_std, args.num_layers
)
self.hidden_size = args.hidden_size
self.num_classes = num_classes
self.patch_dim = args.patch_dim
self.img_dim = args.img_dim
self.finetune = finetune
assert self.img_dim % self.patch_dim == 0
self.num_patches_per_dim = self.img_dim // self.patch_dim
self.num_patches = self.num_patches_per_dim ** 2
self.seq_length = self.num_patches + 1
self.flatten_dim = self.patch_dim * self.patch_dim * args.num_channels
# cls_token
self.cls_token = torch.nn.Parameter(torch.randn(1, 1, self.hidden_size))
torch.nn.init.zeros_(self.cls_token)
# Linear encoder
self.linear_encoder = torch.nn.Linear(
self.flatten_dim, self.hidden_size
)
# embedding
self.position_embeddings = torch.nn.Embedding(
self.seq_length, self.hidden_size
)
init_method_normal(args.init_method_std)(
self.position_embeddings.weight
)
self.position_ids = torch.arange(self.seq_length).expand(1, -1).cuda()
self.position_embeddings._register_load_state_dict_pre_hook(
twod_interpolate_position_embeddings_hook
)
self.embedding_dropout = torch.nn.Dropout(args.hidden_dropout)
# Transformer
self.transformer = ParallelTransformer(
self.init_method, self.scaled_init_method
)
# MLP head
if not self.finetune:
self.mlp_head = VitMlpHead(self.hidden_size, self.num_classes)
else:
self.class_head = get_linear_layer(
self.hidden_size, num_classes, torch.nn.init.zeros_
)
def forward(self, x):
x = einops.rearrange(
x,
"b c (h p1) (w p2) -> b (h w) (p1 p2 c)",
p1=self.patch_dim,
p2=self.patch_dim,
)
assert x.dtype == torch.half
x = self.linear_encoder(x)
cls_tokens = self.cls_token.expand(x.shape[0], -1, -1)
x = torch.cat((cls_tokens, x), dim=1)
x = x + self.position_embeddings(self.position_ids)
x = self.embedding_dropout(x)
x = self.transformer(x, None)
if not self.finetune:
x = self.mlp_head(x)
else:
x = self.class_head(x[:, 0, :])
return x
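The `einops.rearrange` at the top of `forward` is the patch embedding: each `patch_dim x patch_dim` patch becomes one token of length `patch_dim**2 * num_channels`. A quick shape check with the default 224/16 geometry:

```python
# Shape sketch of the patchify rearrangement used in VitModel.forward.
import einops
import torch

b, c, img_dim, patch_dim = 2, 3, 224, 16
x = torch.randn(b, c, img_dim, img_dim)

tokens = einops.rearrange(
    x, "b c (h p1) (w p2) -> b (h w) (p1 p2 c)", p1=patch_dim, p2=patch_dim)
print(tokens.shape)   # torch.Size([2, 196, 768]); prepending [CLS] gives seq_length 197
```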
...@@ -20,7 +20,7 @@ from .initialize import get_tensor_model_parallel_rank ...@@ -20,7 +20,7 @@ from .initialize import get_tensor_model_parallel_rank
from .initialize import get_tensor_model_parallel_src_rank from .initialize import get_tensor_model_parallel_src_rank
_MAX_DATA_DIM = 4 _MAX_DATA_DIM = 5
def _check_data_types(keys, data, target_dtype): def _check_data_types(keys, data, target_dtype):
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
# limitations under the License. # limitations under the License.
from apex.optimizers import FusedAdam as Adam from apex.optimizers import FusedAdam as Adam
from apex.optimizers import FusedSGD as SGD
from megatron import get_args from megatron import get_args
from megatron.model import import_layernorm from megatron.model import import_layernorm
...@@ -52,11 +53,20 @@ def get_megatron_optimizer(model): ...@@ -52,11 +53,20 @@ def get_megatron_optimizer(model):
# Base optimizer. # Base optimizer.
param_groups = _get_params_for_weight_decay_optimization(model) param_groups = _get_params_for_weight_decay_optimization(model)
if args.optimizer == 'adam':
    optimizer = Adam(param_groups,
                     lr=args.lr,
                     weight_decay=args.weight_decay,
                     betas=(args.adam_beta1, args.adam_beta2),
                     eps=args.adam_eps)
elif args.optimizer == 'sgd':
    optimizer = SGD(param_groups,
                    lr=args.lr,
                    weight_decay=args.weight_decay,
                    momentum=args.sgd_momentum)
else:
    raise Exception('{} optimizer is not supported.'.format(
        args.optimizer))
if args.fp16: if args.fp16:
# Constant loss scale. # Constant loss scale.
......
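Optimizer choice now follows the `--optimizer` and `--sgd-momentum` flags. A runnable sketch of the same branching with `torch.optim` stand-ins (apex's `FusedAdam`/`FusedSGD` take the constructor arguments shown in the diff, but the plain optimizers avoid the apex dependency):

```python
# Sketch of the adam/sgd selection with torch.optim in place of apex.
import torch

model = torch.nn.Linear(4, 4)
optimizer_name, lr, weight_decay, sgd_momentum = 'sgd', 0.1, 0.0, 0.9

if optimizer_name == 'adam':
    optimizer = torch.optim.Adam(model.parameters(), lr=lr,
                                 weight_decay=weight_decay,
                                 betas=(0.9, 0.999), eps=1e-8)
elif optimizer_name == 'sgd':
    optimizer = torch.optim.SGD(model.parameters(), lr=lr,
                                weight_decay=weight_decay,
                                momentum=sgd_momentum)
else:
    raise Exception('{} optimizer is not supported.'.format(optimizer_name))
print(type(optimizer).__name__)   # SGD
```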
...@@ -46,7 +46,7 @@ from megatron.learning_rates import AnnealingLR ...@@ -46,7 +46,7 @@ from megatron.learning_rates import AnnealingLR
from megatron.model import DistributedDataParallel as LocalDDP from megatron.model import DistributedDataParallel as LocalDDP
from megatron.model.realm_model import ICTBertModel from megatron.model.realm_model import ICTBertModel
from megatron.utils import check_adlr_autoresume_termination from megatron.utils import check_adlr_autoresume_termination
from megatron.data.data_loaders import build_pretraining_data_loader from megatron.data.data_samplers import build_pretraining_data_loader
from megatron.utils import report_memory from megatron.utils import report_memory
...@@ -57,8 +57,11 @@ def print_datetime(string): ...@@ -57,8 +57,11 @@ def print_datetime(string):
print_rank_0('[' + string + '] datetime: {} '.format(time_str)) print_rank_0('[' + string + '] datetime: {} '.format(time_str))
def pretrain(train_valid_test_dataset_provider,
             model_provider,
             forward_step_func,
             extra_args_provider=None,
             args_defaults={}):
"""Main training program. """Main training program.
This function will run the followings in the order provided: This function will run the followings in the order provided:
...@@ -950,6 +953,11 @@ def evaluate_and_print_results(prefix, forward_step_func, ...@@ -950,6 +953,11 @@ def evaluate_and_print_results(prefix, forward_step_func,
print_rank_last('-' * length) print_rank_last('-' * length)
def cyclic_iter(iter):
while True:
for x in iter:
yield x
def build_train_valid_test_data_iterators( def build_train_valid_test_data_iterators(
build_train_valid_test_datasets_provider): build_train_valid_test_datasets_provider):
"""XXX""" """XXX"""
...@@ -1018,19 +1026,26 @@ def build_train_valid_test_data_iterators( ...@@ -1018,19 +1026,26 @@ def build_train_valid_test_data_iterators(
args.do_valid = flags[1].item() args.do_valid = flags[1].item()
args.do_test = flags[2].item() args.do_test = flags[2].item()
# Build iterators. # Build iterators.
dl_type = args.dataloader_type
assert dl_type in ['single', 'cyclic']
if train_dataloader is not None: if train_dataloader is not None:
    train_data_iterator = iter(train_dataloader) if dl_type == 'single' \
        else iter(cyclic_iter(train_dataloader))
else: else:
train_data_iterator = None train_data_iterator = None
if valid_dataloader is not None: if valid_dataloader is not None:
    valid_data_iterator = iter(valid_dataloader) if dl_type == 'single' \
        else iter(cyclic_iter(valid_dataloader))
else: else:
valid_data_iterator = None valid_data_iterator = None
if test_dataloader is not None: if test_dataloader is not None:
    test_data_iterator = iter(test_dataloader) if dl_type == 'single' \
        else iter(cyclic_iter(test_dataloader))
else: else:
test_data_iterator = None test_data_iterator = None
......
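`cyclic_iter` is what makes the 'cyclic' dataloader type usable for iteration-based training: instead of raising `StopIteration` at the end of an epoch, the wrapped iterable simply starts over. A tiny behavior sketch:

```python
# Behavior sketch: the underlying iterable is replayed indefinitely.
def cyclic_iter(iterable):
    while True:
        for x in iterable:
            yield x

stream = cyclic_iter([1, 2, 3])
print([next(stream) for _ in range(7)])   # [1, 2, 3, 1, 2, 3, 1]
```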
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Pretrain VIT"""
import torch
import torch.nn.functional as F
from megatron import get_args, get_timers, mpu, print_rank_0
from megatron.data.vit_dataset import build_train_valid_datasets
from megatron.model import VitModel
from megatron.training import pretrain
from megatron.utils import average_losses_across_data_parallel_group
def model_provider():
"""Build the model."""
print_rank_0("building VIT model ...")
args = get_args()
model = VitModel(num_classes=args.num_classes)
return model
def get_batch(data_iterator):
"""Build the batch."""
data = next(data_iterator)
# only data parallelism; no need for broadcast
images = data[0].cuda()
labels = data[1].cuda()
return images, labels
def forward_step(data_iterator, model, input_tensor):
"""Forward step."""
timers = get_timers()
assert input_tensor is None
# Get the batch.
timers("batch-generator").start()
(
images,
labels,
) = get_batch(data_iterator)
timers("batch-generator").stop()
# Forward model.
logits = model(images).contiguous().float()
loss = F.cross_entropy(logits, labels)
outputs = torch.argmax(logits, -1)
correct = (outputs == labels).float()
accuracy = torch.mean(correct)
averaged_loss = average_losses_across_data_parallel_group([loss, accuracy])
return loss, {"loss": averaged_loss[0], "accuracy": averaged_loss[1]}
def train_valid_test_datasets_provider(train_val_test_num_samples):
"""Build train, valid, and test datasets."""
args = get_args()
print_rank_0(
"> building train, validation, and test datasets " "for VIT ..."
)
train_ds, valid_ds = build_train_valid_datasets(data_path=args.data_path)
print_rank_0("> finished creating VIT datasets ...")
return train_ds, valid_ds, None
if __name__ == "__main__":
pretrain(
train_valid_test_datasets_provider,
model_provider,
forward_step,
args_defaults={'dataloader_type': 'cyclic'}
)
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Vision-classification finetuning/evaluation."""
from megatron import get_args
from megatron import print_rank_0
from megatron.model import VitModel
from megatron.data.vit_dataset import build_train_valid_datasets
from tasks.vision.eval_utils import accuracy_func_provider
from tasks.vision.finetune_utils import finetune
def classification():
def train_valid_datasets_provider():
"""Build train and validation dataset."""
args = get_args()
train_ds, valid_ds = build_train_valid_datasets(
data_path=args.data_path,
crop_size=args.img_dim,
)
return train_ds, valid_ds
def model_provider():
"""Build the model."""
args = get_args()
print_rank_0("building classification model for ImageNet ...")
return VitModel(num_classes=args.num_classes, finetune=True)
"""Finetune/evaluate."""
finetune(
train_valid_datasets_provider,
model_provider,
end_of_epoch_callback_provider=accuracy_func_provider,
)
def main():
classification()
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluation utilities."""
import os
import torch
from megatron import get_args
from megatron import print_rank_0
from megatron import mpu
from tasks.vision.finetune_utils import build_data_loader
from tasks.vision.finetune_utils import process_batch
from torchvision import datasets, transforms
def accuracy_func_provider():
"""Provide function that calculates accuracies."""
args = get_args()
data_path = args.data_path
crop_size = args.img_dim
# mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
# Build dataloaders.
val_data_path = os.path.join(data_path[0], "val")
normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
transform_val = transforms.Compose(
[
transforms.Resize(crop_size),
transforms.CenterCrop(crop_size),
transforms.ToTensor(),
normalize,
]
)
dataset = datasets.ImageFolder(root=val_data_path, transform=transform_val)
dataloader = build_data_loader(
dataset,
args.micro_batch_size,
num_workers=args.num_workers,
drop_last=(mpu.get_data_parallel_world_size() > 1),
)
def metrics_func(model, epoch):
print_rank_0("calculating metrics ...")
correct, total = calculate_correct_answers(model, dataloader, epoch)
percent = float(correct) * 100.0 / float(total)
print_rank_0(
" >> |epoch: {}| overall: correct / total = {} / {} = "
"{:.4f} %".format(epoch, correct, total, percent)
)
return metrics_func
def calculate_correct_answers(model, dataloader, epoch):
"""Calculate correct over total answers"""
model.eval()
with torch.no_grad():
# For all the batches in the dataset.
total = 0
correct = 0
for _, batch in enumerate(dataloader):
# Run the model forward.
images, labels = process_batch(batch)
logits = model(images).contiguous().float()
# Add output predictions.
# Compute the correct answers.
predicted = torch.argmax(logits, dim=-1)
corrects = (predicted == labels).float()
# Add to the counters.
total += labels.size(0)
correct += corrects.sum().item()
model.train()
# Reduce.
unreduced = torch.cuda.LongTensor([correct, total])
torch.distributed.all_reduce(unreduced, group=mpu.get_data_parallel_group())
# Print on screen.
correct_ans = unreduced[0].item()
total_count = unreduced[1].item()
return correct_ans, total_count
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Finetune utilities."""
import torch
import torch.nn.functional as F
from megatron import get_args
from megatron import print_rank_0
from megatron import get_timers
from megatron import mpu
from megatron.checkpointing import load_checkpoint
from megatron.checkpointing import save_checkpoint
from megatron.training import evaluate_and_print_results
from megatron.training import setup_model_and_optimizer
from megatron.training import train_step
from megatron.training import training_log
from megatron.utils import check_adlr_autoresume_termination
from megatron.utils import average_losses_across_data_parallel_group
def process_batch(batch):
"""Process batch and produce inputs for the model."""
images = batch[0].cuda().contiguous()
labels = batch[1].cuda().contiguous()
return images, labels
def _cross_entropy_forward_step(batch, model, input_tensor):
"""Simple forward step with cross-entropy loss."""
timers = get_timers()
assert input_tensor is None
# Get the batch.
timers("batch generator").start()
try:
batch_ = next(batch)
except BaseException:
batch_ = batch
images, labels = process_batch(batch_)
timers("batch generator").stop()
# Forward model.
logits = model(images).contiguous().float()
# Cross-entropy loss.
loss = F.cross_entropy(logits, labels)
# Reduce loss for logging.
average_loss = average_losses_across_data_parallel_group([loss])
return loss, {"lm loss": average_loss[0]}
def build_data_loader(dataset, micro_batch_size, num_workers, drop_last):
"""Data loader. Note that batch-size is the local (per GPU) batch-size."""
# Sampler.
world_size = mpu.get_data_parallel_world_size()
rank = mpu.get_data_parallel_rank()
sampler = torch.utils.data.distributed.DistributedSampler(
dataset, num_replicas=world_size, rank=rank
)
# Data loader. Note that batch size is the per GPU batch size.
data_loader = torch.utils.data.DataLoader(
dataset,
batch_size=micro_batch_size,
sampler=sampler,
shuffle=False,
num_workers=num_workers,
drop_last=drop_last,
pin_memory=True,
)
return data_loader
def _build_infinite_size_dataloader(dataloader):
"""Build a looped dataloader with infinite size."""
iterator = dataloader.__iter__()
while True:
try:
yield iterator.__next__()
except StopIteration:
iterator = dataloader.__iter__()
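`_build_infinite_size_dataloader` plays the same role for the validation loader as `cyclic_iter` does in `training.py`: it catches `StopIteration` and re-creates the iterator so evaluation can always draw a fresh batch. A standalone sketch with a toy `DataLoader`:

```python
# Sketch: the wrapped validation loader never runs dry.
import torch

dataset = torch.utils.data.TensorDataset(torch.arange(4).float())
loader = torch.utils.data.DataLoader(dataset, batch_size=2)

endless = _build_infinite_size_dataloader(loader)
for _ in range(4):            # 4 draws > 2 batches per pass; it wraps around
    (batch,) = next(endless)
    print(batch)
```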
def _build_train_valid_dataloaders(train_dataset, valid_dataset):
"""Traing and validation dataloaders."""
args = get_args()
print_rank_0("building train and validation dataloaders ...")
# Training dataset.
train_dataloader = build_data_loader(
train_dataset, args.micro_batch_size, args.num_workers, not args.keep_last
)
# Set the training iterations.
args.train_iters_per_epoch = len(train_dataloader)
args.train_iters = args.epochs * args.train_iters_per_epoch
# Validation dataset. For this dataset, we do not need to set up
# shuffling so we can just use a simple infinite loop.
valid_dataloader_ = build_data_loader(
valid_dataset, args.micro_batch_size, args.num_workers, not args.keep_last
)
valid_dataloader = _build_infinite_size_dataloader(valid_dataloader_)
return train_dataloader, valid_dataloader
def _train(
model,
optimizer,
lr_scheduler,
forward_step,
train_dataloader,
valid_dataloader,
end_of_epoch_callback,
):
"""Train the model."""
args = get_args()
timers = get_timers()
# Turn on training mode which enables dropout.
model.train()
# Tracking loss.
losses_dict_sum = {}
# Starting epoch and iteration
start_epoch = args.iteration // args.train_iters_per_epoch
start_iteration = args.iteration % args.train_iters_per_epoch
iteration = args.iteration
# Memory reporting flag.
report_memory_flag = True
# For each remaining epoch
timers("interval time").start()
for epoch in range(start_epoch, args.epochs):
print_rank_0("working on epoch {} ...".format(epoch + 1))
# Set the data loader epoch to shuffle the index iterator.
train_dataloader.sampler.set_epoch(args.seed + epoch)
# For all the batches in the dataset.
for iteration_, batch in enumerate(train_dataloader):
# Ignore the iterations before starting value
if iteration_ < start_iteration:
continue
# Set to zero so the next epoch does not skip any batches.
start_iteration = 0
# Train for one step.
losses_dict, skipped_iter = train_step(
forward_step, batch, model, optimizer, lr_scheduler
)
iteration += 1
# Logging.
report_memory_flag = training_log(
losses_dict,
losses_dict_sum,
optimizer.param_groups[0]["lr"],
iteration,
optimizer.get_loss_scale().item(),
report_memory_flag,
skipped_iter,
)
# Autoresume
if args.adlr_autoresume and (
iteration % args.adlr_autoresume_interval == 0
):
check_adlr_autoresume_termination(
iteration, model, optimizer, lr_scheduler
)
# Checkpointing
if (
args.save
and args.save_interval
and iteration % args.save_interval == 0
):
save_checkpoint(iteration, model, optimizer, lr_scheduler)
# Evaluation
if args.eval_interval and iteration % args.eval_interval == 0:
prefix = "iteration {}".format(iteration)
evaluate_and_print_results(
prefix,
forward_step,
valid_dataloader,
model,
iteration,
False,
)
# Checkpointing at the end of each epoch.
if args.save:
save_checkpoint(iteration, model, optimizer, lr_scheduler)
# Callback at the end of each epoch.
if end_of_epoch_callback is not None:
end_of_epoch_callback(model, epoch)
def finetune(
train_valid_datasets_provider,
model_provider,
forward_step=_cross_entropy_forward_step,
end_of_epoch_callback_provider=None,
):
"""Main finetune function used across all tasks."""
args = get_args()
timers = get_timers()
# Train and validation data loaders.
timers("train/valid/test dataset/dataloder").start()
if args.epochs > 0:
train_dataset, valid_dataset = train_valid_datasets_provider()
train_dataloader, valid_dataloader = _build_train_valid_dataloaders(
train_dataset, valid_dataset
)
timers("train/valid/test dataset/dataloder").stop()
# Build callback function.
timers("callback function").start()
end_of_epoch_callback = None
if end_of_epoch_callback_provider is not None:
end_of_epoch_callback = end_of_epoch_callback_provider()
timers("callback function").stop()
# Build model, optimizer and learning rate scheduler.
timers("model and optimizer").start()
model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider)
timers("model and optimizer").stop()
# If pretrained checkpoint is provided and we have not trained for
# any iteration (i.e., iteration is zero), then load the pretrained
# checkpoint.
timers("pretrained checkpoint").start()
if args.iteration == 0 and args.pretrained_checkpoint is not None:
original_load = args.load
args.load = args.pretrained_checkpoint
_ = load_checkpoint(model, None, None, strict=False)
args.load = original_load
# This is critical when only model is loaded. We should make sure
# master parameters are also updated.
optimizer.reload_model_params()
timers("pretrained checkpoint").stop()
# Print setup timing.
print_rank_0("done with setups ...")
timers.log(
[
"train/valid/test dataset/dataloder",
"callback function",
"model and optimizer",
"pretrained checkpoint",
]
)
print_rank_0("training ...")
# Finetune the model.
if args.epochs > 0:
_train(
model,
optimizer,
lr_scheduler,
forward_step,
train_dataloader,
valid_dataloader,
end_of_epoch_callback,
)
# Or just evaluate.
else:
if end_of_epoch_callback is not None:
print_rank_0("evaluation only mode, setting epoch to -1")
end_of_epoch_callback(model, epoch=-1, output_predictions=True)
print_rank_0("done :-)")
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Main tasks functionality."""
import os
import sys
sys.path.append(
os.path.abspath(
os.path.join(
os.path.join(os.path.dirname(__file__), os.path.pardir),
os.path.pardir,
)
)
)
from megatron import get_args
from megatron.initialize import initialize_megatron
from classification import main
def get_tasks_args(parser):
"""Provide extra arguments required for tasks."""
group = parser.add_argument_group(title="tasks")
group.add_argument(
"--epochs",
type=int,
default=None,
help="Number of finetunning epochs. Zero results in "
"evaluation only.",
)
group.add_argument(
"--pretrained-checkpoint",
type=str,
default=None,
help="Pretrained checkpoint used for finetunning.",
)
group.add_argument(
"--keep-last",
action="store_true",
help="Keep the last batch (maybe incomplete) in" "the data loader",
)
return parser
if __name__ == "__main__":
initialize_megatron(extra_args_provider=get_tasks_args)
args = get_args()
main()