vision transformer model and vision classification task

456f1728 · Vijay Korthikanti · f5eac3d1 · 456f1728 · 456f1728 · 456f1728
Commit 456f1728 authored Jan 08, 2021 by Vijay Korthikanti
5 changed files
--- a/pretrain_vit.py
+++ b/pretrain_vit.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Pretrain VIT"""
+import torch
+import torch.nn.functional as F
+from megatron import get_args, get_timers, mpu, print_rank_0
+from megatron.data.vit_dataset import build_train_valid_datasets
+from megatron.model import VitModel
+from megatron.training import pretrain
+from megatron.utils import average_losses_across_data_parallel_group
+def model_provider():
+    """Construct the ViT model sized by the ``num_classes`` argument."""
+    print_rank_0("building VIT model ...")
+    num_classes = get_args().num_classes
+    return VitModel(num_classes=num_classes)
+def get_batch(data_iterator):
+    """Fetch the next (images, labels) batch and broadcast it through ``mpu``.
+
+    Args:
+        data_iterator: Iterator whose items are indexable as
+            ``(images, labels)``, or ``None`` on ranks that do not read data.
+
+    Returns:
+        Tuple ``(images, labels)``; images are returned as broadcast
+        (``torch.half``) and labels are cast to ``long``.
+    """
+    # Items and their type.
+    keys = ["image", "label"]
+    datatype = torch.half
+    # Only ranks that own an iterator have local tensors to contribute.
+    # Indexing into ``None`` here used to raise before the broadcast ran.
+    if data_iterator is not None:
+        data = next(data_iterator)
+        dict_data = {"image": data[0].half(), "label": data[1].half()}
+    else:
+        # NOTE(review): presumably broadcast_data ignores ``data`` on
+        # non-source ranks -- confirm against mpu.
+        dict_data = None
+    # NOTE(review): labels travel as fp16, which represents integers exactly
+    # only up to 2048; larger class ids would not round-trip.
+    data_b = mpu.broadcast_data(keys, dict_data, datatype)
+    # Unpack.
+    images = data_b["image"]
+    labels = data_b["label"].long()
+    return images, labels
+def forward_step(data_iterator, model, input_tensor):
+    """Run one forward pass and return the loss plus averaged metrics.
+
+    Returns ``(loss, stats)`` where ``stats`` holds the loss and the batch
+    accuracy after averaging across the data-parallel group.
+    """
+    timers = get_timers()
+    assert input_tensor is None
+    # Time how long it takes to obtain the batch.
+    timers("batch generator").start()
+    images, labels = get_batch(data_iterator)
+    timers("batch generator").stop()
+    # Class scores, promoted to fp32 before the loss.
+    logits = model(images).contiguous().float()
+    loss = F.cross_entropy(logits, labels)
+    # Accuracy: fraction of samples whose arg-max class equals the label.
+    predictions = torch.argmax(logits, -1)
+    accuracy = (predictions == labels).float().mean()
+    reduced = average_losses_across_data_parallel_group([loss, accuracy])
+    return loss, {"loss": reduced[0], "accuracy": reduced[1]}
+def train_valid_test_datasets_provider(train_val_test_num_samples):
+    """Return ``(train, valid, None)`` datasets built from ``args.data_path``.
+
+    ``train_val_test_num_samples`` is accepted but not used here; no test
+    dataset is built.
+    """
+    args = get_args()
+    print_rank_0("> building train, validation, and test datasets for VIT ...")
+    datasets = build_train_valid_datasets(data_path=args.data_path)
+    train_ds, valid_ds = datasets
+    print_rank_0("> finished creating VIT datasets ...")
+    return train_ds, valid_ds, None
+# Script entry point: hand the dataset provider, model provider and forward
+# step defined above to the ``pretrain`` driver from ``megatron.training``.
+if __name__ == "__main__":
+    pretrain(
+        train_valid_test_datasets_provider,
+        model_provider,
+        forward_step,
+        random_sample=True
+    )
--- a/tasks/vision/classification.py
+++ b/tasks/vision/classification.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Vision classification finetuning/evaluation."""
+from megatron import get_args
+from megatron import print_rank_0
+from megatron.model import VitModel
+from megatron.data.vit_dataset import build_train_valid_datasets
+from tasks.vision.eval_utils import accuracy_func_provider
+from tasks.vision.finetune_utils import finetune
+def classification():
+    """Finetune/evaluate a ViT classifier by delegating to ``finetune``."""
+    def _build_datasets():
+        """Return the (train, valid) datasets cropped to ``args.img_dim``."""
+        args = get_args()
+        train, valid = build_train_valid_datasets(
+            data_path=args.data_path, crop_size=args.img_dim
+        )
+        return train, valid
+    def _build_model():
+        """Return a ``VitModel`` configured for finetuning."""
+        args = get_args()
+        print_rank_0("building classification model for ImageNet ...")
+        model = VitModel(num_classes=args.num_classes, finetune=True)
+        return model
+    # Accuracy is reported through the end-of-epoch callback.
+    finetune(
+        _build_datasets,
+        _build_model,
+        end_of_epoch_callback_provider=accuracy_func_provider,
+    )
+def main():
+    """Entry point for the vision task: run ``classification``."""
+    classification()
--- a/tasks/vision/eval_utils.py
+++ b/tasks/vision/eval_utils.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Evaluation utilities."""
+import os
+import torch
+from megatron import get_args
+from megatron import print_rank_0
+from megatron import mpu
+from tasks.vision.finetune_utils import build_data_loader
+from tasks.vision.finetune_utils import process_batch
+from torchvision import datasets, transforms
+def accuracy_func_provider():
+    """Build the validation dataloader and return an accuracy callback.
+
+    Reads ``data_path``, ``img_dim``, ``micro_batch_size`` and ``num_workers``
+    from the global args. Validation images are loaded with ``ImageFolder``
+    from the ``val`` sub-directory of the first entry of ``data_path``.
+
+    Returns:
+        ``metrics_func(model, epoch, output_predictions=False)``, which
+        evaluates ``model`` and reports the overall accuracy through
+        ``print_rank_0``.
+    """
+    args = get_args()
+    data_path = args.data_path
+    crop_size = args.img_dim
+    # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+    # Build dataloaders.
+    val_data_path = os.path.join(data_path[0], "val")
+    normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
+    transform_val = transforms.Compose(
+        [
+            transforms.Resize(crop_size),
+            transforms.CenterCrop(crop_size),
+            transforms.ToTensor(),
+            normalize,
+        ]
+    )
+    dataset = datasets.ImageFolder(root=val_data_path, transform=transform_val)
+    dataloader = build_data_loader(
+        dataset,
+        args.micro_batch_size,
+        num_workers=args.num_workers,
+        drop_last=(mpu.get_data_parallel_world_size() > 1),
+    )
+    def metrics_func(model, epoch, output_predictions=False):
+        """Evaluate ``model`` on the validation set and print the accuracy.
+
+        ``output_predictions`` is accepted (and ignored) so that callers
+        which pass it as a keyword do not fail with a ``TypeError``.
+        """
+        print_rank_0("calculating metrics ...")
+        correct, total = calculate_correct_answers(model, dataloader, epoch)
+        # With ``drop_last`` a small dataset can yield no samples at all;
+        # report NaN instead of dividing by zero.
+        if total:
+            percent = float(correct) * 100.0 / float(total)
+        else:
+            percent = float("nan")
+        print_rank_0(
+            " >> |epoch: {}| overall: correct / total = {} / {} = "
+            "{:.4f} %".format(epoch, correct, total, percent)
+        )
+    return metrics_func
+def calculate_correct_answers(model, dataloader, epoch):
+    """Count correct predictions over ``dataloader`` across data-parallel ranks.
+
+    Args:
+        model: Callable model; put in eval mode for the pass and switched
+            back to train mode afterwards, even if evaluation raises.
+        dataloader: Iterable of batches accepted by ``process_batch``.
+        epoch: Unused; kept for interface compatibility.
+
+    Returns:
+        Tuple ``(correct, total)`` of integer counts after an all-reduce over
+        the data-parallel group.
+    """
+    correct = 0
+    total = 0
+    model.eval()
+    try:
+        with torch.no_grad():
+            # For all the batches in the dataset.
+            for batch in dataloader:
+                # Run the model forward.
+                images, labels = process_batch(batch)
+                logits = model(images).contiguous().float()
+                # Compute the correct answers.
+                predicted = torch.argmax(logits, dim=-1)
+                # Add to the counters. The count is kept as an int so the
+                # long tensor below is never built from a float.
+                total += labels.size(0)
+                correct += int((predicted == labels).sum().item())
+    finally:
+        model.train()
+    # Reduce.
+    counts = torch.tensor(
+        [correct, total],
+        dtype=torch.long,
+        device=torch.cuda.current_device(),
+    )
+    torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
+    return counts[0].item(), counts[1].item()
--- a/tasks/vision/finetune_utils.py
+++ b/tasks/vision/finetune_utils.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Finetune utilities."""
+import torch
+import torch.nn.functional as F
+from megatron import get_args
+from megatron import print_rank_0
+from megatron import get_timers
+from megatron import mpu
+from megatron.checkpointing import load_checkpoint
+from megatron.checkpointing import save_checkpoint
+from megatron.training import evaluate_and_print_results
+from megatron.training import setup_model_and_optimizer
+from megatron.training import train_step
+from megatron.training import training_log
+from megatron.utils import check_adlr_autoresume_termination
+from megatron.utils import average_losses_across_data_parallel_group
+def process_batch(batch):
+    """Move an ``(images, labels)`` batch to the GPU as half / long tensors."""
+    raw_images, raw_labels = batch[0], batch[1]
+    images = raw_images.half().cuda().contiguous()
+    labels = raw_labels.long().cuda().contiguous()
+    return images, labels
+def _cross_entropy_forward_step(batch, model, input_tensor):
+    """Simple forward step with cross-entropy loss.
+
+    Args:
+        batch: Either an iterator yielding batches or an already materialized
+            batch accepted by ``process_batch``.
+        model: Callable mapping images to class logits.
+        input_tensor: Must be ``None``.
+
+    Returns:
+        Tuple ``(loss, {"lm loss": averaged_loss})`` where the logged value is
+        averaged across the data-parallel group.
+    """
+    timers = get_timers()
+    assert input_tensor is None
+    # Get the batch.
+    timers("batch generator").start()
+    try:
+        batch_ = next(batch)
+    except TypeError:
+        # ``next`` raises TypeError for a non-iterator, i.e. ``batch`` is
+        # already a concrete batch. Anything else (StopIteration,
+        # KeyboardInterrupt, loader failures) is allowed to propagate.
+        batch_ = batch
+    images, labels = process_batch(batch_)
+    timers("batch generator").stop()
+    # Forward model.
+    logits = model(images).contiguous().float()
+    # Cross-entropy loss.
+    loss = F.cross_entropy(logits, labels)
+    # Reduce loss for logging.
+    average_loss = average_losses_across_data_parallel_group([loss])
+    return loss, {"lm loss": average_loss[0]}
+def build_data_loader(dataset, micro_batch_size, num_workers, drop_last):
+    """Wrap ``dataset`` in a DataLoader driven by a DistributedSampler.
+
+    ``micro_batch_size`` is the local (per GPU) batch size. The sampler is
+    configured with the data-parallel world size and rank from ``mpu``.
+    """
+    sampler = torch.utils.data.distributed.DistributedSampler(
+        dataset,
+        num_replicas=mpu.get_data_parallel_world_size(),
+        rank=mpu.get_data_parallel_rank(),
+    )
+    loader_kwargs = dict(
+        batch_size=micro_batch_size,
+        sampler=sampler,
+        shuffle=False,
+        num_workers=num_workers,
+        drop_last=drop_last,
+        pin_memory=True,
+    )
+    return torch.utils.data.DataLoader(dataset, **loader_kwargs)
+def _build_infinite_size_dataloader(dataloader):
+    """Yield batches from ``dataloader`` forever, restarting it when exhausted.
+
+    Args:
+        dataloader: Any re-iterable object (iterated again after each pass).
+
+    Raises:
+        ValueError: If a full pass over ``dataloader`` yields nothing; without
+            this check the generator would spin forever without producing a
+            batch.
+    """
+    while True:
+        produced = False
+        for batch in dataloader:
+            produced = True
+            yield batch
+        if not produced:
+            raise ValueError(
+                "dataloader yielded no batches; cannot build an infinite loop"
+            )
+def _build_train_valid_dataloaders(train_dataset, valid_dataset):
+    """Build the training dataloader and an endlessly looping validation one.
+
+    Also records ``train_iters_per_epoch`` and ``train_iters`` on the global
+    args, derived from the length of the training dataloader.
+    """
+    args = get_args()
+    print_rank_0("building train and validation dataloaders ...")
+    # The last (possibly incomplete) batch is dropped unless --keep-last.
+    drop_last = not args.keep_last
+    train_dataloader = build_data_loader(
+        train_dataset, args.micro_batch_size, args.num_workers, drop_last
+    )
+    # Total training iterations follow from the epoch length.
+    iters_per_epoch = len(train_dataloader)
+    args.train_iters_per_epoch = iters_per_epoch
+    args.train_iters = args.epochs * iters_per_epoch
+    # Validation needs no per-epoch shuffling, so a plain infinite loop over
+    # the loader is enough.
+    valid_loader = build_data_loader(
+        valid_dataset, args.micro_batch_size, args.num_workers, drop_last
+    )
+    return train_dataloader, _build_infinite_size_dataloader(valid_loader)
+def _train(
+    model,
+    optimizer,
+    lr_scheduler,
+    forward_step,
+    train_dataloader,
+    valid_dataloader,
+    end_of_epoch_callback,
+):
+    """Train the model for the remaining epochs.
+
+    Resumes from ``args.iteration``: the starting epoch and the number of
+    batches to skip inside that epoch are derived from
+    ``args.train_iters_per_epoch``.
+
+    Args:
+        model: Model to train; switched to train mode on entry.
+        optimizer: Optimizer passed to ``train_step``; also queried for the
+            current learning rate and loss scale when logging.
+        lr_scheduler: Learning rate scheduler passed to ``train_step``.
+        forward_step: Forward-step function used for training and evaluation.
+        train_dataloader: Training dataloader whose sampler supports
+            ``set_epoch``.
+        valid_dataloader: Validation data passed to
+            ``evaluate_and_print_results``.
+        end_of_epoch_callback: Optional callable invoked as
+            ``callback(model, epoch)`` after every epoch, or ``None``.
+    """
+    args = get_args()
+    timers = get_timers()
+    # Turn on training mode which enables dropout.
+    model.train()
+    # Running sums of the logged losses, updated by training_log.
+    losses_dict_sum = {}
+    # Starting epoch and iteration within that epoch (non-zero on resume).
+    start_epoch = args.iteration // args.train_iters_per_epoch
+    start_iteration = args.iteration % args.train_iters_per_epoch
+    iteration = args.iteration
+    # Memory reporting flag; training_log returns its next value.
+    report_memory_flag = True
+    # For each remaining epoch
+    timers("interval time").start()
+    for epoch in range(start_epoch, args.epochs):
+        print_rank_0("working on epoch {} ...".format(epoch + 1))
+        # Set the data loader epoch to shuffle the index iterator.
+        train_dataloader.sampler.set_epoch(args.seed + epoch)
+        # For all the batches in the dataset.
+        for iteration_, batch in enumerate(train_dataloader):
+            # Ignore the iterations before starting value
+            if iteration_ < start_iteration:
+                continue
+            # Set to zero so the next epoch does not skip any batches.
+            start_iteration = 0
+            # Train for one step.
+            losses_dict, skipped_iter = train_step(
+                forward_step, batch, model, optimizer, lr_scheduler
+            )
+            iteration += 1
+            # Logging.
+            report_memory_flag = training_log(
+                losses_dict,
+                losses_dict_sum,
+                optimizer.param_groups[0]["lr"],
+                iteration,
+                optimizer.get_loss_scale().item(),
+                report_memory_flag,
+                skipped_iter,
+            )
+            # Autoresume
+            if args.adlr_autoresume and (
+                iteration % args.adlr_autoresume_interval == 0
+            ):
+                check_adlr_autoresume_termination(
+                    iteration, model, optimizer, lr_scheduler
+                )
+            # Checkpointing every ``save_interval`` iterations.
+            if (
+                args.save
+                and args.save_interval
+                and iteration % args.save_interval == 0
+            ):
+                save_checkpoint(iteration, model, optimizer, lr_scheduler)
+            # Evaluation every ``eval_interval`` iterations.
+            if args.eval_interval and iteration % args.eval_interval == 0:
+                prefix = "iteration {}".format(iteration)
+                evaluate_and_print_results(
+                    prefix,
+                    forward_step,
+                    valid_dataloader,
+                    model,
+                    iteration,
+                    False,
+                )
+        # Checkpointing at the end of each epoch.
+        if args.save:
+            save_checkpoint(iteration, model, optimizer, lr_scheduler)
+        # Callback at the end of each epoch.
+        if end_of_epoch_callback is not None:
+            end_of_epoch_callback(model, epoch)
+def finetune(
+    train_valid_datasets_provider,
+    model_provider,
+    forward_step=_cross_entropy_forward_step,
+    end_of_epoch_callback_provider=None,
+):
+    """Main finetune function used across all tasks.
+
+    Builds the dataloaders (only when ``args.epochs > 0``), the optional
+    end-of-epoch callback, and the model/optimizer/scheduler; optionally loads
+    a pretrained checkpoint; then either trains or, when ``args.epochs`` is
+    not positive, runs the callback once in evaluation-only mode.
+
+    Args:
+        train_valid_datasets_provider: Zero-argument callable returning
+            ``(train_dataset, valid_dataset)``.
+        model_provider: Callable handed to ``setup_model_and_optimizer``.
+        forward_step: Forward-step function used for training/evaluation.
+        end_of_epoch_callback_provider: Optional zero-argument callable
+            returning a ``callback(model, epoch)`` function.
+    """
+    args = get_args()
+    timers = get_timers()
+    # Train and validation data loaders.
+    timers("train/valid/test dataset/dataloder").start()
+    if args.epochs > 0:
+        train_dataset, valid_dataset = train_valid_datasets_provider()
+        train_dataloader, valid_dataloader = _build_train_valid_dataloaders(
+            train_dataset, valid_dataset
+        )
+    timers("train/valid/test dataset/dataloder").stop()
+    # Build callback function.
+    timers("callback function").start()
+    end_of_epoch_callback = None
+    if end_of_epoch_callback_provider is not None:
+        end_of_epoch_callback = end_of_epoch_callback_provider()
+    timers("callback function").stop()
+    # Build model, optimizer and learning rate scheduler.
+    timers("model and optimizer").start()
+    model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider)
+    timers("model and optimizer").stop()
+    # If pretrained checkpoint is provided and we have not trained for
+    # any iteration (i.e., iteration is zero), then load the pretrained
+    # checkpoint.
+    timers("pretrained checkpoint").start()
+    if args.iteration == 0 and args.pretrained_checkpoint is not None:
+        # Temporarily point ``args.load`` at the pretrained checkpoint.
+        original_load = args.load
+        args.load = args.pretrained_checkpoint
+        _ = load_checkpoint(model, None, None, strict=False)
+        args.load = original_load
+        # This is critical when only model is loaded. We should make sure
+        # master parameters are also updated.
+        optimizer.reload_model_params()
+    timers("pretrained checkpoint").stop()
+    # Print setup timing.
+    print_rank_0("done with setups ...")
+    timers.log(
+        [
+            "train/valid/test dataset/dataloder",
+            "callback function",
+            "model and optimizer",
+            "pretrained checkpoint",
+        ]
+    )
+    print_rank_0("training ...")
+    # Finetune the model.
+    if args.epochs > 0:
+        _train(
+            model,
+            optimizer,
+            lr_scheduler,
+            forward_step,
+            train_dataloader,
+            valid_dataloader,
+            end_of_epoch_callback,
+        )
+    # Or just evaluate.
+    elif end_of_epoch_callback is not None:
+        print_rank_0("evaluation only mode, setting epoch to -1")
+        # Use the same ``(model, epoch)`` contract as the training loop; the
+        # former extra ``output_predictions`` keyword is not part of it.
+        end_of_epoch_callback(model, epoch=-1)
+    print_rank_0("done :-)")
--- a/tasks/vision/main.py
+++ b/tasks/vision/main.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Main tasks functionality."""
+import os
+import sys
+# Make the directory two levels above this file importable so that the
+# ``megatron`` imports below resolve when this file is run as a script.
+sys.path.append(
+    os.path.abspath(
+        os.path.join(
+            os.path.join(os.path.dirname(__file__), os.path.pardir),
+            os.path.pardir,
+        )
+    )
+)
+from megatron import get_args
+from megatron.initialize import initialize_megatron
+from classification import main
+def get_tasks_args(parser):
+    """Provide extra arguments required for tasks.
+
+    Adds a ``tasks`` argument group with ``--epochs``,
+    ``--pretrained-checkpoint`` and ``--keep-last`` to ``parser``.
+
+    Args:
+        parser: An ``argparse`` parser to extend in place.
+
+    Returns:
+        The same ``parser`` object.
+    """
+    group = parser.add_argument_group(title="tasks")
+    group.add_argument(
+        "--epochs",
+        type=int,
+        default=None,
+        help="Number of finetunning epochs. Zero results in "
+        "evaluation only.",
+    )
+    group.add_argument(
+        "--pretrained-checkpoint",
+        type=str,
+        default=None,
+        help="Pretrained checkpoint used for finetunning.",
+    )
+    group.add_argument(
+        "--keep-last",
+        action="store_true",
+        # The adjacent literals used to join as "inthe data loader".
+        help="Keep the last batch (maybe incomplete) in the data loader",
+    )
+    return parser
+# Script entry point: initialize Megatron with the task-specific arguments,
+# then run the classification task.
+if __name__ == "__main__":
+    initialize_megatron(extra_args_provider=get_tasks_args)
+    # NOTE(review): ``args`` is not used below.
+    args = get_args()
+    main()