OpenDAS / Megatron-LM · Commits

Commit 488f8c02
Parent: 1cd3650d
Authored Feb 11, 2022 by Vijay Korthikanti; committed Feb 15, 2022 by Sangkug Lym

    address review comments

Showing 13 changed files with 147 additions and 97 deletions (+147, −97)
LICENSE                                         +115  −4
megatron/arguments.py                             +2  −0
megatron/model/vision/classification.py           +1  −1
megatron/model/vision/dino.py                     +1  −1
megatron/model/vision/esvit_swin_backbone.py      +1  −1
megatron/model/vision/inpainting.py               +4  −14
megatron/model/vision/knn_monitor.py             +11  −1
megatron/model/vision/mit_backbone.py             +3  −1
megatron/model/vision/utils.py                    +0  −61
megatron/training.py                              +4  −5
pretrain_vision_classify.py                       +1  −1
pretrain_vision_dino.py                           +3  −6
pretrain_vision_inpaint.py                        +1  −1
LICENSE

@@ -29,12 +29,13 @@ The following applies to all files unless otherwise noted:
 --

 This repository also contains code from Hugging Face Inc., Google Research,
-Facebook (from their Fairseq project), and Philip Popien. Files from these
-organizations have notices at the top of each file. Below are licenses
-used in those files, as indicated.
+Facebook (from their Fairseq and Dino projects), Microsoft(from their
+Swin-Transformer project)and Philip Popien. Files from these
+organizations have notices at the top of each file. Below are
+licenses used in those files, as indicated.

-------------- LICENSE FOR huggingface and Google Research code --------------
+------------- LICENSE FOR Facebook, huggingface and Google Research code --------------


                                 Apache License
@@ -263,3 +264,113 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
+
+------------- LICENSE FOR Mircrosoft Swin transformer code --------------
+
+MIT License
+
+Copyright (c) Microsoft Corporation.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE
+
+--------------- NVIDIA Source Code License for SegFormer -----------------
+
+1. Definitions
+
+“Licensor” means any person or entity that distributes its Work.
+
+“Software” means the original work of authorship made available under this
+License.
+
+“Work” means the Software and any additions to or derivative works of the
+Software that are made available under this License.
+
+The terms “reproduce,” “reproduction,” “derivative works,” and
+“distribution” have the meaning as provided under U.S. copyright law;
+provided, however, that for the purposes of this License, derivative works
+shall not include works that remain separable from, or merely link
+(or bind by name) to the interfaces of, the Work.
+
+Works, including the Software, are “made available” under this License by
+including in or with the Work either (a) a copyright notice referencing
+the applicability of this License to the Work, or (b) a copy of this License.
+
+2. License Grant
+
+2.1 Copyright Grant. Subject to the terms and conditions of this License,
+each Licensor grants to you a perpetual, worldwide, non-exclusive,
+royalty-free, copyright license to reproduce, prepare derivative works of,
+publicly display, publicly perform, sublicense and distribute its Work
+and any resulting derivative works in any form.
+
+3. Limitations
+
+3.1 Redistribution. You may reproduce or distribute the Work only if
+(a) you do so under this License, (b) you include a complete copy of this
+License with your distribution, and (c) you retain without modification any
+copyright, patent, trademark, or attribution notices that are present
+in the Work.
+
+3.2 Derivative Works. You may specify that additional or different terms
+apply to the use, reproduction, and distribution of your derivative works
+of the Work (“Your Terms”) only if (a) Your Terms provide that the use
+limitation in Section 3.3 applies to your derivative works, and (b) you
+identify the specific derivative works that are subject to Your Terms.
+Notwithstanding Your Terms, this License (including the redistribution
+requirements in Section 3.1) will continue to apply to the Work itself.
+
+3.3 Use Limitation. The Work and any derivative works thereof only may
+be used or intended for use non-commercially. Notwithstanding the
+foregoing, NVIDIA and its affiliates may use the Work and any derivative
+works commercially. As used herein, “non-commercially” means for research
+or evaluation purposes only.
+
+3.4 Patent Claims. If you bring or threaten to bring a patent claim against
+any Licensor (including any claim, cross-claim or counterclaim in a lawsuit)
+to enforce any patents that you allege are infringed by any Work, then
+your rights under this License from such Licensor (including the grant
+in Section 2.1) will terminate immediately.
+
+3.5 Trademarks. This License does not grant any rights to use any Licensor’s
+or its affiliates’ names, logos, or trademarks, except as necessary to
+reproduce the notices described in this License.
+
+3.6 Termination. If you violate any term of this License, then your rights
+under this License (including the grant in Section 2.1) will terminate
+immediately.
+
+4. Disclaimer of Warranty.
+
+THE WORK IS PROVIDED “AS IS” WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR NON-INFRINGEMENT.
+YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS LICENSE.
+
+5. Limitation of Liability.
+
+EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
+THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
+SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
+INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT
+OF OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
+(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
+LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
+COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
megatron/arguments.py

@@ -881,6 +881,8 @@ def _add_vision_args(parser):
                        help='learning rate multiplier for head during finetuning')

     # pretraining type and backbone selection`
+    group.add_argument('--vision-pretraining', action='store_true',
+                       help='flag to indicate vision pretraining')
     group.add_argument('--vision-pretraining-type', type=str, default='classify',
                        choices=['classify', 'inpaint', 'dino'],
                        help='pretraining objectives')
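For context, a minimal standalone sketch of how the two flags interact, using plain argparse outside Megatron's _add_vision_args() wiring (the parse_args list is illustrative, not from the diff):

    import argparse

    parser = argparse.ArgumentParser()
    group = parser.add_argument_group(title='vision')
    group.add_argument('--vision-pretraining', action='store_true',
                       help='flag to indicate vision pretraining')
    group.add_argument('--vision-pretraining-type', type=str, default='classify',
                       choices=['classify', 'inpaint', 'dino'],
                       help='pretraining objectives')

    # argparse maps the dashes to underscores, which is why the rest of the
    # commit tests args.vision_pretraining / args.vision_pretraining_type.
    args = parser.parse_args(['--vision-pretraining',
                              '--vision-pretraining-type', 'dino'])
    assert args.vision_pretraining and args.vision_pretraining_type == 'dino'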
megatron/model/vision/classification.py

@@ -16,11 +16,11 @@
 """Vision Transformer(VIT) model."""

 import torch
+from torch.nn.init import trunc_normal_
 from megatron import get_args
 from megatron.model.utils import get_linear_layer
 from megatron.model.vision.vit_backbone import VitBackbone, VitMlpHead
 from megatron.model.vision.mit_backbone import mit_b3_avg
-from megatron.model.vision.utils import trunc_normal_
 from megatron.model.module import MegatronModule

 class VitClassificationModel(MegatronModule):
megatron/model/vision/dino.py

@@ -11,11 +11,11 @@ import einops
 import torch
 import numpy as np
 import torch.nn.functional as F
+from torch.nn.init import trunc_normal_
 from megatron import get_args, print_rank_0
 from megatron.model.utils import get_linear_layer
 from megatron.model.vision.vit_backbone import VitBackbone
 from megatron.model.module import MegatronModule
-from megatron.model.vision.utils import trunc_normal_
 from megatron.model.vision.mit_backbone import mit_b5_avg
 from megatron.model.vision.esvit_swin_backbone import get_swin
megatron/model/vision/esvit_swin_backbone.py

@@ -14,7 +14,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 from functools import partial
 import torch.distributed as dist
-from megatron.model.vision.utils import trunc_normal_
+from torch.nn.init import trunc_normal_
 from megatron.model.transformer import DropPath
 from megatron import get_args
 from megatron.model import LayerNorm
megatron/model/vision/inpainting.py

@@ -1,18 +1,8 @@
 # coding=utf-8
 # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Vision Transformer(VIT) model."""
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+i
 import math
 import apex
 import einops
@@ -24,7 +14,7 @@ from megatron.model.utils import get_linear_layer
 from megatron.model.vision.vit_backbone import VitBackbone
 from megatron.model.module import MegatronModule
 from megatron.model.vision.mit_backbone import mit_b3
-from megatron.model.vision.utils import resize, trunc_normal_
+from megatron.model.vision.utils import resize_

 class VitInpaintingModel(MegatronModule):
megatron/model/vision/knn_monitor.py

@@ -4,6 +4,9 @@ from megatron import print_rank_0, get_args, mpu
 from megatron.data.vit_dataset import ClassificationTransform
 from megatron.data.image_folder import ImageFolder

+_FEATURE_BANK = None
+
+
 def build_data_loader(dataset, drop_last=True, shuffle=False):
     """Data loader. Note that batch-size is the local (per GPU) batch-size."""
     # Sampler.
@@ -32,6 +35,7 @@ def build_data_loader(dataset, drop_last=True, shuffle=False):

 def compute_feature_bank(model):
     args = get_args()
+    global _FEATURE_BANK
     feature_bank = []
     feature_label = []
@@ -84,7 +88,13 @@ def compute_feature_bank(model):
     print_rank_0("feature_banks size is {}".format(feature_banks.size()))
     print_rank_0("feature labels size is {}".format(feature_labels.size()))

-    return (feature_banks, feature_labels, classes)
+    _FEATURE_BANK = (feature_banks, feature_labels, classes)
+
+
+def get_feature_bank():
+    global _FEATURE_BANK
+    assert _FEATURE_BANK is not None
+    return _FEATURE_BANK

 # knn monitor as in InstDisc https://arxiv.org/abs/1805.01978
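The net effect is a module-level cache: compute_feature_bank() now stores its result instead of returning it, and consumers pull it back with get_feature_bank(). A self-contained toy of the pattern (the random tensors below are stand-ins for the real gathered features, not Megatron code):

    import torch

    _FEATURE_BANK = None

    def compute_feature_bank(model=None):
        global _FEATURE_BANK
        feature_banks = torch.randn(128, 1000)          # stand-in for gathered features
        feature_labels = torch.randint(0, 10, (1000,))  # stand-in for their labels
        classes = 10
        _FEATURE_BANK = (feature_banks, feature_labels, classes)

    def get_feature_bank():
        assert _FEATURE_BANK is not None, "compute_feature_bank() must run first"
        return _FEATURE_BANK

    compute_feature_bank()
    banks, labels, num_classes = get_feature_bank()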
megatron/model/vision/mit_backbone.py

@@ -2,13 +2,15 @@
 # Copyright (c) 2021, NVIDIA Corporation. All rights reserved.
 #
 # This work is licensed under the NVIDIA Source Code License
+# found in the LICENSE file in the root directory of this
+# source tree.
 # ---------------------------------------------------------------
 import math
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from functools import partial
-from megatron.model.vision.utils import trunc_normal_
+from torch.nn.init import trunc_normal_
 from megatron.model.transformer import DropPath
 from megatron.model import LayerNorm
megatron/model/vision/utils.py

@@ -1,6 +1,3 @@
-import warnings
-import math
-from itertools import repeat
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -28,61 +25,3 @@ def resize(input,
     if isinstance(size, torch.Size):
         size = tuple(int(x) for x in size)
     return F.interpolate(input, size, scale_factor, mode, align_corners)
-
-
-def _no_grad_trunc_normal_(tensor, mean, std, a, b):
-    # Cut & paste from PyTorch official master until it's in a few official releases - RW
-    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
-    def norm_cdf(x):
-        # Computes standard normal cumulative distribution function
-        return (1. + math.erf(x / math.sqrt(2.))) / 2.
-
-    if (mean < a - 2 * std) or (mean > b + 2 * std):
-        warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
-                      "The distribution of values may be incorrect.",
-                      stacklevel=2)
-
-    with torch.no_grad():
-        # Values are generated by using a truncated uniform distribution and
-        # then using the inverse CDF for the normal distribution.
-        # Get upper and lower cdf values
-        l = norm_cdf((a - mean) / std)
-        u = norm_cdf((b - mean) / std)
-
-        # Uniformly fill tensor with values from [l, u], then translate to
-        # [2l-1, 2u-1].
-        tensor.uniform_(2 * l - 1, 2 * u - 1)
-
-        # Use inverse cdf transform for normal distribution to get truncated
-        # standard normal
-        tensor.erfinv_()
-
-        # Transform to proper mean, std
-        tensor.mul_(std * math.sqrt(2.))
-        tensor.add_(mean)
-
-        # Clamp to ensure it's in the proper range
-        tensor.clamp_(min=a, max=b)
-        return tensor
-
-
-def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
-    # type: (Tensor, float, float, float, float) -> Tensor
-    r"""Fills the input Tensor with values drawn from a truncated
-    normal distribution. The values are effectively drawn from the
-    normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
-    with values outside :math:`[a, b]` redrawn until they are within
-    the bounds. The method used for generating the random values works
-    best when :math:`a \leq \text{mean} \leq b`.
-    Args:
-        tensor: an n-dimensional `torch.Tensor`
-        mean: the mean of the normal distribution
-        std: the standard deviation of the normal distribution
-        a: the minimum cutoff value
-        b: the maximum cutoff value
-    Examples:
-        >>> w = torch.empty(3, 5)
-        >>> nn.init.trunc_normal_(w)
-    """
-    return _no_grad_trunc_normal_(tensor, mean, std, a, b)
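The deleted helper was a local copy of an initializer that ships in torch.nn.init with the same (tensor, mean, std, a, b) signature, which is why every caller in this commit swaps its import to the upstream version. A quick usage check (the std value is illustrative):

    import torch
    from torch.nn.init import trunc_normal_

    w = torch.empty(3, 5)
    trunc_normal_(w, mean=0., std=0.02, a=-2., b=2.)  # in-place; values clamped to [a, b]
    assert w.min() >= -2. and w.max() <= 2.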
megatron/training.py

@@ -21,7 +21,6 @@ import sys
 import time
-
 # The earliest we can measure the start time.
 _TRAIN_START_TIME = time.time()
 import torch
 from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
@@ -465,7 +464,7 @@ def train_step(forward_step_func, data_iterator,
             torch.distributed.all_reduce(grad, group=mpu.get_position_embedding_group())
     timers('backward-embedding-all-reduce').stop()

-    if args.vision_pretraining_type == "dino":
+    if args.vision_pretraining and args.vision_pretraining_type == "dino":
         unwrapped_model = unwrap_model(model[0], (torchDDP, LocalDDP, Float16Module))
         unwrapped_model.cancel_gradients_last_layer(args.curr_iteration)
@@ -476,7 +475,7 @@ def train_step(forward_step_func, data_iterator,
     update_successful, grad_norm, num_zeros_in_grad = optimizer.step()
     timers('optimizer').stop()

-    if args.vision_pretraining_type == "dino":
+    if args.vision_pretraining and args.vision_pretraining_type == "dino":
         unwrapped_model = unwrap_model(model[0], (torchDDP, LocalDDP, Float16Module))
         unwrapped_model.update_momentum(args.curr_iteration)
@@ -804,8 +803,8 @@ def evaluate(forward_step_func,
     """Evaluation."""
     args = get_args()

-    if args.vision_pretraining_type == "dino":
-        args.knn_features = compute_feature_bank(model)
+    if args.vision_pretraining and args.vision_pretraining_type == "dino":
+        compute_feature_bank(model)

     # Turn on evaluation mode which disables dropout.
     for model_module in model:
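Why the stronger guard: --vision-pretraining-type carries a default ('classify') on every run, so the attribute alone cannot distinguish a vision pretraining job from any other; the new boolean makes the DINO hooks opt-in. A toy illustration (the Args class is a hypothetical stand-in for Megatron's parsed namespace):

    class Args:
        pass

    args = Args()
    args.vision_pretraining = False        # e.g. a GPT run; flag never passed
    args.vision_pretraining_type = 'dino'  # stale or accidental setting

    # The old condition would fire here; the new one does not.
    assert (args.vision_pretraining_type == 'dino') is True
    assert (args.vision_pretraining and args.vision_pretraining_type == 'dino') is False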
pretrain_vision_classify.py

@@ -112,5 +112,5 @@ if __name__ == "__main__":
              model_provider,
              ModelType.encoder_or_decoder,
              forward_step,
-             args_defaults={'dataloader_type': 'cyclic'}
+             args_defaults={'dataloader_type': 'cyclic', 'vision_pretraining': True}
     )
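All three vision entry points get the same one-line change, so users of these scripts never pass --vision-pretraining by hand; the preset rides in through pretrain()'s args_defaults dict. The call shape, condensed from this diff (the first argument's name is assumed from Megatron's other entry scripts, which the hunk does not show):

    pretrain(train_valid_test_datasets_provider, model_provider,
             ModelType.encoder_or_decoder, forward_step,
             args_defaults={'dataloader_type': 'cyclic',
                            'vision_pretraining': True})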
pretrain_vision_dino.py

@@ -13,8 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Pretrain VIT"""
-
 import torch
 import torch.nn.functional as F
-import torch.nn as nn
@@ -24,7 +22,7 @@ from functools import partial
 from megatron import get_args, get_timers, mpu, print_rank_0
 from megatron.data.vit_dataset import build_train_valid_datasets
 from megatron.model.vision.dino import DINOPretrainModel
-from megatron.model.vision.knn_monitor import knn_predict
+from megatron.model.vision.knn_monitor import knn_predict, get_feature_bank
 from megatron.training import pretrain
 from megatron.utils import average_losses_across_data_parallel_group, unwrap_model
 from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
@@ -34,7 +32,6 @@ from megatron.model import ModelType
 def model_provider(pre_process=True, post_process=True):
     """Build the model."""
-
     print_rank_0("building VIT model ...")
     return DINOPretrainModel(pre_process=pre_process, post_process=post_process)

 def get_batch(data_iterator):
@@ -65,7 +62,7 @@ def loss_func(model, labels, output_tensor, collect_data=False):
         return loss, {"loss": averaged_loss[0]}
     else:
         _, teacher_feature = output_tensor
-        feature_bank, feature_labels, classes = args.knn_features
+        feature_bank, feature_labels, classes = get_feature_bank()
         feature = F.normalize(teacher_feature.float(), dim=1)

         knn_accs = []
@@ -119,6 +116,6 @@ if __name__ == "__main__":
              model_provider,
              ModelType.encoder_or_decoder,
              forward_step,
-             args_defaults={'dataloader_type': 'cyclic'}
+             args_defaults={'dataloader_type': 'cyclic', 'vision_pretraining': True}
     )
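On the consumer side, the loss path now fetches the cached bank lazily instead of reading args.knn_features. A hedged sketch of that flow; the knn_predict call shape and the k/t values are assumptions, not taken from this diff:

    import torch.nn.functional as F
    from megatron.model.vision.knn_monitor import knn_predict, get_feature_bank

    def knn_monitor_step(teacher_feature, k=20, t=0.07):
        feature_bank, feature_labels, classes = get_feature_bank()  # cached by evaluate()
        feature = F.normalize(teacher_feature.float(), dim=1)
        # call shape assumed; see knn_monitor.py for the real signature
        return knn_predict(feature, feature_bank, feature_labels, classes, k, t)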
pretrain_vision_inpaint.py

@@ -146,5 +146,5 @@ if __name__ == "__main__":
              ModelType.encoder_or_decoder,
              forward_step,
              process_non_loss_data,
-             args_defaults={'dataloader_type': 'cyclic'}
+             args_defaults={'dataloader_type': 'cyclic', 'vision_pretraining': True}
     )