Commit 5fb0ff9a authored by Lawrence McAfee

Merge branch 'main' into lmcafee/distrib-opt

parents 862d70fc 53f3efc4
@@ -49,6 +49,7 @@ from .initialize import get_virtual_pipeline_model_parallel_rank, set_virtual_pi
 from .initialize import initialize_model_parallel
 from .initialize import model_parallel_is_initialized
+from .layers import LinearWithGradAccumulationAndAsyncAllreduce
 from .layers import ColumnParallelLinear
 from .layers import RowParallelLinear
 from .layers import VocabParallelEmbedding
...
@@ -323,20 +323,44 @@ def get_num_layers(args, is_encoder_and_decoder_model):
     if get_pipeline_model_parallel_world_size() > 1:
         if is_encoder_and_decoder_model:
             assert args.pipeline_model_parallel_split_rank is not None
-            num_ranks_in_encoder = args.pipeline_model_parallel_split_rank
-            num_ranks_in_decoder = get_pipeline_model_parallel_world_size() - num_ranks_in_encoder
+
+            # When a standalone embedding stage is used, a rank is taken from
+            # the encoder's ranks, to be used for the encoder's embedding
+            # layer. This way, the rank referenced by the 'split rank' remains
+            # the same whether or not a standalone embedding stage is used.
+            num_ranks_in_encoder = (
+                args.pipeline_model_parallel_split_rank - 1
+                if args.standalone_embedding_stage else
+                args.pipeline_model_parallel_split_rank
+            )
+            num_ranks_in_decoder = args.transformer_pipeline_model_parallel_size - num_ranks_in_encoder
             assert args.num_layers % num_ranks_in_encoder == 0, \
-                'num_layers must be divisible by number of ranks given to encoder'
+                'num_layers (%d) must be divisible by number of ranks given to encoder (%d)' % (args.num_layers, num_ranks_in_encoder)
             assert args.num_layers % num_ranks_in_decoder == 0, \
-                'num_layers must be divisible by number of ranks given to decoder'
+                'num_layers (%d) must be divisible by number of ranks given to decoder (%d)' % (args.num_layers, num_ranks_in_decoder)
             if is_pipeline_stage_before_split():
-                num_layers = args.num_layers // num_ranks_in_encoder
+                num_layers = (
+                    0
+                    if args.standalone_embedding_stage
+                    and get_pipeline_model_parallel_rank() == 0 else
+                    args.num_layers // num_ranks_in_encoder
+                )
             else:
                 num_layers = args.num_layers // num_ranks_in_decoder
         else:
-            assert args.num_layers % get_pipeline_model_parallel_world_size() == 0, \
-                'num_layers must be divisible by pipeline_model_parallel_size'
-            num_layers = args.num_layers // get_pipeline_model_parallel_world_size()
+            assert args.num_layers % args.transformer_pipeline_model_parallel_size == 0, \
+                'num_layers must be divisible by transformer_pipeline_model_parallel_size'
+
+            # When a standalone embedding stage is used, all transformer layers
+            # are divided among pipeline rank >= 1, while on pipeline rank 0,
+            # ranks either contain the input embedding layer (virtual pp rank 0),
+            # or no layers at all (virtual pp rank >= 1).
+            num_layers = (
+                0
+                if args.standalone_embedding_stage
+                and get_pipeline_model_parallel_rank() == 0 else
+                args.num_layers // args.transformer_pipeline_model_parallel_size
+            )
    else:
        num_layers = args.num_layers
    return num_layers
...
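
For intuition, here is a minimal standalone sketch of the layer-split arithmetic above, in plain Python. `layers_on_rank` and its example values are illustrative stand-ins, not Megatron's real API:

def layers_on_rank(rank, num_layers=24, split_rank=2, world_size=6,
                   standalone_embedding_stage=False):
    # With a standalone embedding stage, rank 0 holds only the embedding,
    # and one rank is subtracted from the encoder's share of the pipeline.
    transformer_world_size = world_size - (1 if standalone_embedding_stage else 0)
    num_ranks_in_encoder = split_rank - (1 if standalone_embedding_stage else 0)
    num_ranks_in_decoder = transformer_world_size - num_ranks_in_encoder
    if standalone_embedding_stage and rank == 0:
        return 0                                   # embedding-only stage
    if rank < split_rank:                          # encoder side of the split
        return num_layers // num_ranks_in_encoder
    return num_layers // num_ranks_in_decoder      # decoder side

print([layers_on_rank(r) for r in range(6)])
# [12, 12, 6, 6, 6, 6] -> 24 encoder layers over 2 ranks, 24 decoder layers over 4
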
@@ -199,15 +199,18 @@ class VocabParallelEmbedding(torch.nn.Module):
         return output


-class ColumnParallelLinearWithAsyncAllreduce(torch.autograd.Function):
+class LinearWithGradAccumulationAndAsyncAllreduce(torch.autograd.Function):
     """
-    Column-parallel linear layer execution with asynchronous all-reduce
-    execution in backprop.
+    Linear layer execution with asynchronous all-reduce and gradient accumulation
+    fusion in backprop.
     """
     @staticmethod
-    def forward(ctx, input, weight, bias):
+    def forward(ctx, input, weight, bias, gradient_accumulation_fusion,
+                async_grad_allreduce):
         ctx.save_for_backward(input, weight)
         ctx.use_bias = bias is not None
+        ctx.gradient_accumulation_fusion = gradient_accumulation_fusion
+        ctx.async_grad_allreduce = async_grad_allreduce
         output = torch.matmul(input, weight.t())
         if bias is not None:
             output = output + bias
@@ -215,19 +218,32 @@ class ColumnParallelLinearWithAsyncAllreduce(torch.autograd.Function):
     @staticmethod
     def backward(ctx, grad_output):
+        import fused_dense_cuda
         input, weight = ctx.saved_tensors
         use_bias = ctx.use_bias
         grad_input = grad_output.matmul(weight)
-        # Asyncronous all-reduce
-        handle = torch.distributed.all_reduce(
-            grad_input, group=get_tensor_model_parallel_group(), async_op=True)
-        # Delay the start of weight gradient computation shortly (3us) to have
-        # all-reduce scheduled first and have GPU resources allocated
-        _ = torch.empty(1, device=grad_output.device) + 1
-        grad_weight = grad_output.t().matmul(input)
+
+        # Convert the tensor shapes to 2D for execution compatibility
+        grad_output = grad_output.view(grad_output.shape[0] * grad_output.shape[1],
+                                       grad_output.shape[2])
+        input = input.view(input.shape[0] * input.shape[1], input.shape[2])
+
+        if ctx.async_grad_allreduce:
+            # Asynchronous all-reduce
+            handle = torch.distributed.all_reduce(
+                grad_input, group=get_tensor_model_parallel_group(), async_op=True)
+            # Delay the start of weight gradient computation shortly (3us) to have
+            # all-reduce scheduled first and have GPU resources allocated
+            _ = torch.empty(1, device=grad_output.device) + 1
+
+        if ctx.gradient_accumulation_fusion:
+            fused_dense_cuda.wgrad_gemm_accum_fp32(input, grad_output, weight.main_grad)
+            grad_weight = None
+        else:
+            grad_weight = grad_output.t().matmul(input)
         grad_bias = grad_output.sum(dim=0) if use_bias else None
-        handle.wait()
-        return grad_input, grad_weight, grad_bias
+        if ctx.async_grad_allreduce:
+            handle.wait()
+        return grad_input, grad_weight, grad_bias, None, None
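
As a sanity check on the fusion semantics, here is a hedged pure-PyTorch mock of what `fused_dense_cuda.wgrad_gemm_accum_fp32` is asked to do above: accumulate the fp32 weight gradient directly into a persistent `main_grad` buffer instead of materializing `grad_weight`. The mock itself (`wgrad_gemm_accum_fp32_mock`) is illustrative, single-process, with no all-reduce:

import torch

# Illustrative stand-in for fused_dense_cuda.wgrad_gemm_accum_fp32:
# accumulate grad_output^T @ input into a persistent fp32 buffer.
def wgrad_gemm_accum_fp32_mock(input_2d, grad_output_2d, main_grad):
    main_grad.add_(grad_output_2d.t().matmul(input_2d).float())

torch.manual_seed(0)
weight = torch.nn.Parameter(torch.randn(8, 4))   # [out_features, in_features]
weight.main_grad = torch.zeros(8, 4)             # fp32 accumulation buffer
inp = torch.randn(2, 3, 4)                       # [seq, batch, hidden]

# Same 2D reshape the backward pass above performs.
inp_2d = inp.view(-1, inp.shape[2])
grad_out_2d = torch.randn(2, 3, 8).view(-1, 8)

wgrad_gemm_accum_fp32_mock(inp_2d, grad_out_2d, weight.main_grad)
assert torch.allclose(weight.main_grad, grad_out_2d.t() @ inp_2d)
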
 class ColumnParallelLinear(torch.nn.Module):
@@ -240,7 +256,7 @@ class ColumnParallelLinear(torch.nn.Module):
         input_size: first dimension of matrix A.
         output_size: second dimension of matrix A.
         bias: If true, add bias
-        gather_output: If true, call all-gather on output and make Y avaiable
+        gather_output: If true, call all-gather on output and make Y available
                        to all GPUs, otherwise, every GPU will have its output
                        which is Y_i = XA_i
         init_method: method to initialize weights. Note that bias is always set
@@ -305,29 +321,23 @@ class ColumnParallelLinear(torch.nn.Module):
         else:
             self.register_parameter('bias', None)

         self.async_tensor_model_parallel_allreduce = (
-            not args.no_async_tensor_model_parallel_allreduce and
+            args.async_tensor_model_parallel_allreduce and
             world_size > 1)
+        self.gradient_accumulation_fusion = args.gradient_accumulation_fusion

     def forward(self, input_):
         bias = self.bias if not self.skip_bias_add else None

         if self.async_tensor_model_parallel_allreduce:
-            input_shape = input_.shape
-            input_ = input_.view(input_shape[0] * input_shape[1],input_shape[2])
-            # Maxtrix multiply with asynchronouse all-reduce execution
-            output_parallel = ColumnParallelLinearWithAsyncAllreduce.apply(
-                input_, self.weight, bias)
-            output_parallel = output_parallel.view(
-                input_shape[0], input_shape[1], output_parallel.shape[1])
+            input_parallel = input_
         else:
             # Set up backprop all-reduce.
             input_parallel = copy_to_tensor_model_parallel_region(input_)
-
-            # Matrix multiply.
-            output_parallel = F.linear(input_parallel, self.weight, bias)
+        # Matrix multiply.
+        output_parallel = LinearWithGradAccumulationAndAsyncAllreduce.apply(
+            input_parallel, self.weight, bias, self.gradient_accumulation_fusion,
+            self.async_tensor_model_parallel_allreduce)
         if self.gather_output:
             # All-gather across the partitions.
             output = gather_from_tensor_model_parallel_region(output_parallel)
@@ -415,7 +425,7 @@ class RowParallelLinear(torch.nn.Module):
                 self.bias.zero_()
         else:
             self.register_parameter('bias', None)
-
+        self.gradient_accumulation_fusion = args.gradient_accumulation_fusion

     def forward(self, input_):
@@ -425,7 +435,9 @@ class RowParallelLinear(torch.nn.Module):
         else:
             input_parallel = scatter_to_tensor_model_parallel_region(input_)
         # Matrix multiply.
-        output_parallel = F.linear(input_parallel, self.weight)
+        output_parallel = LinearWithGradAccumulationAndAsyncAllreduce.apply(
+            input_parallel, self.weight, None,
+            self.gradient_accumulation_fusion, None)
         # All-reduce across all the partitions.
         output_ = reduce_from_tensor_model_parallel_region(output_parallel)
         if not self.skip_bias_add:
...
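
The refactor routes both ColumnParallelLinear and RowParallelLinear through the same autograd function; the underlying partition math (Y_i = XA_i from the docstring above) can be verified on a single process. A small sketch where "ranks" are just tensor slices, no distributed setup:

import torch

torch.manual_seed(0)
x = torch.randn(5, 16)            # [tokens, hidden]
a = torch.randn(12, 16)           # full weight, [out, in]
ref = x @ a.t()

# Column parallel: split A along its output dim; each "rank" computes Y_i = X A_i,
# and concatenating the partial outputs (the gather_output path) recovers Y.
cols = a.chunk(2, dim=0)
y_col = torch.cat([x @ c.t() for c in cols], dim=1)

# Row parallel: split A along its input dim (and X to match); the partial
# products sum to Y, which is exactly what the all-reduce performs across ranks.
rows = a.chunk(2, dim=1)
xs = x.chunk(2, dim=1)
y_row = sum(xi @ r.t() for xi, r in zip(xs, rows))

assert torch.allclose(ref, y_col, atol=1e-5) and torch.allclose(ref, y_row, atol=1e-5)
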
@@ -87,17 +87,21 @@ def split_tensor_into_1d_equal_chunks(tensor, new_buffer=False):

 def gather_split_1d_tensor(tensor):
     """Opposite of above function, gather values from model parallel ranks."""
-    world_size = get_tensor_model_parallel_world_size()
-    numel = torch.numel(tensor)
-    numel_gathered = world_size * numel
+    numel_gathered = torch.numel(tensor) * \
+        get_tensor_model_parallel_world_size()
     gathered = torch.empty(numel_gathered, dtype=tensor.dtype,
                            device=torch.cuda.current_device(),
                            requires_grad=False)
-    chunks = [gathered[i*numel:(i+1)*numel] for i in range(world_size)]
-    torch.distributed.all_gather(chunks, tensor,
-                                 group=get_tensor_model_parallel_group())
+    # TODO: This API is experimental in pytorch (as of Feb 2022) and
+    # might break in future pytorch releases. We chose this API over
+    # torch.distributed.all_gather for efficiency reasons: it calls the
+    # NCCL all-gather directly, whereas the latter does internal copies
+    # that can potentially cause a slowdown.
+    torch.distributed._all_gather_base(gathered, tensor,
+                                       group=get_tensor_model_parallel_group())
     return gathered


 def _kernel_make_viewless_tensor(inp, requires_grad):
     '''Make a viewless tensor.
...
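
The efficiency claim in the TODO is about buffer handling: `torch.distributed.all_gather` takes a Python list of per-rank tensors and may copy into them, while `_all_gather_base` writes straight into one contiguous output buffer. A hedged sketch of the two call shapes, assuming an NCCL process group launched with torchrun and one GPU per rank:

import torch
import torch.distributed as dist

# Launch with: torchrun --nproc_per_node=<ngpus> this_file.py
dist.init_process_group(backend="nccl")
torch.cuda.set_device(dist.get_rank())
world_size = dist.get_world_size()
local = torch.full((4,), float(dist.get_rank()), device="cuda")

# Old path: all_gather into a list of chunk views of the output buffer.
gathered = torch.empty(world_size * 4, device="cuda")
chunks = [gathered[i * 4:(i + 1) * 4] for i in range(world_size)]
dist.all_gather(chunks, local)

# New path: one contiguous buffer filled by a single NCCL all-gather.
# _all_gather_base is private as of this commit; newer PyTorch exposes the
# same operation as torch.distributed.all_gather_into_tensor.
gathered_base = torch.empty(world_size * 4, device="cuda")
dist._all_gather_base(gathered_base, local)

assert torch.equal(gathered, gathered_base)
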
@@ -34,9 +34,13 @@ def get_forward_backward_func():
     if mpu.get_pipeline_model_parallel_world_size() > 1:
         if args.virtual_pipeline_model_parallel_size is not None:
             forward_backward_func = forward_backward_pipelining_with_interleaving
-            assert get_num_microbatches() % args.pipeline_model_parallel_size == 0, \
-                'number of microbatches is not divisible by pipeline-parallel ' \
-                'size when using interleaved schedule'
+            assert get_num_microbatches() % \
+                args.pipeline_model_parallel_size == 0, \
+                'number of microbatches (%d) is not divisible by pipeline-' \
+                'model-parallel-size (%d) when using interleaved schedule' % (
+                    get_num_microbatches(),
+                    args.pipeline_model_parallel_size,
+                )
         else:
             forward_backward_func = forward_backward_pipelining_without_interleaving
     else:
...
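
The constraint is simple modular arithmetic: under the interleaved schedule, microbatches are consumed in rounds of pipeline_model_parallel_size, so the total must divide evenly. A toy check with illustrative numbers:

pipeline_model_parallel_size = 4
for num_microbatches in (8, 10, 12):
    ok = num_microbatches % pipeline_model_parallel_size == 0
    print(num_microbatches, "ok" if ok else "rejected by the assert above")
# 8 ok / 10 rejected / 12 ok
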
<!-- coding=utf-8-->
<!-- Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.-->
<!---->
<!-- Licensed under the Apache License, Version 2.0 (the "License");-->
<!-- you may not use this file except in compliance with the License.-->
<!-- You may obtain a copy of the License at-->
<!---->
<!-- http://www.apache.org/licenses/LICENSE-2.0-->
<!---->
<!-- Unless required by applicable law or agreed to in writing, software-->
<!-- distributed under the License is distributed on an "AS IS" BASIS,-->
<!-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.-->
<!-- See the License for the specific language governing permissions and-->
<!-- limitations under the License.-->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<title>Megatron</title>
<style>
.wrapper {
max-width: 75%;
margin: auto;
}
h1 {
margin: 3rem 0 1rem 0;
padding: 0;
font-size: 1.5rem;
}
textarea {
width: 100%;
min-height: 300px;
resize: none;
border-radius: 8px;
border: 1px solid #ddd;
padding: 0.5rem;
box-shadow: inset 0 0 0.25rem #ddd;
}
textarea:focus {
outline: none;
border: 1px solid #d0d0d0;
box-shadow: inset 0 0 0.5rem #d0d0d0;
}
#the-count {
float: right;
padding: 0.1rem 0 0 0;
font-size: 0.875rem;
}
/* Chat containers */
.container {
font-family: 'Arial', sans-serif;
font-size: 16px;
border: 2px solid #dedede;
background-color: #f1f1f1;
border-radius: 5px;
padding: 15px;
margin: 10px 0;
}
/* Clear floats */
.container::after {
content: "";
clear: both;
display: table;
}
/* Style images */
.container img {
float: left;
max-width: 60px;
width: 100%;
margin-right: 20px;
border-radius: 50%;
}
</style>
</head>
<body>
<div class="wrapper">
<h1>Prompt Megatron</h1>
<textarea name="prompt" id="prompt" maxlength="1024" placeholder="Add prompt"autofocus></textarea>
<label for="tokens_to_generate">Number tokens to generate (1-1024):</label>
<input type="number" id="tokens_to_generate" name="tokens_to_generate" min="10" max="256", value=32>
<button onclick="submit_query()">Submit</button>
<div id="the-count">
<span id="current">0</span>
<span id="maximum">/ 1000</span>
</div>
<textarea name="response" id="response" maxlength="2048" placeholder="Megatron response..."></textarea>
</div>
<script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script type="text/javascript">
function submit_query() {
$("#response").val("Waiting for Megatron response...");
$.ajax({
url:"api",
type:"PUT",
data:JSON.stringify({prompts: [$("#prompt").val()], tokens_to_generate: parseInt($("#tokens_to_generate").val(),10)}),
contentType:"application/json; charset=utf-8",
dataType:"json",
success: function(data){
data.max_len=35;
$("#response").val(data.text);
}
});
}
$('textarea').keyup(function() {
var characterCount = $(this).val().length,
current = $('#current'),
maximum = $('#maximum'),
theCount = $('#the-count');
current.text(characterCount);
if (characterCount >= 800) {
maximum.css('color', '#8f0001');
current.css('color', '#8f0001');
theCount.css('font-weight','bold');
} else {
maximum.css('color','#666');
theCount.css('font-weight','normal');
}
});
</script>
</body>
</html>
@@ -35,7 +35,10 @@ def generate_and_post_process(model,
                               top_p_sampling=0.0,
                               temperature=1.0,
                               add_BOS=False,
-                              use_eod_token_for_early_termination=True):
+                              use_eod_token_for_early_termination=True,
+                              stop_on_double_eol=False,
+                              stop_on_eol=False,
+                              random_seed=-1):
     """Run inference and post-process outputs, i.e., detokenize,
     move to cpu and convert to list."""
@@ -49,7 +52,10 @@ def generate_and_post_process(model,
                       top_p_sampling=top_p_sampling,
                       temperature=temperature,
                       add_BOS=add_BOS,
-                      use_eod_token_for_early_termination=use_eod_token_for_early_termination)
+                      use_eod_token_for_early_termination=use_eod_token_for_early_termination,
+                      stop_on_double_eol=stop_on_double_eol,
+                      stop_on_eol=stop_on_eol,
+                      random_seed=random_seed)

     # Only post-process on first stage.
     if mpu.is_pipeline_first_stage():
@@ -74,7 +80,10 @@ def generate(model,
              top_p_sampling=0.0,
              temperature=1.0,
              add_BOS=False,
-             use_eod_token_for_early_termination=True):
+             use_eod_token_for_early_termination=True,
+             stop_on_double_eol=False,
+             stop_on_eol=False,
+             random_seed=-1):
     """Given prompts and input parameters, run inference and return:
        tokens: prompts plus the generated tokens.
        lengths: length of the prompt + generations. Note that we can
@@ -87,8 +96,11 @@ def generate(model,
     values = [tokens_to_generate,
               return_output_log_probs,
               top_k_sampling, top_p_sampling,
-              temperature, add_BOS, use_eod_token_for_early_termination]
-    values_float_tensor = broadcast_float_list(7, float_list=values)
+              temperature, add_BOS, use_eod_token_for_early_termination,
+              stop_on_double_eol,
+              stop_on_eol,
+              random_seed]
+    values_float_tensor = broadcast_float_list(10, float_list=values)
     tokens_to_generate = int(values_float_tensor[0].item())
     return_output_log_probs = bool(values_float_tensor[1].item())
     top_k_sampling = int(values_float_tensor[2].item())
@@ -96,6 +108,12 @@ def generate(model,
     temperature = values_float_tensor[4].item()
     add_BOS = bool(values_float_tensor[5].item())
     use_eod_token_for_early_termination = bool(values_float_tensor[6].item())
+    stop_on_double_eol = bool(values_float_tensor[7].item())
+    stop_on_eol = bool(values_float_tensor[8].item())
+    random_seed = int(values_float_tensor[9].item())
+
+    if random_seed != -1:
+        torch.random.manual_seed(random_seed)

     # Tokenize prompts and get the batch.
     # Note that these tensors are broadcasted to all ranks.
@@ -108,7 +126,7 @@ def generate(model,
     if tokens_to_generate == 0:
         return score_and_return_on_first_stage(
             model, context_tokens_tensor, context_length_tensor)

     # Main inference function.
     # Note that the outputs are available on the first stage.
     return generate_tokens_probs_and_return_on_first_stage(
@@ -117,4 +135,6 @@ def generate(model,
         top_k=top_k_sampling,
         top_p=top_p_sampling,
         temperature=temperature,
-        use_eod_token_for_early_termination=use_eod_token_for_early_termination)
+        use_eod_token_for_early_termination=use_eod_token_for_early_termination,
+        stop_on_double_eol=stop_on_double_eol,
+        stop_on_eol=stop_on_eol)
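
The new flags ride along the existing float-tensor broadcast, so every value must survive a round-trip through float32. A single-process sketch of that encode/decode, mirroring the positional indices above (the packing here is illustrative; `broadcast_float_list` itself is Megatron's):

import torch

# Pack heterogeneous generation settings the way generate() does: one flat
# float tensor, decoded positionally on every rank after the broadcast.
values = [64,                 # tokens_to_generate
          True,               # return_output_log_probs
          0, 0.9,             # top_k_sampling, top_p_sampling
          1.0, False, True,   # temperature, add_BOS, use_eod_token_for_early_termination
          False, False,       # stop_on_double_eol, stop_on_eol
          -1]                 # random_seed (-1 = sentinel for "unseeded")
t = torch.tensor(values, dtype=torch.float32)   # broadcast_float_list would send this

assert int(t[0].item()) == 64
assert bool(t[1].item()) is True
assert int(t[9].item()) == -1   # the sentinel survives the float round-trip
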
@@ -96,7 +96,10 @@ def generate_tokens_probs_and_return_on_first_stage(
         return_output_log_probs=False,
         top_k=0, top_p=0.0,
         temperature=1.0,
-        use_eod_token_for_early_termination=True):
+        use_eod_token_for_early_termination=True,
+        stop_on_double_eol=False,
+        stop_on_eol=False
+        ):
     """Main token generation function.

     Arguments:
         model: no interleaving is supported.
@@ -130,6 +133,10 @@ def generate_tokens_probs_and_return_on_first_stage(
     min_prompt_length = lengths.min().item()
     max_sequence_length = tokens.size(1)
     max_sequence_length = min(max_sequence_length, args.max_position_embeddings)
+
+    # If the context is too big, this happens
+    if min_prompt_length >= max_sequence_length:
+        raise ValueError("context length + tokens_to_generate too large")

     # forward step.
     forward_step = ForwardStep(model, batch_size, max_sequence_length)
@@ -227,8 +234,20 @@ def generate_tokens_probs_and_return_on_first_stage(
             # Check if all the sequences have hit the termination_id.
             done = None
             if mpu.is_pipeline_last_stage():
-                done_token = (new_sample == termination_id).byte() & \
-                    started.byte()
+                # TODO(rprenger): These stopping methods are tokenizer dependent;
+                # instead, tokenization should happen in the inference loop so
+                # stop sequences can be used.
+                if stop_on_double_eol:
+                    hit_double_eol = (new_sample == 628).byte() & started.byte()
+                    hit_two_eols = (new_sample == 198).byte() & (tokens[:, context_length-1] == 198).byte() & started.byte()
+                    done_token = hit_double_eol | hit_two_eols
+                elif stop_on_eol:
+                    hit_double_eol = (new_sample == 628).byte() & started.byte()
+                    hit_eol = (new_sample == 198).byte() & started.byte()
+                    done_token = hit_double_eol | hit_eol
+                else:
+                    done_token = (new_sample == termination_id).byte() & \
+                        started.byte()

                 just_finished = (done_token & ~is_generation_done).bool()
                 generated_sequence_lengths[just_finished.view(-1)] = \
                     context_length + 1
...
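
The hard-coded ids 628 and 198 are GPT-2 BPE tokens for '\n\n' and '\n' (hence the TODO: the logic is tokenizer-dependent). A toy sketch of the double-EOL test on fake samples, batch of 3, no model:

import torch

EOL, DOUBLE_EOL = 198, 628                     # GPT-2 BPE ids for '\n' and '\n\n'
new_sample = torch.tensor([628, 198, 42])      # token just generated, per sequence
prev_token = torch.tensor([10, 198, 198])      # token at the previous position
started = torch.tensor([1, 1, 1]).byte()       # all sequences past their prompt

hit_double_eol = (new_sample == DOUBLE_EOL).byte() & started
hit_two_eols = (new_sample == EOL).byte() & (prev_token == EOL).byte() & started
done_token = hit_double_eol | hit_two_eols
print(done_token)   # tensor([1, 1, 0]) -> the first two sequences stop
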
@@ -36,9 +36,6 @@ class MegatronGenerate(Resource):
     def put(self):
         args = get_args()
-        print("request IP: " + str(request.remote_addr))
-        print(json.dumps(request.get_json()),flush=True)
-        print("current time: ", datetime.datetime.now())

         if not "prompts" in request.get_json():
             return "prompts argument required", 400
@@ -101,20 +98,60 @@ class MegatronGenerate(Resource):
             add_BOS = request.get_json()["add_BOS"]
             if not isinstance(add_BOS, bool):
                 return "add_BOS must be a boolean value"
+
+        if any([len(prompt) == 0 for prompt in prompts]) and not add_BOS:
+            return "Empty prompts require add_BOS=true"
+
+        stop_on_double_eol = False
+        if "stop_on_double_eol" in request.get_json():
+            stop_on_double_eol = request.get_json()["stop_on_double_eol"]
+            if not isinstance(stop_on_double_eol, bool):
+                return "stop_on_double_eol must be a boolean value"
+
+        stop_on_eol = False
+        if "stop_on_eol" in request.get_json():
+            stop_on_eol = request.get_json()["stop_on_eol"]
+            if not isinstance(stop_on_eol, bool):
+                return "stop_on_eol must be a boolean value"
+
+        random_seed = -1
+        if "random_seed" in request.get_json():
+            random_seed = request.get_json()["random_seed"]
+            if not isinstance(random_seed, int):
+                return "random_seed must be integer"
+            if random_seed < 0:
+                return "random_seed must be a positive integer"
+
+        no_log = False
+        if "no_log" in request.get_json():
+            no_log = request.get_json()["no_log"]
+            if not isinstance(no_log, bool):
+                return "no_log must be a boolean value"
+
         with lock:  # Need to get lock to keep multiple threads from hitting code
+            if not no_log:
+                print("request IP: " + str(request.remote_addr))
+                print(json.dumps(request.get_json()),flush=True)
+                print("start time: ", datetime.datetime.now())
             MegatronGenerate.send_do_generate()  # Tell other ranks we're doing generate
-            response, response_seg, response_logprobs, _ = \
-                generate_and_post_process(
-                self.model,
-                prompts=prompts,
-                tokens_to_generate=tokens_to_generate,
-                return_output_log_probs=logprobs,
-                top_k_sampling=top_k,
-                top_p_sampling=top_p,
-                temperature=temperature,
-                add_BOS=add_BOS,
-                use_eod_token_for_early_termination=True)
+            try:
+                response, response_seg, response_logprobs, _ = \
+                    generate_and_post_process(
+                        self.model,
+                        prompts=prompts,
+                        tokens_to_generate=tokens_to_generate,
+                        return_output_log_probs=logprobs,
+                        top_k_sampling=top_k,
+                        top_p_sampling=top_p,
+                        temperature=temperature,
+                        add_BOS=add_BOS,
+                        use_eod_token_for_early_termination=True,
+                        stop_on_double_eol=stop_on_double_eol,
+                        stop_on_eol=stop_on_eol,
+                        random_seed=random_seed)
+            except ValueError as ve:
+                return "Length of prompt + tokens_to_generate longer than allowed"
+            print("end time: ", datetime.datetime.now())

         return jsonify({"text": response,
                         "segments": response_seg,
...
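
A hedged client-side example of the extended PUT payload, using Python requests. The host and port are assumptions for a locally running instance of the text generation server; adjust to wherever it is actually serving:

import requests

resp = requests.put(
    "http://localhost:5000/api",   # assumed local endpoint
    json={
        "prompts": ["Megatron-LM is"],
        "tokens_to_generate": 32,
        "stop_on_double_eol": True,   # stop at a blank line instead of EOD
        "random_seed": 1234,          # reproducible sampling (must be >= 0)
        "no_log": True,               # suppress server-side request logging
    },
)
print(resp.json()["text"])
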
@@ -21,7 +21,6 @@ import sys
 import time
 # The earliest we can measure the start time.
 _TRAIN_START_TIME = time.time()
-
 import torch
 from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
@@ -51,6 +50,7 @@ from megatron.data.data_samplers import build_pretraining_data_loader
 from megatron.utils import calc_params_l2_norm
 from megatron.schedules import get_forward_backward_func
 from megatron.utils import report_memory
+from megatron.model.vision.knn_monitor import compute_feature_bank

 # >>>
 from lutil import pax
@@ -440,6 +440,12 @@ def train_step(forward_step_func, data_iterator,
     optimizer.reduce_model_grads(args, timers)
     # <<<

+    if args.vision_pretraining and args.vision_pretraining_type == "dino":
+        unwrapped_model = unwrap_model(model[0],
+                                       (torchDDP, LocalDDP, Float16Module))
+        unwrapped_model.cancel_gradients_last_layer(args.curr_iteration)
+
     # Update parameters.
     timers('optimizer').start()
     update_successful, grad_norm, num_zeros_in_grad = optimizer.step(args, timers, ITERATION)
@@ -454,6 +460,11 @@ def train_step(forward_step_func, data_iterator,
     optimizer.debug_model(ITERATION, "after gather params.", 0)
     # <<<

+    if args.vision_pretraining and args.vision_pretraining_type == "dino":
+        unwrapped_model = unwrap_model(model[0],
+                                       (torchDDP, LocalDDP, Float16Module))
+        unwrapped_model.update_momentum(args.curr_iteration)
+
     # Update learning rate.
     if update_successful:
         increment = get_num_microbatches() * \
@@ -688,6 +699,7 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
     report_memory_flag = True
     while iteration < args.train_iters:
         update_num_microbatches(args.consumed_train_samples)
+        args.curr_iteration = iteration
         loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \
             train_step(forward_step_func,
                        train_data_iterator,
@@ -780,6 +792,9 @@ def evaluate(forward_step_func,
     """Evaluation."""
     args = get_args()

+    if args.vision_pretraining and args.vision_pretraining_type == "dino":
+        compute_feature_bank(model)
+
     # Turn on evaluation mode which disables dropout.
     for model_module in model:
         model_module.eval()
@@ -938,7 +953,6 @@ def build_train_valid_test_data_iterators(
         args.do_valid = flags[1].item()
         args.do_test = flags[2].item()
-
     # Build iterators.
     dl_type = args.dataloader_type
     assert dl_type in ['single', 'cyclic']
...
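
The two DINO hooks bracket the optimizer step: gradients to the student's last layer are cancelled before the step (typically during an initial warm-up period in DINO), and afterwards the teacher is updated as an exponential moving average of the student. A minimal sketch of that momentum update under those assumptions; `update_momentum_mock` is an illustrative stand-in for the real `update_momentum` in megatron/model/vision/dino.py:

import torch

@torch.no_grad()
def update_momentum_mock(student, teacher, momentum=0.996):
    # Teacher weights become an EMA of the student's weights.
    for p_s, p_t in zip(student.parameters(), teacher.parameters()):
        p_t.mul_(momentum).add_(p_s, alpha=1.0 - momentum)

student = torch.nn.Linear(4, 4)
teacher = torch.nn.Linear(4, 4)
update_momentum_mock(student, teacher)
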
@@ -22,20 +22,32 @@ from megatron import get_args, get_timers, mpu, print_rank_0
 from megatron.data.vit_dataset import build_train_valid_datasets
 from megatron.model import ModelType
 from megatron.model.vision.classification import VitClassificationModel
+from megatron.model.vision.classification import MitClassificationModel
 from megatron.training import pretrain
 from megatron.utils import average_losses_across_data_parallel_group


 def model_provider(pre_process=True, post_process=True):
     """Build the model."""
-    print_rank_0("building VIT model ...")
     args = get_args()
-    model = VitClassificationModel(num_classes=args.num_classes,
-                                   pre_process=pre_process,
-                                   post_process=post_process)
+
+    if args.vision_backbone_type == 'vit':
+        print_rank_0("building VIT model ...")
+        model = VitClassificationModel(num_classes=args.num_classes,
+                                       pre_process=pre_process,
+                                       post_process=post_process)
+    elif args.vision_backbone_type == 'mit':
+        print_rank_0("building MIT model ...")
+        model = MitClassificationModel(num_classes=args.num_classes,
+                                       pre_process=pre_process,
+                                       post_process=post_process)
+    else:
+        raise Exception('{} vision backbone is not supported.'.format(
+                        args.vision_backbone_type))
     return model


 def get_batch(data_iterator):
     """Build the batch."""
     data = next(data_iterator)
@@ -46,6 +58,7 @@ def get_batch(data_iterator):
     return images, labels

+
 def loss_func(labels, output_tensor):
     logits = output_tensor.contiguous().float()
     loss = F.cross_entropy(logits, labels)
@@ -58,6 +71,7 @@ def loss_func(labels, output_tensor):
     return loss, {"loss": averaged_loss[0], "accuracy": averaged_loss[1]}

+
 def forward_step(data_iterator, model):
     """Forward step."""
     timers = get_timers()
@@ -98,5 +112,5 @@ if __name__ == "__main__":
         model_provider,
         ModelType.encoder_or_decoder,
         forward_step,
-        args_defaults={'dataloader_type': 'cyclic'}
+        args_defaults={'dataloader_type': 'cyclic', 'vision_pretraining': True}
     )
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
import torch.distributed as dist
from functools import partial
from megatron import get_args, get_timers, mpu, print_rank_0
from megatron.data.vit_dataset import build_train_valid_datasets
from megatron.model.vision.dino import DINOPretrainModel
from megatron.model.vision.knn_monitor import knn_predict, get_feature_bank
from megatron.training import pretrain
from megatron.utils import average_losses_across_data_parallel_group, unwrap_model
from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
from megatron.model import DistributedDataParallel as LocalDDP
from megatron.model import Float16Module
from megatron.model import ModelType


def model_provider(pre_process=True, post_process=True):
    """Build the model."""
    return DINOPretrainModel(pre_process=pre_process, post_process=post_process)


def get_batch(data_iterator):
    """Build the batch."""
    data = next(data_iterator)

    # only data parallelism; no need for broadcast
    if isinstance(data[0], list):
        images = [aug.cuda() for aug in data[0]]
    else:
        images = data[0].cuda()
    labels = data[1].cuda()

    return images, labels


def loss_func(model, labels, output_tensor, collect_data=False):
    args = get_args()
    model = unwrap_model(
        model,
        (torchDDP, LocalDDP, Float16Module)
    )

    if model.training:
        student_output, teacher_output = output_tensor
        loss = model.dino_loss(student_output, teacher_output, args.curr_iteration)
        averaged_loss = average_losses_across_data_parallel_group([loss])
        return loss, {"loss": averaged_loss[0]}
    else:
        _, teacher_feature = output_tensor
        feature_bank, feature_labels, classes = get_feature_bank()
        feature = F.normalize(teacher_feature.float(), dim=1)

        knn_accs = []
        for k in [10, 20, 100, 200]:
            pred_labels = knn_predict(feature, feature_bank,
                                      feature_labels, classes, k, 0.07)
            knn_acc = (pred_labels[:, 0] == labels).float().mean()
            knn_accs.append(knn_acc)

        averaged_loss = average_losses_across_data_parallel_group(knn_accs)
        return 0, {"knn_acc_10": averaged_loss[0],
                   "knn_acc_20": averaged_loss[1],
                   "knn_acc_100": averaged_loss[2],
                   "knn_acc_200": averaged_loss[3]}


def forward_step(data_iterator, model):
    """Forward step."""
    timers = get_timers()

    # Get the batch.
    timers("batch-generator").start()
    (
        images,
        labels,
    ) = get_batch(data_iterator)
    timers("batch-generator").stop()

    return model(images), partial(loss_func, model, labels)


def train_valid_test_datasets_provider(train_val_test_num_samples):
    """Build train, valid, and test datasets."""
    args = get_args()

    print_rank_0(
        "> building train, validation, and test datasets for VIT ..."
    )
    train_ds, valid_ds = build_train_valid_datasets(
        data_path=args.data_path,
        image_size=(args.img_h, args.img_w)
    )
    print_rank_0("> finished creating VIT datasets ...")

    return train_ds, valid_ds, None


if __name__ == "__main__":
    pretrain(
        train_valid_test_datasets_provider,
        model_provider,
        ModelType.encoder_or_decoder,
        forward_step,
        args_defaults={'dataloader_type': 'cyclic', 'vision_pretraining': True}
    )
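
For context on the eval branch above: kNN monitoring classifies a feature by cosine similarity against a memory bank of training features and soft-votes over the top-k neighbors. A self-contained sketch of that prediction rule on toy data; `knn_predict_mock` is an illustrative stand-in for the real `knn_predict` in megatron/model/vision/knn_monitor.py:

import torch
import torch.nn.functional as F

def knn_predict_mock(feature, feature_bank, feature_labels, classes, k, temperature):
    # feature: [batch, dim] (normalized); feature_bank: [dim, bank_size].
    sim = feature @ feature_bank                       # cosine similarities
    sim_topk, idx = sim.topk(k, dim=-1)
    weights = (sim_topk / temperature).exp()           # soft voting weights
    votes = torch.zeros(feature.size(0), classes)
    labels_topk = feature_labels[idx]                  # [batch, k]
    votes.scatter_add_(1, labels_topk, weights)
    return votes.argsort(dim=-1, descending=True)      # ranked class predictions

bank = F.normalize(torch.randn(16, 100), dim=0)        # 100 bank entries, dim 16
bank_labels = torch.randint(0, 10, (100,))
query = F.normalize(torch.randn(8, 16), dim=1)
pred = knn_predict_mock(query, bank, bank_labels, classes=10, k=5, temperature=0.07)
print(pred[:, 0])   # top-1 predicted class per query
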
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Pretrain VIT"""

import torch
import torch.nn.functional as F
from functools import partial
from megatron import get_args, get_timers, mpu, print_rank_0, print_rank_last
from megatron.data.vit_dataset import build_train_valid_datasets
from megatron.model.vision.inpainting import VitInpaintingModel
from megatron.model.vision.inpainting import MitInpaintingModel
from megatron.training import pretrain
from megatron.utils import average_losses_across_data_parallel_group
from tasks.vision.metrics import SSIM, PSNR
from megatron.model import ModelType


def model_provider(pre_process=True, post_process=True):
    """Build the model."""
    args = get_args()

    if args.vision_backbone_type == 'vit':
        model = VitInpaintingModel(pre_process=pre_process,
                                   post_process=post_process)
    elif args.vision_backbone_type == 'mit':
        model = MitInpaintingModel(pre_process=pre_process,
                                   post_process=post_process)
    else:
        raise Exception('{} vision backbone is not supported.'.format(
                        args.vision_backbone_type))
    return model


def get_batch(data_iterator):
    """Build the batch."""
    data = next(data_iterator)

    # only data parallelism; no need for broadcast
    images = data[0][0].cuda()
    masks = data[0][1].cuda()
    return images, masks


def loss_func(images, masks, masked_images, outputs, collect_data=False):
    outputs = outputs.contiguous().float()
    masks_flip = 1 - masks
    flip_masked_outputs = outputs.masked_fill(masks_flip.bool(), 0)
    flip_masked_images = images.masked_fill(masks_flip.bool(), 0)
    ssim_fun = SSIM()
    psnr_fun = PSNR()

    if not collect_data:
        mask_count = torch.count_nonzero(masks)
        loss = F.mse_loss(
            flip_masked_outputs,
            flip_masked_images.float(),
            reduction="sum"
        )
        loss = loss / mask_count
        ssim = ssim_fun(flip_masked_outputs, flip_masked_images.float())
        psnr = psnr_fun(flip_masked_outputs, flip_masked_images.float())
        averaged_loss = average_losses_across_data_parallel_group(
            [loss, psnr, ssim]
        )
        return loss, {"loss": averaged_loss[0],
                      "psnr": averaged_loss[1],
                      "ssim": averaged_loss[2]}
    else:
        synth_images = masked_images.float() + flip_masked_outputs
        ssim = ssim_fun(synth_images, images.float())
        psnr = psnr_fun(synth_images, images.float())
        return torch.cat((images, masked_images, synth_images), dim=2), ssim, psnr


def forward_step(data_iterator, model):
    """Forward step."""
    timers = get_timers()

    # Get the batch.
    timers("batch-generator").start()
    (
        images,
        masks,
    ) = get_batch(data_iterator)
    timers("batch-generator").stop()

    masked_images = images.masked_fill(masks.bool(), 0)
    outputs = model(masked_images)

    # Forward mode
    return outputs, partial(loss_func, images, masks, masked_images)


def process_non_loss_data(data, iteration, writer):
    psnr_sum = 0
    ssim_sum = 0
    for (output_tb, ssim, psnr) in data:
        output_tb[output_tb < 0] = 0
        output_tb[output_tb > 1] = 1
        writer.add_images("gt-input-output-vald", output_tb,
                          global_step=iteration, walltime=None,
                          dataformats='NCHW')
        psnr_sum = psnr_sum + psnr.item()
        ssim_sum = ssim_sum + ssim.item()
    psnr = psnr_sum / len(data)
    ssim = ssim_sum / len(data)
    writer.add_scalar('PSNR generate value-validation', psnr, iteration)
    writer.add_scalar('SSIM generate value-validation', ssim, iteration)


def train_valid_test_datasets_provider(train_val_test_num_samples):
    """Build train, valid, and test datasets."""
    args = get_args()

    print_rank_0(
        "> building train, validation, and test datasets for VIT ..."
    )
    train_ds, valid_ds = build_train_valid_datasets(
        data_path=args.data_path,
        image_size=(args.img_h, args.img_w)
    )
    print_rank_0("> finished creating VIT datasets ...")

    return train_ds, valid_ds, None


if __name__ == "__main__":
    pretrain(
        train_valid_test_datasets_provider,
        model_provider,
        ModelType.encoder_or_decoder,
        forward_step,
        process_non_loss_data,
        args_defaults={'dataloader_type': 'cyclic', 'vision_pretraining': True}
    )
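
The inpainting loss above is MSE summed over the masked (inpainted) pixels only, then normalized by the mask count rather than by all elements. A toy check of that normalization, with random tensors standing in for images:

import torch
import torch.nn.functional as F

torch.manual_seed(0)
images = torch.rand(2, 3, 8, 8)
outputs = torch.rand(2, 3, 8, 8)
masks = (torch.rand(2, 3, 8, 8) > 0.5).float()   # 1 = hole to inpaint

masks_flip = 1 - masks
# Zero out everything except the masked region, exactly as above.
flip_masked_outputs = outputs.masked_fill(masks_flip.bool(), 0)
flip_masked_images = images.masked_fill(masks_flip.bool(), 0)

loss = F.mse_loss(flip_masked_outputs, flip_masked_images, reduction="sum")
loss = loss / torch.count_nonzero(masks)          # mean over masked pixels only
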
@@ -78,4 +78,7 @@ if __name__ == "__main__":
         choice = torch.cuda.LongTensor(1)
         torch.distributed.broadcast(choice, 0)
         if choice[0].item() == 0:
-            generate_and_post_process(model)
+            try:
+                generate_and_post_process(model)
+            except ValueError as ve:
+                pass