Commit 41276b6c authored by Vijay Korthikanti

Merge branch 'main' into nmt-main

parents a44360ed fc7f4f03
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import datetime
import torch
import json
......
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Megatron timers."""
from abc import ABC
from abc import abstractmethod
import time
import torch
class TimerBase(ABC):

    def __init__(self, name):
        self.name = name

    @abstractmethod
    def start(self, barrier=False):
        pass

    @abstractmethod
    def stop(self, barrier=False):
        pass

    @abstractmethod
    def reset(self):
        pass

    @abstractmethod
    def elapsed(self, reset=True, barrier=False):
        pass


class DummyTimer(TimerBase):

    def __init__(self):
        super().__init__('dummy timer')

    def start(self, barrier=False):
        return

    def stop(self, barrier=False):
        return

    def reset(self):
        return

    def elapsed(self, reset=True, barrier=False):
        raise Exception('dummy timer should not be used to '
                        'calculate elapsed time')
class Timer(TimerBase):
    """
    Note on `barrier`: if this flag is passed, all caller processes will
    wait until they all reach the timing routine. It is up to the user to
    make sure all the ranks in `barrier_group` call it; otherwise it will
    result in a hang.

    Note on `barrier_group`: by default it is set to None, which in torch
    distributed terms means the global communicator.
    """

    def __init__(self, name):
        super().__init__(name)
        self._elapsed = 0.0
        self._started = False
        # Note that None will default to the global process group.
        self._barrier_group = None
        self._start_time = time.time()

    def set_barrier_group(self, barrier_group):
        self._barrier_group = barrier_group

    def start(self, barrier=False):
        """Start the timer."""
        assert not self._started, 'timer has already been started'
        if barrier:
            torch.distributed.barrier(group=self._barrier_group)
        torch.cuda.synchronize()
        self._start_time = time.time()
        self._started = True

    def stop(self, barrier=False):
        """Stop the timer."""
        assert self._started, 'timer is not started'
        if barrier:
            torch.distributed.barrier(group=self._barrier_group)
        torch.cuda.synchronize()
        self._elapsed += (time.time() - self._start_time)
        self._started = False

    def reset(self):
        """Reset timer."""
        self._elapsed = 0.0
        self._started = False

    def elapsed(self, reset=True, barrier=False):
        """Calculate the elapsed time."""
        _started = self._started
        # If timing is in progress, stop it first.
        if self._started:
            self.stop(barrier=barrier)
        # Get the elapsed time.
        _elapsed = self._elapsed
        # Reset the elapsed time.
        if reset:
            self.reset()
        # If timing was in progress, restart it.
        if _started:
            self.start(barrier=barrier)
        return _elapsed
class Timers:
    """Group of timers."""

    def __init__(self, log_level, log_option):
        self._log_level = log_level
        self._log_option = log_option
        self._timers = {}
        self._log_levels = {}
        self._dummy_timer = DummyTimer()
        self._max_log_level = 2

    def __call__(self, name, log_level=None):
        # If the timer already exists and a log level is provided, check
        # that it matches the log level the timer was created with.
        if name in self._timers:
            if log_level is not None:
                assert log_level == self._log_levels[name], \
                    'input log level {} does not match already existing '\
                    'log level {} for {} timer'.format(
                        log_level, self._log_levels[name], name)
            return self._timers[name]
        # If the timer does not exist and no log level is provided,
        # set it to the max log level which is 2.
        if log_level is None:
            log_level = self._max_log_level
        assert log_level <= self._max_log_level, \
            'log level {} is larger than max supported log level {}'.format(
                log_level, self._max_log_level)
        # If the input log level is larger than the one set for the
        # timers class, just ignore it and return a dummy timer.
        if log_level > self._log_level:
            return self._dummy_timer
        # Otherwise, initialize the timer and set the level.
        self._timers[name] = Timer(name)
        self._log_levels[name] = log_level
        return self._timers[name]
    def _get_elapsed_time_all_ranks(self, names, reset, barrier):
        """
        Assumptions:
            - All the ranks call this function.
            - `names` are identical on all ranks.
        If the above assumptions are not met, calling this function will
        result in a hang.
        Arguments:
            - names: list of timer names
            - reset: reset the timer after recording the elapsed time
            - barrier: if set, do a global barrier before time measurements
        """
        # First make sure all the callers are in sync.
        if barrier:
            torch.distributed.barrier()

        world_size = torch.distributed.get_world_size()
        rank = torch.distributed.get_rank()

        # Here we could use a gather on the rank we want to print the
        # timings from; however, there is no gather_base support in
        # pytorch yet. It is simpler to deal with a single tensor, and
        # since we are only gathering a small amount of data, it should
        # be ok to use all-gather instead of gather.
        rank_name_to_time = torch.zeros((world_size, len(names)),
                                        dtype=torch.float,
                                        device=torch.cuda.current_device())
        for i, name in enumerate(names):
            if name in self._timers:
                # Here we don't need to pass the barrier flag as all
                # the processes are already in sync. This avoids the
                # issue of different timers having different barrier
                # groups inside their class.
                rank_name_to_time[rank, i] = self._timers[name].elapsed(
                    reset=reset)

        # See the note above for why we are not using gather.
        torch.distributed._all_gather_base(rank_name_to_time.view(-1),
                                           rank_name_to_time[rank, :].view(-1))

        return rank_name_to_time
    def _get_global_min_max_time(self, names, reset, barrier, normalizer):
        """Report only min and max times across all ranks."""
        rank_name_to_time = self._get_elapsed_time_all_ranks(names, reset,
                                                             barrier)
        name_to_min_max_time = {}
        for i, name in enumerate(names):
            rank_to_time = rank_name_to_time[:, i]
            # Filter out the ones we did not have any timings for.
            rank_to_time = rank_to_time[rank_to_time > 0.0]
            # If the timer exists:
            if rank_to_time.numel() > 0:
                name_to_min_max_time[name] = (
                    rank_to_time.min().item() / normalizer,
                    rank_to_time.max().item() / normalizer)
        return name_to_min_max_time

    def _get_global_min_max_time_string(self, names, reset, barrier,
                                        normalizer, max_only):
        name_to_min_max_time = self._get_global_min_max_time(
            names, reset, barrier, normalizer)
        if not name_to_min_max_time:
            return None
        output_string = '(min, max) time across ranks (ms):'
        for name in name_to_min_max_time:
            min_time, max_time = name_to_min_max_time[name]
            if max_only:
                output_string += '\n {}: {:.2f}'.format(
                    (name+' ').ljust(48, '.'), max_time)
            else:
                output_string += '\n {}: ({:.2f}, {:.2f})'.format(
                    (name+' ').ljust(48, '.'), min_time, max_time)
        return output_string

    def _get_all_ranks_time_string(self, names, reset, barrier, normalizer):
        """Report times across all ranks."""
        rank_name_to_time = self._get_elapsed_time_all_ranks(names, reset,
                                                             barrier)
        output_string = 'times across ranks (ms):'
        no_reported_timing = True
        for i, name in enumerate(names):
            not_yet_found = True
            for rank in range(torch.distributed.get_world_size()):
                if rank_name_to_time[rank, i] > 0:
                    no_reported_timing = False
                    if not_yet_found:
                        not_yet_found = False
                        output_string += '\n {}:'.format(name)
                    output_string += '\n rank {:2d}: {:.2f}'.format(
                        rank, rank_name_to_time[rank, i] / normalizer)
        if no_reported_timing:
            return None
        return output_string
    def log(self, names, rank=None, normalizer=1.0, reset=True, barrier=False):
        """Log a group of timers."""
        # Print.
        assert normalizer > 0.0
        if self._log_option in ['max', 'minmax']:
            max_only = False
            if self._log_option == 'max':
                max_only = True
            output_string = self._get_global_min_max_time_string(
                names, reset, barrier, normalizer/1000.0, max_only)
        elif self._log_option == 'all':
            output_string = self._get_all_ranks_time_string(names,
                                                            reset, barrier,
                                                            normalizer/1000.0)
        else:
            raise Exception('unknown timing log option {}'.format(
                self._log_option))

        # If no input rank is provided, log on last rank.
        if rank is None:
            rank = torch.distributed.get_world_size() - 1
        if rank == torch.distributed.get_rank() and output_string is not None:
            print(output_string, flush=True)

    def write(self, names, writer, iteration, normalizer=1.0,
              reset=False, barrier=False):
        """Write timers to a tensorboard writer.

        Note that we only report the maximum time across ranks to tensorboard.
        """
        # Currently when using add_scalars, torch.utils.add_scalars makes
        # each timer its own run, which pollutes the runs list, so we just
        # add each as a scalar.
        assert normalizer > 0.0
        name_to_min_max_time = self._get_global_min_max_time(
            names, reset, barrier, normalizer)
        if writer is not None:
            for name in name_to_min_max_time:
                _, max_time = name_to_min_max_time[name]
                writer.add_scalar(name + '-time', max_time, iteration)
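For orientation, here is a minimal usage sketch of the timers module above; it is not part of the commit. It assumes the file is importable as `megatron.timers`, that a CUDA device is present (Timer.start and Timer.stop call torch.cuda.synchronize), and that a single-rank NCCL process group is initialized so that Timers.log(), which all-gathers across ranks, can run. The timer names are invented for the example.

# Illustrative sketch only; names like 'step' and the one-rank setup are
# assumptions made for this example, not taken from the commit.
import os
import torch
from megatron.timers import Timers  # assumed module path for the file above

os.environ.setdefault('MASTER_ADDR', 'localhost')
os.environ.setdefault('MASTER_PORT', '29500')
torch.distributed.init_process_group(backend='nccl', rank=0, world_size=1)

# log_level=1: timers requested at level 0 or 1 are real; a level-2 request
# is above the configured level and gets the no-op DummyTimer back.
timers = Timers(log_level=1, log_option='minmax')

timers('step', log_level=0).start(barrier=True)   # real timer, barrier first
timers('batch-generator', log_level=2).start()    # DummyTimer, does nothing
_ = torch.randn(1024, 1024, device='cuda') @ torch.randn(1024, 1024, device='cuda')
timers('batch-generator').stop()                  # still the DummyTimer
timers('step').stop()

# Every rank must call log(); the last rank prints
# '(min, max) time across ranks (ms):'. Names that never got a real timer
# report zero elapsed time and are skipped in the output.
timers.log(['step', 'batch-generator'], barrier=True)

torch.distributed.destroy_process_group()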
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
from .tokenizer import build_tokenizer
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Megatron tokenizers."""
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Pretrain utilities."""
......
@@ -119,23 +106,28 @@ def pretrain(train_valid_test_dataset_provider,
     timers = get_timers()
     # Model, optimizer, and learning rate.
-    timers('model-and-optimizer-setup').start()
-    model, optimizer, opt_param_scheduler = setup_model_and_optimizer(model_provider,
-                                                                      model_type)
+    timers('model-and-optimizer-setup', log_level=0).start(barrier=True)
+    model, optimizer, opt_param_scheduler = setup_model_and_optimizer(
+        model_provider, model_type)
     timers('model-and-optimizer-setup').stop()
     print_datetime('after model, optimizer, and learning rate '
                    'scheduler are built')
     # Data stuff.
-    timers('train/valid/test-data-iterators-setup').start()
+    timers('train/valid/test-data-iterators-setup', log_level=0).start(
+        barrier=True)
     if args.virtual_pipeline_model_parallel_size is not None:
         all_data_iterators = [
-            build_train_valid_test_data_iterators(train_valid_test_dataset_provider)
+            build_train_valid_test_data_iterators(
+                train_valid_test_dataset_provider)
             for _ in range(len(model))
         ]
-        train_data_iterator = [data_iterators[0] for data_iterators in all_data_iterators]
-        valid_data_iterator = [data_iterators[1] for data_iterators in all_data_iterators]
-        test_data_iterator = [data_iterators[2] for data_iterators in all_data_iterators]
+        train_data_iterator = [data_iterators[0]
+                               for data_iterators in all_data_iterators]
+        valid_data_iterator = [data_iterators[1]
+                               for data_iterators in all_data_iterators]
+        test_data_iterator = [data_iterators[2]
+                              for data_iterators in all_data_iterators]
     else:
         train_data_iterator, valid_data_iterator, test_data_iterator \
             = build_train_valid_test_data_iterators(
......
@@ -145,7 +137,8 @@ def pretrain(train_valid_test_dataset_provider,
     # Print setup timing.
     print_rank_0('done with setup ...')
-    timers.log(['model-and-optimizer-setup', 'train/valid/test-data-iterators-setup'])
+    timers.log(['model-and-optimizer-setup',
+                'train/valid/test-data-iterators-setup'], barrier=True)
     print_rank_0('training ...')
     iteration = 0
......
@@ -373,13 +366,9 @@ def setup_model_and_optimizer(model_provider_func,
     if args.load is not None:
         timers = get_timers()
-        # Extra barrier is added to make sure all ranks report the
-        # max time.
-        torch.distributed.barrier()
-        timers('load-checkpoint').start()
+        timers('load-checkpoint', log_level=0).start(barrier=True)
         args.iteration = load_checkpoint(model, optimizer, opt_param_scheduler)
-        torch.distributed.barrier()
-        timers('load-checkpoint').stop()
+        timers('load-checkpoint').stop(barrier=True)
         timers.log(['load-checkpoint'])
     else:
         args.iteration = 0
......
@@ -412,19 +401,21 @@ def train_step(forward_step_func, data_iterator,
     optimizer.zero_grad()
     # Forward pass.
+    timers('forward-backward', log_level=1).start(
+        barrier=args.barrier_with_L1_time)
     forward_backward_func = get_forward_backward_func()
+    fwd_bwd_timers = timers if args.timing_log_level > 1 else None
     losses_reduced = forward_backward_func(
         forward_step_func, data_iterator, model,
-        optimizer, timers, forward_only=False)
+        optimizer, fwd_bwd_timers, forward_only=False)
+    timers('forward-backward').stop()
     # Empty unused memory.
     if args.empty_unused_memory_level >= 1:
         torch.cuda.empty_cache()
     # Reduce gradients.
-    timers('backward-reduce-model-grads').start()
     optimizer.reduce_model_grads(args, timers)
-    timers('backward-reduce-model-grads').stop()
     # Vision gradients.
     if args.vision_pretraining and args.vision_pretraining_type == "dino":
......
@@ -433,15 +424,13 @@
         unwrapped_model.cancel_gradients_last_layer(args.curr_iteration)
     # Update parameters.
-    timers('optimizer').start()
+    timers('optimizer', log_level=1).start(barrier=args.barrier_with_L1_time)
     update_successful, grad_norm, num_zeros_in_grad = optimizer.step(args, timers)
     timers('optimizer').stop()
     # Gather params.
     if update_successful:
-        timers('backward-gather-model-params').start()
         optimizer.gather_model_params(args, timers)
-        timers('backward-gather-model-params').stop()
     # Vision momentum.
     if args.vision_pretraining and args.vision_pretraining_type == "dino":
......
@@ -511,33 +500,32 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
         nan_iters_key, 0) + int(got_nan)
     # Logging.
-    timers_to_log = []
-    def add_to_logging(name):
-        if name in timers.timers:
-            timers_to_log.append(name)
-    add_to_logging('forward-compute')
-    add_to_logging('forward-recv')
-    add_to_logging('forward-send')
-    add_to_logging('forward-backward-send-forward-backward-recv')
-    add_to_logging('backward-compute')
-    add_to_logging('backward-recv')
-    add_to_logging('backward-send')
-    add_to_logging('backward-send-forward-recv')
-    add_to_logging('backward-send-backward-recv')
-    add_to_logging('backward-params-all-reduce')
-    add_to_logging('backward-layernorm-all-reduce')
-    add_to_logging('backward-embedding-all-reduce')
-    add_to_logging('backward-reduce-model-grads')
-    add_to_logging('backward-gather-model-params')
-    add_to_logging('optimizer-copy-to-main-grad')
-    add_to_logging('optimizer-unscale-and-check-inf')
-    add_to_logging('optimizer-clip-main-grad')
-    add_to_logging('optimizer-count-zeros')
-    add_to_logging('optimizer-inner-step')
-    add_to_logging('optimizer-copy-main-to-model-params')
-    add_to_logging('optimizer')
-    add_to_logging('batch-generator')
+    timers_to_log = [
+        'forward-backward',
+        'forward-compute',
+        'backward-compute',
+        'batch-generator',
+        'forward-recv',
+        'forward-send',
+        'backward-recv',
+        'backward-send',
+        'forward-send-forward-recv',
+        'forward-send-backward-recv',
+        'backward-send-forward-recv',
+        'backward-send-backward-recv',
+        'forward-backward-send-forward-backward-recv',
+        'layernorm-grads-all-reduce',
+        'embedding-grads-all-reduce',
+        'grads-all-reduce',
+        'grads-reduce-scatter',
+        'params-all-gather',
+        'optimizer-copy-to-main-grad',
+        'optimizer-unscale-and-check-inf',
+        'optimizer-clip-main-grad',
+        'optimizer-count-zeros',
+        'optimizer-inner-step',
+        'optimizer-copy-main-to-model-params',
+        'optimizer']
     # Calculate batch size.
     batch_size = args.micro_batch_size * args.data_parallel_size * \
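Note: names in this list that are never started at the configured timing log level are harmless to pass to `timers.log()`; `Timers._get_elapsed_time_all_ranks` leaves their entries at zero and the reporting helpers skip zero timings, so they simply do not appear in the output.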
......
@@ -547,8 +535,12 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
         total_loss_dict[skipped_iters_key]
     # Tensorboard values.
-    if writer and (iteration % args.tensorboard_log_interval == 0 ) and \
-       is_last_rank():
+    # Timer requires all the ranks to call.
+    if args.log_timers_to_tensorboard and \
+       (iteration % args.tensorboard_log_interval == 0):
+        timers.write(timers_to_log, writer, iteration,
+                     normalizer=total_iterations)
+    if writer and (iteration % args.tensorboard_log_interval == 0):
         if args.log_learning_rate_to_tensorboard:
             writer.add_scalar('learning-rate', learning_rate, iteration)
             writer.add_scalar('learning-rate vs samples', learning_rate,
......
@@ -581,9 +573,6 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
             writer.add_scalar('params-norm', params_norm, iteration)
             writer.add_scalar('params-norm vs samples', params_norm,
                               args.consumed_train_samples)
-        if args.log_timers_to_tensorboard:
-            timers.write(timers_to_log, writer, iteration,
-                         normalizer=total_iterations)
         if args.log_memory_to_tensorboard:
             mem_stats = torch.cuda.memory_stats()
             writer.add_scalar(
......
@@ -603,7 +592,7 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
         )
     if iteration % args.log_interval == 0:
-        elapsed_time = timers('interval-time').elapsed()
+        elapsed_time = timers('interval-time').elapsed(barrier=True)
         elapsed_time_per_iteration = elapsed_time / total_iterations
         if writer:
             if args.log_timers_to_tensorboard:
......
@@ -653,11 +642,9 @@ def save_checkpoint_and_time(iteration, model, optimizer, opt_param_scheduler):
     timers = get_timers()
     # Extra barrier is added to make sure
     # all ranks report the max time.
-    torch.distributed.barrier()
-    timers('save-checkpoint').start()
+    timers('save-checkpoint', log_level=0).start(barrier=True)
     save_checkpoint(iteration, model, optimizer, opt_param_scheduler)
-    torch.distributed.barrier()
-    timers('save-checkpoint').stop()
+    timers('save-checkpoint').stop(barrier=True)
     timers.log(['save-checkpoint'])
......
@@ -681,7 +668,7 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
     # Iterations.
     iteration = args.iteration
-    timers('interval-time').start()
+    timers('interval-time', log_level=0).start(barrier=True)
     print_datetime('before the start of training step')
     report_memory_flag = True
     while iteration < args.train_iters:
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""General utilities."""
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Pretrain BERT"""
......
@@ -104,7 +91,7 @@ def forward_step(data_iterator, model):
     timers = get_timers()
     # Get the batch.
-    timers('batch-generator').start()
+    timers('batch-generator', log_level=2).start()
     tokens, types, sentence_order, loss_mask, lm_labels, padding_mask = get_batch(
         data_iterator)
     timers('batch-generator').stop()
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Pretrain GPT"""
......
@@ -89,7 +76,7 @@ def forward_step(data_iterator, model):
     timers = get_timers()
     # Get the batch.
-    timers('batch-generator').start()
+    timers('batch-generator', log_level=2).start()
     tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
         data_iterator)
     timers('batch-generator').stop()
......
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Pretrain BERT for Inverse Cloze Task"""
......
@@ -134,7 +121,7 @@ def forward_step(data_iterator, model):
     timers = get_timers()
     # Get the batch.
-    timers('batch-generator').start()
+    timers('batch-generator', log_level=2).start()
     query_tokens, query_mask, \
         context_tokens, context_mask, context_indices = get_ict_batch(data_iterator)
     timers('batch-generator').stop()
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Pretrain T5"""
......
@@ -126,7 +113,7 @@ def forward_step(data_iterator, model):
     timers = get_timers()
     # Get the batch.
-    timers('batch generator').start()
+    timers('batch generator', log_level=2).start()
     tokens_enc, tokens_dec, loss_mask, lm_labels, enc_mask, dec_mask, enc_dec_mask \
         = get_batch(data_iterator)
     timers('batch generator').stop()
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Pretrain VIT"""
......
@@ -77,7 +64,7 @@ def forward_step(data_iterator, model):
     timers = get_timers()
     # Get the batch.
-    timers("batch-generator").start()
+    timers("batch-generator", log_level=2).start()
     (
         images,
         labels,
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import torch
import torch.nn.functional as F
......
@@ -84,7 +71,7 @@ def forward_step(data_iterator, model):
     timers = get_timers()
     # Get the batch.
-    timers("batch-generator").start()
+    timers("batch-generator", log_level=2).start()
     (
         images,
         labels,
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Pretrain VIT"""
......
@@ -91,7 +78,7 @@ def forward_step(data_iterator, model):
     timers = get_timers()
     # Get the batch.
-    timers("batch-generator").start()
+    timers("batch-generator", log_level=2).start()
     (
         images,
         masks,
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
""" Tasks data utility."""
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Evaluation utilities."""
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Finetune utilities."""
......
@@ -67,7 +54,7 @@ def _cross_entropy_forward_step(batch, model):
     timers = get_timers()
     # Get the batch.
-    timers('batch-generator').start()
+    timers('batch-generator', log_level=2).start()
     try:
         batch_ = next(batch)
     except BaseException:
......
@@ -178,7 +165,7 @@ def _train(model, optimizer, opt_param_scheduler, forward_step,
     report_memory_flag = True
     # For each remaining epoch
-    timers('interval-time').start()
+    timers('interval-time', log_level=0).start(barrier=True)
     for epoch in range(start_epoch, args.epochs):
         print_rank_0('working on epoch {} ...'.format(epoch + 1))
......
@@ -261,7 +248,7 @@ def finetune(train_valid_datasets_provider, model_provider,
         'batch size scaling is not supported for finetuning'
     # Train and validation data loaders.
-    timers('train/valid/test dataset/dataloder').start()
+    timers('train/valid/test dataset/dataloder', log_level=0).start()
     if args.epochs > 0:
         train_dataset, valid_dataset = train_valid_datasets_provider()
         train_dataloader, valid_dataloader = _build_train_valid_dataloaders(
......
@@ -271,21 +258,21 @@ def finetune(train_valid_datasets_provider, model_provider,
     timers('train/valid/test dataset/dataloder').stop()
     # Build calback function.
-    timers('callback function').start()
+    timers('callback function', log_level=0).start()
     end_of_epoch_callback = None
     if end_of_epoch_callback_provider is not None:
         end_of_epoch_callback = end_of_epoch_callback_provider()
     timers('callback function').stop()
     # Build model, optimizer and learning rate scheduler.
-    timers('model and optimizer').start()
+    timers('model and optimizer', log_level=0).start()
     model, optimizer, opt_param_scheduler = setup_model_and_optimizer(model_provider, model_type)
     timers('model and optimizer').stop()
     # If pretrained checkpoint is provided and we have not trained for
     # any iteration (i.e., iteration is zero), then load the pretrained
     # checkpoint.
-    timers('pretrained checkpoint').start()
+    timers('pretrained checkpoint', log_level=0).start(barrier=True)
     if args.iteration == 0 and args.pretrained_checkpoint is not None:
         original_load = args.load
         args.load = args.pretrained_checkpoint
......
@@ -302,7 +289,7 @@ def finetune(train_valid_datasets_provider, model_provider,
     # Print setup timing.
     print_rank_0('done with setups ...')
     timers.log(['train/valid/test dataset/dataloder', 'callback function',
-                'model and optimizer', 'pretrained checkpoint'])
+                'model and optimizer', 'pretrained checkpoint'], barrier=True)
     print_rank_0('training ...')
     # Finetune the model.
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""GLUE dataset."""
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""GLUE finetuning/evaluation."""
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""MNLI dataset."""
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""QQP dataset."""
......