Merge branch 'main' into nmt-main

41276b6c · Vijay Korthikanti · a44360ed · fc7f4f03 · 41276b6c · 41276b6c
Commit 41276b6c authored Oct 03, 2022 by Vijay Korthikanti
20 changed files
--- a/megatron/mpu/tests/test_initialize.py
+++ b/megatron/mpu/tests/test_initialize.py
-# coding=utf-8
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 from commons import print_separator
 from commons import initialize_distributed

--- a/megatron/mpu/tests/test_layers.py
+++ b/megatron/mpu/tests/test_layers.py
-# coding=utf-8
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 from mpu import layers
 from commons import set_random_seed

--- a/megatron/mpu/tests/test_random.py
+++ b/megatron/mpu/tests/test_random.py
-# coding=utf-8
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 from commons import print_separator
 from commons import initialize_distributed

--- a/megatron/mpu/utils.py
+++ b/megatron/mpu/utils.py
-# coding=utf-8
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 import torch

--- a/megatron/optimizer/__init__.py
+++ b/megatron/optimizer/__init__.py
-# coding=utf-8
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 from apex.optimizers import FusedAdam as Adam
 from apex.optimizers import FusedSGD as SGD
@@ -145,6 +132,7 @@ def get_megatron_optimizer(model,
                      args.use_contiguous_buffers_in_local_ddp,
                      args.fp16,
                      args.bf16,
+                      args.params_dtype,
                      grad_scaler,
                      model)

--- a/megatron/optimizer/clip_grads.py
+++ b/megatron/optimizer/clip_grads.py
-# coding=utf-8
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 """Gradient clipping."""

--- a/megatron/optimizer/distrib_optimizer.py
+++ b/megatron/optimizer/distrib_optimizer.py
-# coding=utf-8
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 """Megatron distributed optimizer."""
@@ -351,7 +338,7 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
    def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad,
                 params_have_main_grad, use_contiguous_buffers_in_local_ddp,
-                 fp16, bf16, grad_scaler, models):
+                 fp16, bf16, params_dtype, grad_scaler, models):
        """
        See top of class definition for argument descriptions.
@@ -365,7 +352,7 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
        super().__init__(
            optimizer, clip_grad, log_num_zeros_in_grad,
            params_have_main_grad, use_contiguous_buffers_in_local_ddp,
-            fp16, bf16, grad_scaler, models)
+            fp16, bf16, params_dtype, grad_scaler, models)
        # Verify that contiguous buffers are being used.
        # - Note: this should already be checked in arguments.py.
@@ -394,6 +381,21 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
                                                   self.model_param_gbuf_map,
                                                   self.opt_group_ranges)
+        # Initialize param buffers.
+        # - These are views on the DDP model's grad buffers, that share
+        #   storage & have their own dtype. This is safe because the param
+        #   dtype size is always <= grad dtype size.
+        self.param_buffers = []
+        for model_index, model in enumerate(self.models):
+            current_param_buffers = {}
+            for dtype, grad_buffer in model._grad_buffers.items():
+                param_buffer = torch.tensor(grad_buffer.data.storage()._untyped(),
+                                            dtype = params_dtype,
+                                            device = grad_buffer.data.device)
+                param_buffer = param_buffer[:grad_buffer.numel_padded]
+                current_param_buffers[dtype] = param_buffer
+            self.param_buffers.append(current_param_buffers)
        # Update optimizer groups.
        # - Also, leverage state_dict() and load_state_dict() to
        #   recast preexisting per-param state tensors.
@@ -449,8 +451,9 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
        # Grad scaler.
        if 'grad_scaler' not in state_dict:
-            print_rank_0('***WARNING*** found an old checkpoint, will not '
+            if self.fp16:
-                         'load grad scaler ...')
+                print_rank_0('***WARNING*** found an old checkpoint, will not '
+                             'load grad scaler ...')
        else:
            if self.grad_scaler:
                self.grad_scaler.load_state_dict(state_dict['grad_scaler'])
@@ -487,36 +490,48 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
                _zero_grad_group_helper(group, set_to_none)
-    def get_model_grad_buffer_dp_views(self):
+    @staticmethod
+    def get_model_buffer_dp_views(model_buffers):
        """
-        Get shard views of each of the DDP's grad buffers.
+        Get shard views of each of the DDP's param/grad buffers.
        In this nested list, the top level is grouped by the virtual model
-        index and the grad buffer's data type. The sub-level is a list of
+        index and the buffer's data type. The sub-level is a list of
-        shards of that grad buffer, where each shard in the list represents
+        shards of that buffer, where each shard in the list represents
-        a contiguous view of the grad buffer, that is owned by a data-parallel
+        a contiguous view of the buffer, that is owned by a data-parallel
        rank. The shard boundary does not respect parameter boundaries, and
        so the elements of some parameters are split across data parallel
        ranks.
-        Additionally, return references to the entire grad buffers, for use
+        Additionally, return references to the entire buffers, for use
        in _reduce_scatter_base and _all_gather_base.
        """
        data_parallel_world_size = mpu.get_data_parallel_world_size()
-        # Grad buffer views.
+        # Buffer views.
-        gbuf_view_items = []
+        view_items = []
-        for model_index, model in enumerate(self.models):
+        for model_index, buffers in enumerate(model_buffers):
-            for dtype, gbuf in model._grad_buffers.items():
+            for dtype, buf in buffers.items():
+                assert buf.numel() % data_parallel_world_size == 0
+                shard_size = int(buf.numel() / data_parallel_world_size)
+                buf_views = [buf[(r*shard_size):((r+1)*shard_size)]
+                             for r in range(data_parallel_world_size)]
+                view_items.append((model_index, dtype, buf, buf_views))
-                assert gbuf.numel_padded % data_parallel_world_size == 0
+        return view_items
-                shard_size = int(gbuf.numel_padded / data_parallel_world_size)
-                gbuf_views = [gbuf.data[(r*shard_size):((r+1)*shard_size)]
-                              for r in range(data_parallel_world_size)]
-                gbuf_view_items.append((model_index, dtype, gbuf.data, gbuf_views))
-        return gbuf_view_items
+    def get_model_grad_buffer_dp_views(self):
+        return self.get_model_buffer_dp_views([
+            {dtype : mem_buffer.data}
+            for model in self.models
+            for dtype, mem_buffer in model._grad_buffers.items()])
+    def get_model_param_buffer_dp_views(self):
+        return self.get_model_buffer_dp_views(self.param_buffers)
    def reduce_model_grads(self, args, timers):
@@ -532,17 +547,20 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
        """
        # All-reduce layer-norm grads (for sequence parallelism).
-        timers('backward-layernorm-all-reduce').start()
+        timers('layernorm-grads-all-reduce', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
        self.allreduce_layernorm_grads(args)
-        timers('backward-layernorm-all-reduce').stop()
+        timers('layernorm-grads-all-reduce').stop()
        # All-reduce embedding grads.
-        timers('backward-embedding-all-reduce').start()
+        timers('embedding-grads-all-reduce', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
        self.allreduce_embedding_grads(args)
-        timers('backward-embedding-all-reduce').stop()
+        timers('embedding-grads-all-reduce').stop()
        # Reduce-scatter setup.
-        timers('backward-params-all-reduce').start()
+        timers('grads-reduce-scatter', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
        data_parallel_rank = mpu.get_data_parallel_rank()
        data_parallel_world_size = mpu.get_data_parallel_world_size()
        data_parallel_group = mpu.get_data_parallel_group()
@@ -563,46 +581,49 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
                group = data_parallel_group,
            )
-        timers('backward-params-all-reduce').stop()
+        timers('grads-reduce-scatter').stop()
    def gather_model_params(self, args, timers):
        """
        All-gather updated model params.
-        The DDP's grad buffer is used for the all-gather, and thus no
+        The DDP's param buffer is used for the all-gather, and thus no
        tensors are dynamically allocated. After the all-gather, the params
-        can be copied from param.main_grad to param.
+        can be copied from the param buffer to the param.
        """
-        timers('backward-params-all-gather').start()
+        timers('params-all-gather', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
        data_parallel_rank = mpu.get_data_parallel_rank()
        data_parallel_group = mpu.get_data_parallel_group()
        # All-gather updated main params.
-        # - All grad buffer views are guaranteed to have the same num elements
+        # - All param buffer views are guaranteed to have the same num elements
-        #   across all data parallel ranks, with grad buffer padding that is done
+        #   across all data parallel ranks, due to grad buffer padding that is
-        #   in distributed.py. Thus, all sub-views will have consistent start/end
+        #   done in distributed.py, and extended to the param buffers. Thus,
-        #   indexes across data parallel ranks.
+        #   all sub-views will have consistent start/end indexes across data
-        gbuf_view_items = self.get_model_grad_buffer_dp_views()
+        #   parallel ranks.
-        for index, (model_index, dtype, gbuf, gbuf_views) \
+        pbuf_view_items = self.get_model_param_buffer_dp_views()
-            in enumerate(gbuf_view_items):
+        for index, (model_index, dtype, pbuf, pbuf_views) \
+            in enumerate(pbuf_view_items):
            torch.distributed._all_gather_base(
-                gbuf,
+                pbuf,
-                gbuf_views[data_parallel_rank],
+                pbuf_views[data_parallel_rank],
                group = data_parallel_group,
            )
-        # Each model param now contains its updated values in its
+        # Copy from param buffer to each param.
-        # '.main_grad' field.
+        for model_id, model in enumerate(self.models):
-        for model in self.models:
            for dtype, param_map in model._grad_buffer_param_index_map.items():
-                for param in param_map:
+                for param, buf_range in param_map.items():
-                    param.detach().copy_(param.main_grad)
+                    param_buf = self.param_buffers[model_id][dtype]
+                    param_buf_shard = param_buf[buf_range[0]:buf_range[1]]
+                    param.view(-1).detach().copy_(param_buf_shard)
-        timers('backward-params-all-gather').stop()
+        timers('params-all-gather').stop()
    def _collect_main_grad_data_for_unscaling(self):
@@ -680,14 +701,17 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
                                                         model_group):
                    param_range_map = self.get_model_param_range_map(model_param)
-                    param_range = param_range_map["param"]
+                    world_range = param_range_map["gbuf_world"]
-                    assert param_range.size == shard_main_param.nelement()
-                    model_grad = model_param.main_grad
+                    assert world_range.size == shard_main_param.nelement()
-                    shard_model_grad = model_grad.view(-1) \
-                        [param_range.start:param_range.end]
+                    model_id, dtype = self.model_param_gbuf_map[model_param]
+                    model_param_buffer = self.param_buffers[model_id][dtype]
+                    shard_model_param = model_param_buffer.view(-1) \
+                        [world_range.start:world_range.end]
-                    shard_model_grad.data.copy_(shard_main_param)
+                    shard_model_param.data.copy_(shard_main_param)
        # Copy shard groups to model groups.
        copy_group_params(self.shard_fp32_from_float16_groups,

--- a/megatron/optimizer/grad_scaler.py
+++ b/megatron/optimizer/grad_scaler.py
-# coding=utf-8
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 """Megatron grad scaler."""

--- a/megatron/optimizer/optimizer.py
+++ b/megatron/optimizer/optimizer.py
-# coding=utf-8
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 """Megatron optimizer."""
@@ -294,21 +281,24 @@ class MegatronOptimizer(ABC):
        """All-reduce all grads, and all-reduce embeddings."""
        # All-reduce layer-norm grads (for sequence parallelism).
-        timers('backward-layernorm-all-reduce').start()
+        timers('layernorm-grads-all-reduce', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
        self.allreduce_layernorm_grads(args)
-        timers('backward-layernorm-all-reduce').stop()
+        timers('layernorm-grads-all-reduce').stop()
        # All-reduce if needed.
        if args.DDP_impl == 'local':
-            timers('backward-params-all-reduce').start()
+            timers('grads-all-reduce', log_level=1).start(
+                barrier=args.barrier_with_L1_time)
            for model in self.models:
                model.allreduce_gradients()
-            timers('backward-params-all-reduce').stop()
+            timers('grads-all-reduce').stop()
        # All-reduce embedding grads.
-        timers('backward-embedding-all-reduce').start()
+        timers('embedding-grads-all-reduce', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
        self.allreduce_embedding_grads(args)
-        timers('backward-embedding-all-reduce').stop()
+        timers('embedding-grads-all-reduce').stop()
 class MixedPrecisionOptimizer(MegatronOptimizer):
@@ -332,6 +322,7 @@ class MixedPrecisionOptimizer(MegatronOptimizer):
            is using a contiguous buffer to hold the model grads.
        fp16: if true, the model is running in fp16.
        bf16: if true, the model is running in bfloat16.
+        params_dtype: dtype the distributed optimizer uses for its param buffers.
        grad_scaler: used for scaling gradients. Note that this can be
            None. This case happens when `bf16 = True` and we don't
            use any loss scale. Note that for `bf16 = True`, we can have
@@ -343,7 +334,7 @@ class MixedPrecisionOptimizer(MegatronOptimizer):
    def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad,
                 params_have_main_grad, use_contiguous_buffers_in_local_ddp,
-                 fp16, bf16, grad_scaler,
+                 fp16, bf16, params_dtype, grad_scaler,
                 models):
        super().__init__(
@@ -353,6 +344,7 @@ class MixedPrecisionOptimizer(MegatronOptimizer):
        self.fp16 = fp16
        self.bf16 = bf16
+        self.params_dtype = params_dtype
        self.grad_scaler = grad_scaler
        # None grad scaler is only supported for bf16.
@@ -416,7 +408,8 @@ class MixedPrecisionOptimizer(MegatronOptimizer):
    def step(self, args, timers):
        # Copy gradients from model params to main params.
-        timers('optimizer-copy-to-main-grad').start()
+        timers('optimizer-copy-to-main-grad', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
        self._copy_model_grads_to_main_grads()
        timers('optimizer-copy-to-main-grad').stop()
@@ -425,7 +418,8 @@ class MixedPrecisionOptimizer(MegatronOptimizer):
        if self.grad_scaler:
            # Unscale and check for inf/nan.
-            timers('optimizer-unscale-and-check-inf').start()
+            timers('optimizer-unscale-and-check-inf', log_level=1).start(
+                barrier=args.barrier_with_L1_time)
            found_inf_flag = self._unscale_main_grads_and_check_for_nan()
            timers('optimizer-unscale-and-check-inf').stop()
@@ -438,25 +432,29 @@ class MixedPrecisionOptimizer(MegatronOptimizer):
                return False, None, None
        # Clip the main gradients.
-        timers('optimizer-clip-main-grad').start()
+        timers('optimizer-clip-main-grad', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
        grad_norm = None
        if self.clip_grad > 0.0:
            grad_norm = self.clip_grad_norm(self.clip_grad)
        timers('optimizer-clip-main-grad').stop()
        # Count the zeros in the grads.
-        timers('optimizer-count-zeros').start()
+        timers('optimizer-count-zeros', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
        num_zeros_in_grad = self.count_zeros() if \
                            self.log_num_zeros_in_grad else None
        timers('optimizer-count-zeros').stop()
        # Step the optimizer.
-        timers('optimizer-inner-step').start()
+        timers('optimizer-inner-step', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
        self.optimizer.step()
        timers('optimizer-inner-step').stop()
        # Update params from main params.
-        timers('optimizer-copy-main-to-model-params').start()
+        timers('optimizer-copy-main-to-model-params', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
        self._copy_main_params_to_model_params()
        timers('optimizer-copy-main-to-model-params').stop()
@@ -496,12 +494,12 @@ class Float16OptimizerWithFloat16Params(MixedPrecisionOptimizer):
    def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad,
                 params_have_main_grad, use_contiguous_buffers_in_local_ddp,
-                 fp16, bf16, grad_scaler, models):
+                 fp16, bf16, params_dtype, grad_scaler, models):
        super().__init__(
            optimizer, clip_grad, log_num_zeros_in_grad,
            params_have_main_grad, use_contiguous_buffers_in_local_ddp,
-            fp16, bf16, grad_scaler, models)
+            fp16, bf16, params_dtype, grad_scaler, models)
        # ======================
        # main parameter stuff
@@ -671,8 +669,9 @@ class Float16OptimizerWithFloat16Params(MixedPrecisionOptimizer):
        # Grad scaler.
        if 'grad_scaler' not in state_dict:
-            print_rank_0('***WARNING*** found an old checkpoint, will not '
+            if self.fp16:
-                         'load grad scaler ...')
+                print_rank_0('***WARNING*** found an old checkpoint, will not '
+                             'load grad scaler ...')
        else:
            if self.grad_scaler:
                self.grad_scaler.load_state_dict(state_dict['grad_scaler'])
@@ -725,7 +724,8 @@ class FP32Optimizer(MegatronOptimizer):
        Always return successful since there is no overflow."""
        # Copy main_grads to grads.
-        timers('optimizer-copy-to-main-grad').start()
+        timers('optimizer-copy-to-main-grad', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
        if self.params_have_main_grad:
            for param_group in self.optimizer.param_groups:
                for param in param_group['params']:
@@ -739,20 +739,23 @@ class FP32Optimizer(MegatronOptimizer):
        timers('optimizer-copy-to-main-grad').stop()
        # Clip gradients.
-        timers('optimizer-clip-main-grad').start()
+        timers('optimizer-clip-main-grad', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
        grad_norm = None
        if self.clip_grad > 0.0:
            grad_norm = self.clip_grad_norm(self.clip_grad)
        timers('optimizer-clip-main-grad').stop()
        # count the zeros in the grads
-        timers('optimizer-count-zeros').start()
+        timers('optimizer-count-zeros', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
        num_zeros_in_grad = self.count_zeros() if \
                            self.log_num_zeros_in_grad else None
        timers('optimizer-count-zeros').stop()
        # Update parameters.
-        timers('optimizer-inner-step').start()
+        timers('optimizer-inner-step', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
        self.optimizer.step()
        timers('optimizer-inner-step').stop()

--- a/megatron/optimizer_param_scheduler.py
+++ b/megatron/optimizer_param_scheduler.py
-# coding=utf-8
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 """Learning rate decay and weight decay incr functions."""

--- a/megatron/p2p_communication.py
+++ b/megatron/p2p_communication.py
-# coding=utf-8
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 from functools import reduce
 import operator
@@ -163,7 +150,7 @@ def recv_forward(tensor_shape=None, dtype_=None, timers=None):
        input_tensor = None
    else:
        if timers is not None:
-            timers('forward-recv').start()
+            timers('forward-recv', log_level=2).start()
        input_tensor, _ = _communicate(
            tensor_send_next=None,
            tensor_send_prev=None,
@@ -182,7 +169,7 @@ def recv_backward(tensor_shape=None, timers=None):
        output_tensor_grad = None
    else:
        if timers is not None:
-            timers('backward-recv').start()
+            timers('backward-recv', log_level=2).start()
        _, output_tensor_grad = _communicate(
            tensor_send_next=None,
            tensor_send_prev=None,
@@ -199,7 +186,7 @@ def send_forward(output_tensor, tensor_shape=None, dtype_=None, timers=None):
    if not mpu.is_pipeline_last_stage():
        if timers is not None:
-            timers('forward-send').start()
+            timers('forward-send', log_level=2).start()
        _communicate(
            tensor_send_next=output_tensor,
            tensor_send_prev=None,
@@ -215,7 +202,7 @@ def send_backward(input_tensor_grad, tensor_shape=None, timers=None):
    """Send tensor to previous rank in pipeline (backward send)."""
    if not mpu.is_pipeline_first_stage():
        if timers is not None:
-            timers('backward-send').start()
+            timers('backward-send', log_level=2).start()
        _communicate(
            tensor_send_next=None,
            tensor_send_prev=input_tensor_grad,
@@ -232,7 +219,7 @@ def send_forward_recv_backward(output_tensor, tensor_shape=None, timers=None):
        output_tensor_grad = None
    else:
        if timers is not None:
-            timers('forward-send-backward-recv').start()
+            timers('forward-send-backward-recv', log_level=2).start()
        _, output_tensor_grad = _communicate(
            tensor_send_next=output_tensor,
            tensor_send_prev=None,
@@ -250,7 +237,7 @@ def send_backward_recv_forward(input_tensor_grad, tensor_shape=None, timers=None
        input_tensor = None
    else:
        if timers is not None:
-            timers('backward-send-forward-recv').start()
+            timers('backward-send-forward-recv', log_level=2).start()
        input_tensor, _ = _communicate(
            tensor_send_next=None,
            tensor_send_prev=input_tensor_grad,
@@ -265,7 +252,7 @@ def send_backward_recv_forward(input_tensor_grad, tensor_shape=None, timers=None
 def send_forward_recv_forward(output_tensor, recv_prev, tensor_shape=None, timers=None):
    """Batched recv from previous rank and send to next rank in pipeline."""
    if timers is not None:
-        timers('forward-send-forward-recv').start()
+        timers('forward-send-forward-recv', log_level=2).start()
    input_tensor, _ = _communicate(
        tensor_send_next=output_tensor,
        tensor_send_prev=None,
@@ -280,7 +267,7 @@ def send_forward_recv_forward(output_tensor, recv_prev, tensor_shape=None, timer
 def send_backward_recv_backward(input_tensor_grad, recv_next, tensor_shape=None, timers=None):
    """Batched recv from next rank and send to previous rank in pipeline."""
    if timers is not None:
-        timers('backward-send-backward-recv').start()
+        timers('backward-send-backward-recv', log_level=2).start()
    _, output_tensor_grad = _communicate(
        tensor_send_next=None,
        tensor_send_prev=input_tensor_grad,
@@ -297,7 +284,8 @@ def send_forward_backward_recv_forward_backward(
        recv_next, tensor_shape=None, timers=None):
    """Batched send and recv with previous and next ranks in pipeline."""
    if timers is not None:
-        timers('forward-backward-send-forward-backward-recv').start()
+        timers('forward-backward-send-forward-backward-recv',
+               log_level=2).start()
    input_tensor, output_tensor_grad = _communicate(
        tensor_send_next=output_tensor,
        tensor_send_prev=input_tensor_grad,

--- a/megatron/schedules.py
+++ b/megatron/schedules.py
-# coding=utf-8
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 from contextlib import contextmanager
 import torch
@@ -107,6 +94,7 @@ def forward_step(forward_step_func,
                 model,
                 input_tensor,
                 forward_data_store,
+                 timers,
                 collect_non_loss_data=False):
    """Forward step for passed-in model.
@@ -115,9 +103,9 @@ def forward_step(forward_step_func,
    Returns output tensor."""
    args = get_args()
-    timers = get_timers()
-    timers('forward-compute').start()
+    if timers is not None:
+        timers('forward-compute', log_level=2).start()
    unwrapped_model = unwrap_model(
        model, (torchDDP, LocalDDP, Float16Module))
@@ -138,7 +126,8 @@ def forward_step(forward_step_func,
            data = loss_func(output_tensor, non_loss_data=True)
            forward_data_store.append(data)
-    timers('forward-compute').stop()
+    if timers is not None:
+        timers('forward-compute').stop()
    # If T5 model (or other model with encoder and decoder)
    # and in decoder stack, then send encoder_hidden_state
@@ -151,7 +140,8 @@ def forward_step(forward_step_func,
    return [output_tensor]
-def backward_step(optimizer, input_tensor, output_tensor, output_tensor_grad):
+def backward_step(optimizer, input_tensor, output_tensor,
+                  output_tensor_grad, timers):
    """Backward step through passed-in output tensor.
    If last stage, output_tensor_grad is None, otherwise gradient of loss
@@ -165,8 +155,8 @@ def backward_step(optimizer, input_tensor, output_tensor, output_tensor_grad):
    # connections.
    args = get_args()
-    timers = get_timers()
+    if timers is not None:
-    timers('backward-compute').start()
+        timers('backward-compute', log_level=2).start()
    # Retain the grad on the input_tensor.
    unwrap_input_tensor_grad = False
@@ -207,7 +197,8 @@ def backward_step(optimizer, input_tensor, output_tensor, output_tensor_grad):
    if unwrap_input_tensor_grad:
        input_tensor_grad = input_tensor_grad[0]
-    timers('backward-compute').stop()
+    if timers is not None:
+        timers('backward-compute').stop()
    return input_tensor_grad
@@ -243,18 +234,19 @@ def forward_backward_no_pipelining(forward_step_func,
        for i in range(get_num_microbatches() - 1):
            output_tensor = forward_step(forward_step_func, data_iterator,
                                         model, input_tensor, forward_data_store,
-                                         collect_non_loss_data)
+                                         timers, collect_non_loss_data)
            if not forward_only:
                backward_step(optimizer, input_tensor, output_tensor,
-                              output_tensor_grad)
+                              output_tensor_grad, timers)
    # Run computation for last microbatch out of context handler (want to
    # synchronize gradients).
    output_tensor = forward_step(forward_step_func, data_iterator,
                                 model, input_tensor, forward_data_store,
-                                 collect_non_loss_data)
+                                 timers, collect_non_loss_data)
    if not forward_only:
-        backward_step(optimizer, input_tensor, output_tensor, output_tensor_grad)
+        backward_step(optimizer, input_tensor, output_tensor,
+                      output_tensor_grad, timers)
    return forward_data_store
@@ -269,6 +261,9 @@ def forward_backward_pipelining_with_interleaving(forward_step_func,
    communication between pipeline stages as needed.
    Returns dictionary with losses if the last stage, empty dict otherwise."""
+    args = get_args()
    input_tensors = [[] for _ in range(len(model))]
    output_tensors = [[] for _ in range(len(model))]
    forward_data_store = []
@@ -278,7 +273,6 @@ def forward_backward_pipelining_with_interleaving(forward_step_func,
    pipeline_parallel_size = mpu.get_pipeline_model_parallel_world_size()
    pipeline_parallel_rank = mpu.get_pipeline_model_parallel_rank()
-    args = get_args()
    if args.sequence_parallel:
        seq_length = args.seq_length // mpu.get_tensor_model_parallel_world_size()
    else:
@@ -337,6 +331,7 @@ def forward_backward_pipelining_with_interleaving(forward_step_func,
                                     model[model_chunk_id],
                                     input_tensor, 
                                     forward_data_store,
+                                     timers,
                                     collect_non_loss_data)
        output_tensors[model_chunk_id].append(output_tensor)
@@ -364,7 +359,8 @@ def forward_backward_pipelining_with_interleaving(forward_step_func,
            backward_step(optimizer,
                          input_tensor,
                          output_tensor,
-                          output_tensor_grad)
+                          output_tensor_grad,
+                          timers)
        return input_tensor_grad
@@ -620,8 +616,7 @@ def forward_backward_pipelining_without_interleaving(forward_step_func,
    Returns dictionary with losses if the last stage, empty dict otherwise."""
    args = get_args()
-    timers = get_timers()
    assert len(model) == 1
    model = model[0]
@@ -656,7 +651,7 @@ def forward_backward_pipelining_without_interleaving(forward_step_func,
        input_tensor = recv_forward(recv_tensor_shapes, timers=timers)
        output_tensor = forward_step(forward_step_func, data_iterator, model,
                                     input_tensor, forward_data_store,
-                                     collect_non_loss_data)
+                                     timers, collect_non_loss_data)
        send_forward(output_tensor, send_tensor_shapes, timers=timers)
        if not forward_only:
@@ -676,7 +671,7 @@ def forward_backward_pipelining_without_interleaving(forward_step_func,
        output_tensor = forward_step(forward_step_func, data_iterator, model,
                                     input_tensor, forward_data_store,
-                                     collect_non_loss_data)
+                                     timers, collect_non_loss_data)
        if forward_only:
            send_forward(output_tensor, send_tensor_shapes, timers=timers)
@@ -701,7 +696,7 @@ def forward_backward_pipelining_without_interleaving(forward_step_func,
            input_tensor_grad = \
                backward_step(optimizer, input_tensor, output_tensor,
-                              output_tensor_grad)
+                              output_tensor_grad, timers)
            if last_iteration:
                input_tensor = None
@@ -721,7 +716,7 @@ def forward_backward_pipelining_without_interleaving(forward_step_func,
            input_tensor_grad = \
                backward_step(optimizer, input_tensor, output_tensor,
-                              output_tensor_grad)
+                              output_tensor_grad, timers)
            send_backward(input_tensor_grad, recv_tensor_shapes, timers=timers)

--- a/megatron/static/index.html
+++ b/megatron/static/index.html
 <!-- coding=utf-8-->
-<!-- Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.-->
+<!-- Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.-->
-<!---->
-<!-- Licensed under the Apache License, Version 2.0 (the "License");-->
-<!-- you may not use this file except in compliance with the License.-->
-<!-- You may obtain a copy of the License at-->
-<!---->
-<!--     http://www.apache.org/licenses/LICENSE-2.0-->
-<!---->
-<!-- Unless required by applicable law or agreed to in writing, software-->
-<!-- distributed under the License is distributed on an "AS IS" BASIS,-->
-<!-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.-->
-<!-- See the License for the specific language governing permissions and-->
-<!-- limitations under the License.-->
 <!DOCTYPE html>
 <html lang="en">

--- a/megatron/text_generation/__init__.py
+++ b/megatron/text_generation/__init__.py
-# coding=utf-8
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 from .api import (

--- a/megatron/text_generation/api.py
+++ b/megatron/text_generation/api.py
-# coding=utf-8
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 """Inference API."""

--- a/megatron/text_generation/communication.py
+++ b/megatron/text_generation/communication.py
-# coding=utf-8
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 """Communications utilities."""

--- a/megatron/text_generation/forward_step.py
+++ b/megatron/text_generation/forward_step.py
-# coding=utf-8
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 """Forward step utilities."""

--- a/megatron/text_generation/generation.py
+++ b/megatron/text_generation/generation.py
-# coding=utf-8
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 """Generation utilities."""
@@ -47,10 +34,15 @@ def score_and_return_on_first_stage(model, tokens, lengths):
    batch_size = tokens.size(0)
    max_prompt_length = lengths.max().item()
    assert max_prompt_length == tokens.size(1)
-    max_sequence_length = min(max_prompt_length, args.max_position_embeddings)
+    if max_prompt_length > args.max_position_embeddings:
+        raise ValueError("Length of prompt + tokens_to_generate longer than allowed")
+    if max_prompt_length * batch_size >= MAX_TOKENS_TO_OOM:
+        raise ValueError("Too many tokens.  " + str(max_prompt_length*batch_size)+ " is greater than "+str(MAX_TOKENS_TO_OOM))
    # forward step.
-    forward_step = ForwardStep(model, batch_size, max_sequence_length)
+    forward_step = ForwardStep(model, batch_size, max_prompt_length)
    # ===================
    # Pre-allocate memory
@@ -58,7 +50,7 @@ def score_and_return_on_first_stage(model, tokens, lengths):
    # Log probability of the sequence (prompt + generated tokens).
    output_log_probs = None
-    output_log_probs_size = (batch_size, max_sequence_length - 1)
+    output_log_probs_size = (batch_size, max_prompt_length - 1)
    if mpu.is_pipeline_last_stage():
        output_log_probs = torch.empty(output_log_probs_size,
@@ -395,7 +387,7 @@ def beam_search_and_return_on_first_stage(model, tokens, lengths, beam_size, sto
            # if cannot find stop token, add open beams to hyps
            if not done:
                for beam_id in range(beam_size):
-                    beam_hyp.add(tokens[beam_id].clone(), scores[beam_id], context_length + 1 - prompt_length)
+                    beam_hyp.add(tokens[beam_id].clone(), scores[beam_id].squeeze(), context_length + 1 - prompt_length)
            # rank based on scores
            sorted_hyps = sorted(beam_hyp.beams, key=lambda x: x[0], reverse=True)

--- a/megatron/text_generation/sampling.py
+++ b/megatron/text_generation/sampling.py
-# coding=utf-8
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 """Sampling utilities.
 Part of this code is inspired by:

--- a/megatron/text_generation/tokenization.py
+++ b/megatron/text_generation/tokenization.py
-# coding=utf-8
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 """Tokenization utilities."""