Merge master into realm-mlm

5e56e563 · Neel Kant · 6c0a5bd8 · 569b3dab · 5e56e563 · 5e56e563
Commit 5e56e563 authored Apr 28, 2020 by Neel Kant
20 changed files
--- a/megatron/mpu/__init__.py
+++ b/megatron/mpu/__init__.py
 # coding=utf-8
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/megatron/mpu/cross_entropy.py
+++ b/megatron/mpu/cross_entropy.py
 # coding=utf-8
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -72,7 +72,6 @@ class _VocabParallelCrossEntropy(torch.autograd.Function):
                                     op=torch.distributed.ReduceOp.SUM,
                                     group=get_model_parallel_group())
        # Loss = log(sum(exp(logits))) - predicted-logit.
        loss = torch.log(sum_exp_logits) - predicted_logits

--- a/megatron/mpu/data.py
+++ b/megatron/mpu/data.py
 # coding=utf-8
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/megatron/mpu/grads.py
+++ b/megatron/mpu/grads.py
--- a/megatron/mpu/initialize.py
+++ b/megatron/mpu/initialize.py
 # coding=utf-8
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/megatron/mpu/layers.py
+++ b/megatron/mpu/layers.py
 # coding=utf-8
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -89,6 +89,7 @@ class VocabParallelEmbedding(torch.nn.Module):
        embedding_dim: size of hidden state.
        init_method: method to initialize weights.
    """
    def __init__(self, num_embeddings, embedding_dim,
                 init_method=init.xavier_normal_):
        super(VocabParallelEmbedding, self).__init__()
@@ -108,7 +109,7 @@ class VocabParallelEmbedding(torch.nn.Module):
                self.num_embeddings, get_model_parallel_rank(),
                get_model_parallel_world_size())
        self.num_embeddings_per_partition = self.vocab_end_index - \
-                                            self.vocab_start_index
+            self.vocab_start_index
        # Allocate weights.
        self.weight = Parameter(torch.Tensor(self.num_embeddings_per_partition,
@@ -147,6 +148,7 @@ class ParallelEmbedding(torch.nn.Module):
        embedding_dim: size of hidden state.
        init_method: method to initialize weights.
    """
    def __init__(self, num_embeddings, embedding_dim,
                 init_method=init.xavier_normal_,
                 keep_master_weight_for_test=False):
@@ -205,6 +207,7 @@ class ColumnParallelLinear(torch.nn.Module):
                                     set to False. It returns the master weights
                                     used for initialization.
    """
    def __init__(self, input_size, output_size, bias=True, gather_output=True,
                 init_method=init.xavier_normal_, stride=1,
                 keep_master_weight_for_test=False):
@@ -279,6 +282,7 @@ class RowParallelLinear(torch.nn.Module):
                                     set to False. It returns the master weights
                                     used for initialization.
    """
    def __init__(self, input_size, output_size, bias=True,
                 input_is_parallel=False,
                 init_method=init.xavier_normal_, stride=1,
@@ -327,4 +331,3 @@ class RowParallelLinear(torch.nn.Module):
        else:
            output = output_
        return output
--- a/megatron/mpu/mappings.py
+++ b/megatron/mpu/mappings.py
 # coding=utf-8
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -131,11 +131,14 @@ class _GatherFromModelParallelRegion(torch.autograd.Function):
 def copy_to_model_parallel_region(input_):
    return _CopyToModelParallelRegion.apply(input_)
 def reduce_from_model_parallel_region(input_):
    return _ReduceFromModelParallelRegion.apply(input_)
 def scatter_to_model_parallel_region(input_):
    return _ScatterToModelParallelRegion.apply(input_)
 def gather_from_model_parallel_region(input_):
    return _GatherFromModelParallelRegion.apply(input_)
--- a/megatron/mpu/random.py
+++ b/megatron/mpu/random.py
--- a/megatron/mpu/tests/commons.py
+++ b/megatron/mpu/tests/commons.py
--- a/megatron/mpu/tests/test_cross_entropy.py
+++ b/megatron/mpu/tests/test_cross_entropy.py
--- a/megatron/mpu/tests/test_data.py
+++ b/megatron/mpu/tests/test_data.py
--- a/megatron/mpu/tests/test_initialize.py
+++ b/megatron/mpu/tests/test_initialize.py
--- a/megatron/mpu/tests/test_layers.py
+++ b/megatron/mpu/tests/test_layers.py
--- a/megatron/mpu/tests/test_random.py
+++ b/megatron/mpu/tests/test_random.py
--- a/megatron/mpu/utils.py
+++ b/megatron/mpu/utils.py
 # coding=utf-8
-# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/megatron/text_generation_utils.py
+++ b/megatron/text_generation_utils.py
--- a/megatron/tokenizer/__init__.py
+++ b/megatron/tokenizer/__init__.py
--- a/megatron/tokenizer/bert_tokenization.py
+++ b/megatron/tokenizer/bert_tokenization.py
--- a/megatron/tokenizer/gpt2_tokenization.py
+++ b/megatron/tokenizer/gpt2_tokenization.py
--- a/megatron/tokenizer/tokenizer.py
+++ b/megatron/tokenizer/tokenizer.py