large update including model parallelism and gpt2

Co-authored-by: shoeybi <shoeybim@gmail.com> Co-authored-by: raulpuric <raulpuric@berkeley.edu> Co-authored-by: jaredcasper <jaredcasper@gmail.com> Co-authored-by: mpatwary <mostofa.patwary@gmail.com> Co-authored-by: plegresl <plegresl@gmail.com>

large update including model parallelism and gpt2
Co-authored-by: shoeybi <shoeybim@gmail.com> Co-authored-by: raulpuric <raulpuric@berkeley.edu> Co-authored-by: jaredcasper <jaredcasper@gmail.com> Co-authored-by: mpatwary <mostofa.patwary@gmail.com> Co-authored-by: plegresl <plegresl@gmail.com>
abe36e2e · Raul Puri · 0399d32c · abe36e2e · abe36e2e · abe36e2e
Commit abe36e2e authored Jul 29, 2019 by Raul Puri
20 changed files
--- a/mpu/tests/test_layers.py
+++ b/mpu/tests/test_layers.py
--- a/mpu/tests/test_random.py
+++ b/mpu/tests/test_random.py
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+sys.path.append("../..")
+import torch
+import mpu
+from commons import initialize_distributed
+from commons import print_separator
+def test_set_cuda_rng_state(model_parallel_size):
+    if torch.distributed.get_rank() == 0:
+        print('> testing set_rng_state with size {} ...'.
+              format(model_parallel_size))
+    mpu.initialize_model_parallel(model_parallel_size)
+    model_parallel_size = mpu.get_model_parallel_world_size()
+    size = 123
+    seed = 1234
+    torch.cuda.manual_seed(1234)
+    tensor = torch.cuda.FloatTensor(size)
+    # Get the state
+    rng_state = torch.cuda.get_rng_state()
+    rng_state_copy = rng_state.clone()
+    # Do some stuff.
+    for _ in range(5):
+        torch.randn(size, out=tensor)
+    result_1 = tensor.clone()
+    assert rng_state.sub(rng_state_copy).max() == 0
+    assert torch.cuda.get_rng_state().sub(rng_state_copy).max() > 0
+    # State should be different.
+    new_rng_state = torch.cuda.get_rng_state()
+    max_diff = new_rng_state.sub(rng_state).max()
+    print('   max diff in rng state (should be non-zero) on global rank {}: {}'.
+          format(torch.distributed.get_rank(), max_diff))
+    assert max_diff > 0
+    # Reset the rng state and do the same stuff.
+    mpu.random._set_cuda_rng_state(rng_state)
+    for _ in range(5):
+        torch.randn(size, out=tensor)
+    mpu.random._set_cuda_rng_state(rng_state)
+    for _ in range(5):
+        torch.randn(size, out=tensor)
+    result_2 = tensor.clone()
+    # Results should be the same
+    error = result_2.sub(result_1).abs().max()
+    print('   max error in generated tensors (should be zero) on '
+          'global rank {}: {}'.format(torch.distributed.get_rank(), error))
+    assert error < 1.0e-6
+    # Input state should have remained intact.
+    error = rng_state.sub(rng_state_copy).max()
+    print('   max error in rng state (should be zero) on global rank {}: {}'.
+          format(torch.distributed.get_rank(), error))
+    assert error == 0
+    # Reset groups
+    mpu.destroy_model_parallel()
+    torch.distributed.barrier()
+    if torch.distributed.get_rank() == 0:
+        print('>> passed the test :-)')
+def test_cuda_rng_tracker(model_parallel_size):
+    if torch.distributed.get_rank() == 0:
+        print('> testing cuda rng tracker with size {} ...'.
+              format(model_parallel_size))
+    mpu.initialize_model_parallel(model_parallel_size)
+    model_parallel_size = mpu.get_model_parallel_world_size()
+    seed_1 = 1234
+    seed_2 = 4321
+    size = [12, 21]
+    tensor = torch.cuda.FloatTensor(size)
+    # Set to seed_1 and generate two tensors.
+    torch.cuda.manual_seed(seed_1)
+    torch.randn(size, out=tensor)
+    target_11 = tensor.clone()
+    torch.randn(size, out=tensor)
+    target_12 = tensor.clone()
+    # Set to seed_2 and generate two tensors.
+    torch.cuda.manual_seed(seed_2)
+    torch.randn(size, out=tensor)
+    target_21 = tensor.clone()
+    torch.randn(size, out=tensor)
+    target_22 = tensor.clone()
+    # Now if we interleave seed_1 and seed_2,
+    # we should still get the same tensors
+    torch.cuda.manual_seed(seed_1)
+    mpu.get_cuda_rng_tracker().add('test', seed_2)
+    torch.randn(size, out=tensor)
+    result_11 = tensor.clone()
+    with mpu.get_cuda_rng_tracker().fork('test'):
+        torch.randn(size, out=tensor)
+        result_21 = tensor.clone()
+    torch.randn(size, out=tensor)
+    result_12 = tensor.clone()
+    with mpu.get_cuda_rng_tracker().fork('test'):
+        torch.randn(size, out=tensor)
+        result_22 = tensor.clone()
+    diff = result_11.sub(result_21).abs().max()
+    diff = min(diff, result_12.sub(result_22).abs().max())
+    print('   max diff in generated tensors (should be non-zero) on '
+          'global rank {}: {}'.format(torch.distributed.get_rank(), diff))
+    assert diff > 1.0e-6
+    error = max(result_11.sub(target_11).abs().max(),
+                result_12.sub(target_12).abs().max())
+    error = max(error, result_21.sub(target_21).abs().max())
+    error = max(error, result_22.sub(target_22).abs().max())
+    print('   max error in generated tensors (should be zero) on '
+          'global rank {}: {}'.format(torch.distributed.get_rank(), error))
+    assert error < 1.0e-6
+    # Reset the tracker
+    mpu.get_cuda_rng_tracker().reset()
+    # Reset groups
+    mpu.destroy_model_parallel()
+    torch.distributed.barrier()
+    if torch.distributed.get_rank() == 0:
+        print('>> passed the test :-)')
+def test_model_parallel_cuda_manual_seed(model_parallel_size):
+    if torch.distributed.get_rank() == 0:
+        print('> testing model parallel cuda manual seed with size {} ...'.
+              format(model_parallel_size))
+    mpu.initialize_model_parallel(model_parallel_size)
+    model_parallel_size = mpu.get_model_parallel_world_size()
+    mpu.model_parallel_cuda_manual_seed(12345)
+    assert torch.cuda.initial_seed() == 12345
+    with mpu.get_cuda_rng_tracker().fork():
+        assert torch.cuda.initial_seed() == (12345 + 2718 +
+                                             mpu.get_model_parallel_rank())
+    # Reset the tracker
+    mpu.get_cuda_rng_tracker().reset()
+    # Reset groups
+    mpu.destroy_model_parallel()
+    torch.distributed.barrier()
+    if torch.distributed.get_rank() == 0:
+        print('>> passed the test :-)')
+if __name__ == '__main__':
+    initialize_distributed()
+    world_size = torch.distributed.get_world_size()
+    model_parallel_size = 1
+    while model_parallel_size <= world_size:
+        print_separator('test set rng state')
+        test_set_cuda_rng_state(model_parallel_size)
+        model_parallel_size *= 2
+    model_parallel_size = 1
+    while model_parallel_size <= world_size:
+        print_separator('test cuda rng tracker')
+        test_cuda_rng_tracker(model_parallel_size)
+        model_parallel_size *= 2
+    model_parallel_size = 1
+    while model_parallel_size <= world_size:
+        print_separator('test model parallel cuda manual seed')
+        test_model_parallel_cuda_manual_seed(model_parallel_size)
+        model_parallel_size *= 2
--- a/mpu/transformer.py
+++ b/mpu/transformer.py
--- a/mpu/utils.py
+++ b/mpu/utils.py
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+def ensure_divisibility(numerator, denominator):
+    """Ensure that numerator is divisible by the denominator."""
+    assert numerator % denominator == 0, '{} is not divisible by {}'.format(
+        numerator, denominator)
+def divide(numerator, denominator):
+    """Ensure that numerator is divisible by the denominator and return
+    the division value."""
+    ensure_divisibility(numerator, denominator)
+    return numerator // denominator
+def split_tensor_along_last_dim(tensor, num_partitions,
+                                contiguous_split_chunks=False):
+    """Split a tensor along its last dimension.
+    Arguments:
+        tensor: input tensor.
+        num_partitions: number of partitions to split the tensor
+        contiguous_split_chunks: If True, make each chunk contiguous
+                                 in memory.
+    """
+    # Get the size and dimension.
+    last_dim = tensor.dim() - 1
+    last_dim_size = divide(tensor.size()[last_dim], num_partitions)
+    # Split.
+    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
+    # Note: torch.split does not create contiguous tensors by default.
+    if contiguous_split_chunks:
+        return tuple(chunk.contiguous() for chunk in tensor_list)
+    return tensor_list
+class VocabUtility:
+    """Split the vocabulary into `world_size` chunks amd return the
+        first and last index of the vocabulary belonging to the `rank`
+        partition: Note that indecies in [fist, last)"""
+    @staticmethod
+    def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
+                                                  rank, world_size):
+        index_f = rank * per_partition_vocab_size
+        index_l = index_f + per_partition_vocab_size
+        return index_f, index_l
+    @staticmethod
+    def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
+        per_partition_vocab_size = divide(global_vocab_size, world_size)
+        return VocabUtility.vocab_range_from_per_partition_vocab_size(
+            per_partition_vocab_size, rank, world_size)
--- a/openwebtext/README.md
+++ b/openwebtext/README.md
--- a/openwebtext/blacklist_urls.py
+++ b/openwebtext/blacklist_urls.py
--- a/openwebtext/cleanup_dataset.py
+++ b/openwebtext/cleanup_dataset.py
--- a/openwebtext/find_duplicates.py
+++ b/openwebtext/find_duplicates.py
--- a/openwebtext/group_duplicates_url.py
+++ b/openwebtext/group_duplicates_url.py
--- a/openwebtext/make_gpt2_dataset.py
+++ b/openwebtext/make_gpt2_dataset.py
--- a/openwebtext/make_gpt2_sizes.py
+++ b/openwebtext/make_gpt2_sizes.py
+import glob
+import json
+import os
+import time
+import sys
+import numpy as np
+if __name__ == '__main__':
+    print('building the shard sizes ...')
+    path = sys.argv[1]
+    print('> reading numpy files from {}'.format(path))
+    npy_files = glob.glob(path + '/*.npy')
+    npy_files.sort()
+    print('  found {} numpy files'.format(len(npy_files)))
+    size_dict = {}
+    counter = 0
+    start_time = time.time()
+    for filename in npy_files:
+        data = np.load(filename, allow_pickle=True)
+        size = np.hstack(data).size
+        np_filename = os.path.basename(filename)
+        size_dict[np_filename] = size
+        counter += 1
+        if counter % 10 == 0:
+            print('   processed {} files in {:.2f} seconds'.format(
+                counter, time.time() - start_time))
+    output_filename = os.path.join(path, 'sizes.txt')
+    with open(output_filename, 'w') as f:
+        json.dump(size_dict, f)
+    print('> wrote sizes to {}'.format(output_filename))
--- a/openwebtext/merge_jsons.py
+++ b/openwebtext/merge_jsons.py
--- a/openwebtext/remove_group_duplicates.py
+++ b/openwebtext/remove_group_duplicates.py
--- a/openwebtext/run_make_gpt2_dataset.sh
+++ b/openwebtext/run_make_gpt2_dataset.sh
--- a/optim/__init__.py
+++ b/optim/__init__.py
--- a/optim/adam.py
+++ b/optim/adam.py
--- a/pretrain_bert.py
+++ b/pretrain_bert.py
--- a/pretrain_gpt2.py
+++ b/pretrain_gpt2.py
--- a/scripts/generate_text.sh
+++ b/scripts/generate_text.sh
--- a/scripts/pretrain_bert.sh
+++ b/scripts/pretrain_bert.sh