Commit 83aa9219 authored by Mohammad

added global variables

parent 599e959a
@@ -23,6 +23,7 @@ import torch
 _GLOBAL_ARGS = None

 def parse_args(extra_args_provider=None):
     global _GLOBAL_ARGS

@@ -200,6 +201,8 @@ def add_mixed_precision_args(parser):
 def add_distributed_args(parser):
     group = parser.add_argument_group(title='mixed precision')
+    group.add_argument('--model-parallel-size', type=int, default=1,
+                       help='Size of the model parallel.')
     group.add_argument('--distributed-backend', default='nccl',
                        choices=['nccl', 'gloo'],
                        help='Which backend to use for distributed training.')

@@ -389,8 +392,6 @@ def add_data_args_(parser):
     group = parser.add_argument_group('data', 'data configurations')
-    group.add_argument('--model-parallel-size', type=int, default=1,
-                       help='size of the model parallel.')
     group.add_argument('--shuffle', action='store_true',
                        help='Shuffle data. Shuffling is deterministic '
                             'based on seed and current epoch.')
......
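For context, a quick sketch of what the relocated flag means for parsing: --model-parallel-size is now registered in the distributed-arguments group rather than the data group, but the command line itself is unchanged. The snippet below is illustrative only, a standalone argparse stand-in rather than the actual Megatron parser:

    import argparse

    # Stand-in parser mirroring the two flags touched by this diff.
    parser = argparse.ArgumentParser(description='sketch of Megatron arguments')
    group = parser.add_argument_group(title='distributed')
    group.add_argument('--model-parallel-size', type=int, default=1,
                       help='Size of the model parallel.')
    group.add_argument('--distributed-backend', default='nccl',
                       choices=['nccl', 'gloo'],
                       help='Which backend to use for distributed training.')

    args = parser.parse_args(['--model-parallel-size', '2'])
    assert args.model_parallel_size == 2
    assert args.distributed_backend == 'nccl'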
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Megatron tokenizer.""" """Megatron tokenizer."""
from abc import ABC from abc import ABC
from abc import abstractmethod from abc import abstractmethod
from megatron.utils import vocab_size_with_padding from megatron.arguments import get_args
from .bert_tokenization import FullTokenizer as FullBertTokenizer from .bert_tokenization import FullTokenizer as FullBertTokenizer
def add_tokenizer_to_args(args, tokenizer_type): def build_tokenizer():
"""Instantiate tokenizer based on input type and add it to args.""" """Initialize tokenizer."""
# Retrieve args.
args = get_args()
if args.rank == 0:
print('building {} tokenizer ...'.format(args.tokenizer_type),
flush=True)
# Make sure we have not already called this method.
if hasattr(args, 'tokenizer'):
raise Exception('args already has a tokenizer')
# Select and instantiate the tokenizer. # Select and instantiate the tokenizer.
if tokenizer_type == 'BertWordPieceLowerCase': if args.tokenizer_type == 'BertWordPieceLowerCase':
args.tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab, tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file,
lower_case=True) lower_case=True)
else: else:
raise NotImplementedError('{} tokenizer is not ' raise NotImplementedError('{} tokenizer is not '
'implemented.'.format(tokenizer_type)) 'implemented.'.format(args.tokenizer_type))
# Add vocab size. # Add vocab size.
args.vocab_size = vocab_size_with_padding(args.tokenizer.vocab_size, args) args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size)
return tokenizer
def _vocab_size_with_padding(orig_vocab_size):
"""Pad vocab size so it is divisible by model parallel size and
still having GPU friendly size."""
args = get_args()
after = orig_vocab_size
multiple = args.make_vocab_size_divisible_by * \
args.model_parallel_size
while (after % multiple) != 0:
after += 1
if args.rank == 0:
print(' > padded vocab (size: {}) with {} dummy tokens '
'(new size: {})'.format(
orig_vocab_size, after - orig_vocab_size, after), flush=True)
return after
class AbstractTokenizer(ABC): class AbstractTokenizer(ABC):
......
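To make the padding rule above concrete: _vocab_size_with_padding grows the vocabulary to the next multiple of make_vocab_size_divisible_by * model_parallel_size, so the embedding table splits evenly across model-parallel ranks and keeps a GPU-friendly size. A self-contained sketch of the same arithmetic; the parameter defaults here are only examples:

    def pad_vocab_size(orig_vocab_size, make_vocab_size_divisible_by=128,
                       model_parallel_size=2):
        # Grow the vocab to the next multiple, mirroring the loop in
        # _vocab_size_with_padding above.
        multiple = make_vocab_size_divisible_by * model_parallel_size
        after = orig_vocab_size
        while after % multiple != 0:
            after += 1
        return after

    # The lower-cased BERT WordPiece vocab has 30522 entries; with the example
    # settings above it is padded with 198 dummy tokens to 30720 (= 120 * 256).
    print(pad_vocab_size(30522))  # -> 30720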
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Megatron global variables."""
import os
import sys
from megatron.data.tokenizer import build_tokenizer
from .arguments import parse_args
from .utils import Timers
_GLOBAL_ARGS = None
_GLOBAL_TOKENIZER = None
_GLOBAL_TENSORBOARD_WRITER = None
_GLOBAL_ADLR_AUTORESUME = None
_GLOBAL_TIMERS = None
def get_args():
"""Return arguments."""
_ensure_var_is_initialized(_GLOBAL_ARGS, 'args')
return _GLOBAL_ARGS
def get_tokenizer():
"""Return tokenizer."""
_ensure_var_is_initialized(_GLOBAL_TOKENIZER, 'tokenizer')
return _GLOBAL_TOKENIZER
def get_tensorboard_writer():
"""Return tensorboard writer. It can be None so no need
to check if it is initialized."""
return _GLOBAL_TENSORBOARD_WRITER
def get_adlr_autoresume():
"""ADLR autoresume object. It can be None so no need
to check if it is initialized."""
return _GLOBAL_ADLR_AUTORESUME
def get_timers():
"""Return timers."""
_ensure_var_is_initialized(_GLOBAL_TIMERS, 'timers')
return _GLOBAL_TIMERS
def set_global_variables(extra_args_provider=None):
"""Set args, tokenizer, tensorboard-writer, adlr-autoresume, and timers."""
_parse_args(extra_args_provider=extra_args_provider)
_build_tokenizer()
_set_tensorboard_writer()
_set_adlr_autoresume()
_set_timers()
def _parse_args(extra_args_provider=None):
"""Parse entire arguments."""
global _GLOBAL_ARGS
_ensure_var_is_not_initialized(_GLOBAL_ARGS, 'args')
_GLOBAL_ARGS = parse_args(extra_args_provider=extra_args_provider)
def _build_tokenizer():
"""Initialize tokenizer."""
global _GLOBAL_TOKENIZER
_ensure_var_is_not_initialized(_GLOBAL_TOKENIZER, 'tokenizer')
_GLOBAL_TOKENIZER = build_tokenizer()
def _set_tensorboard_writer():
"""Set tensorboard writer."""
global _GLOBAL_TENSORBOARD_WRITER
_ensure_var_is_not_initialized(_GLOBAL_TENSORBOARD_WRITER,
'tensorboard writer')
args = get_args()
if hasattr(args, 'tensorboard_dir') and \
args.tensorboard_dir and args.rank == 0:
try:
from torch.utils.tensorboard import SummaryWriter
print('> setting tensorboard ...')
_GLOBAL_TENSORBOARD_WRITER = SummaryWriter(
log_dir=args.tensorboard_dir)
except ModuleNotFoundError:
print('WARNING: TensorBoard writing requested but is not '
'available (are you using PyTorch 1.1.0 or later?), '
'no TensorBoard logs will be written.', flush=True)
def _set_adlr_autoresume():
"""Initialize ADLR autoresume."""
global _GLOBAL_ADLR_AUTORESUME
_ensure_var_is_not_initialized(_GLOBAL_ADLR_AUTORESUME, 'adlr autoresume')
args = get_args()
if args.adlr_autoresume:
if args.rank == 0:
print('enabling autoresume ...', flush=True)
sys.path.append(os.environ.get('SUBMIT_SCRIPTS', '.'))
try:
from userlib.auto_resume import AutoResume
except:
print('ADLR autoresume is not available, exiting ...')
sys.exit()
_GLOBAL_ADLR_AUTORESUME = AutoResume
def _set_timers():
"""Initialize timers."""
global _GLOBAL_TIMERS
_ensure_var_is_not_initialized(_GLOBAL_TIMERS, 'timers')
_GLOBAL_TIMERS = Timers()
def _ensure_var_is_initialized(var, name):
"""Make sure the input variable is not None."""
assert var is not None, '{} is not initialized.'.format(name)
def _ensure_var_is_not_initialized(var, name):
"""Make sure the input variable is not None."""
assert var is None, '{} is already initialized.'.format(name)
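A minimal usage sketch of the new global-state module. The import path below is an assumption (this view does not show where the new file lives in the package), and the entry-point scaffolding is illustrative; only set_global_variables, get_args, get_tokenizer, args.rank, args.padded_vocab_size, and tokenizer.vocab_size come from the diff itself:

    # NOTE: the module path is assumed; the commit view does not show the
    # new file's location inside the megatron package.
    from megatron.global_vars import (set_global_variables, get_args,
                                      get_tokenizer)

    def main():
        # Parse arguments, build the tokenizer, and create the other globals
        # exactly once at program start.
        set_global_variables()

        # Any later code can now fetch shared state instead of threading
        # `args` through every call.
        args = get_args()
        tokenizer = get_tokenizer()
        if args.rank == 0:
            print('raw vocab size:    {}'.format(tokenizer.vocab_size))
            print('padded vocab size: {}'.format(args.padded_vocab_size))

    if __name__ == '__main__':
        main()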
@@ -22,7 +22,7 @@ import torch
 from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
 from apex.optimizers import FusedAdam as Adam

-from arguments import get_args
+from megatron.arguments import get_args
 from megatron import mpu
 from megatron.fp16 import FP16_Module
 from megatron.fp16 import FP16_Optimizer

@@ -129,7 +129,7 @@ def initialize_megatron(message, args):
     initialize_distributed(args)
     if torch.distributed.get_rank() == 0:
         print(message, flush=True)
         print_args(args, writer)

     # Autoresume.
     torch.distributed.barrier()
......
@@ -150,13 +150,15 @@ def check_adlr_autoresume_termination(iteration, model, optimizer,

 def print_args(args, writer=None):
     """Print arguments."""
-    print('arguments:', flush=True)
+    print_rank_0('arguments:')
+    str_list = []
     for arg in vars(args):
         dots = '.' * (29 - len(arg))
-        print(' {} {} {}'.format(arg, dots, getattr(args, arg)), flush=True)
+        str_list.append(' {} {} {}'.format(arg, dots, getattr(args, arg)))
         if writer:
             writer.add_text(arg, str(getattr(args, arg)))
+    for arg in sorted(str_list, key=lambda a: a.lower()):
+        print_rank_0(arg)


 def print_params_min_max_norm(optimizer, iteration):

@@ -290,6 +292,7 @@ def initialize_distributed(args):
     device = args.rank % torch.cuda.device_count()
     if args.local_rank is not None:
         device = args.local_rank
     torch.cuda.set_device(device)

     # Call the init process
     init_method = 'tcp://'
......
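The print_args change above buffers the formatted lines and then emits them in case-insensitive alphabetical order through print_rank_0, so only rank 0 prints the dump and the listing is easy to scan. A standalone sketch of the same idea, with print_rank_0 stubbed out since its definition is outside this diff:

    from types import SimpleNamespace

    def print_rank_0(message):
        # Stand-in for Megatron's rank-0-only print helper.
        print(message, flush=True)

    def print_args_sketch(args):
        print_rank_0('arguments:')
        str_list = []
        for arg in vars(args):
            dots = '.' * (29 - len(arg))
            str_list.append(' {} {} {}'.format(arg, dots, getattr(args, arg)))
        # Case-insensitive sort keeps the listing alphabetical regardless of
        # how the flag names are capitalized.
        for line in sorted(str_list, key=lambda s: s.lower()):
            print_rank_0(line)

    print_args_sketch(SimpleNamespace(seed=1234, rank=0, DDP_impl='torch'))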