Commit 83aa9219 authored by Mohammad

added global variables

parent 599e959a
@@ -23,6 +23,7 @@ import torch
 _GLOBAL_ARGS = None

 def parse_args(extra_args_provider=None):
     global _GLOBAL_ARGS

@@ -200,6 +201,8 @@ def add_mixed_precision_args(parser):
 def add_distributed_args(parser):
     group = parser.add_argument_group(title='mixed precision')
+    group.add_argument('--model-parallel-size', type=int, default=1,
+                       help='Size of the model parallel.')
     group.add_argument('--distributed-backend', default='nccl',
                        choices=['nccl', 'gloo'],
                        help='Which backend to use for distributed training.')

@@ -389,8 +392,6 @@ def add_data_args_(parser):
     group = parser.add_argument_group('data', 'data configurations')
-    group.add_argument('--model-parallel-size', type=int, default=1,
-                       help='size of the model parallel.')
     group.add_argument('--shuffle', action='store_true',
                        help='Shuffle data. Shuffling is deterministic '
                             'based on seed and current epoch.')
......
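For context, a quick sketch of what the relocated flag means for parsing: --model-parallel-size is now registered in the distributed-arguments group rather than the data group, but the command line itself is unchanged. The snippet below is illustrative only, a standalone argparse stand-in rather than the actual Megatron parser:

    import argparse

    # Stand-in parser mirroring the two flags touched by this diff.
    parser = argparse.ArgumentParser(description='sketch of Megatron arguments')
    group = parser.add_argument_group(title='distributed')
    group.add_argument('--model-parallel-size', type=int, default=1,
                       help='Size of the model parallel.')
    group.add_argument('--distributed-backend', default='nccl',
                       choices=['nccl', 'gloo'],
                       help='Which backend to use for distributed training.')

    args = parser.parse_args(['--model-parallel-size', '2'])
    assert args.model_parallel_size == 2
    assert args.distributed_backend == 'nccl'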
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Megatron tokenizer.""" """Megatron tokenizer."""
from abc import ABC from abc import ABC
from abc import abstractmethod from abc import abstractmethod
from megatron.utils import vocab_size_with_padding from megatron.arguments import get_args
from .bert_tokenization import FullTokenizer as FullBertTokenizer from .bert_tokenization import FullTokenizer as FullBertTokenizer
def add_tokenizer_to_args(args, tokenizer_type): def build_tokenizer():
"""Instantiate tokenizer based on input type and add it to args.""" """Initialize tokenizer."""
# Retrieve args.
args = get_args()
if args.rank == 0:
print('building {} tokenizer ...'.format(args.tokenizer_type),
flush=True)
# Make sure we have not already called this method.
if hasattr(args, 'tokenizer'):
raise Exception('args already has a tokenizer')
# Select and instantiate the tokenizer. # Select and instantiate the tokenizer.
if tokenizer_type == 'BertWordPieceLowerCase': if args.tokenizer_type == 'BertWordPieceLowerCase':
args.tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab, tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file,
lower_case=True) lower_case=True)
else: else:
raise NotImplementedError('{} tokenizer is not ' raise NotImplementedError('{} tokenizer is not '
'implemented.'.format(tokenizer_type)) 'implemented.'.format(args.tokenizer_type))
# Add vocab size. # Add vocab size.
args.vocab_size = vocab_size_with_padding(args.tokenizer.vocab_size, args) args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size)
return tokenizer
def _vocab_size_with_padding(orig_vocab_size):
"""Pad vocab size so it is divisible by model parallel size and
still having GPU friendly size."""
args = get_args()
after = orig_vocab_size
multiple = args.make_vocab_size_divisible_by * \
args.model_parallel_size
while (after % multiple) != 0:
after += 1
if args.rank == 0:
print(' > padded vocab (size: {}) with {} dummy tokens '
'(new size: {})'.format(
orig_vocab_size, after - orig_vocab_size, after), flush=True)
return after
class AbstractTokenizer(ABC): class AbstractTokenizer(ABC):
......
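To make the padding rule above concrete: _vocab_size_with_padding grows the vocabulary to the next multiple of make_vocab_size_divisible_by * model_parallel_size, so the embedding table splits evenly across model-parallel ranks and keeps a GPU-friendly size. A self-contained sketch of the same arithmetic; the parameter defaults here are only examples:

    def pad_vocab_size(orig_vocab_size, make_vocab_size_divisible_by=128,
                       model_parallel_size=2):
        # Grow the vocab to the next multiple, mirroring the loop in
        # _vocab_size_with_padding above.
        multiple = make_vocab_size_divisible_by * model_parallel_size
        after = orig_vocab_size
        while after % multiple != 0:
            after += 1
        return after

    # The lower-cased BERT WordPiece vocab has 30522 entries; with the example
    # settings above it is padded with 198 dummy tokens to 30720 (= 120 * 256).
    print(pad_vocab_size(30522))  # -> 30720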
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Megatron global variables."""
import os
import sys
from megatron.data.tokenizer import build_tokenizer
from .arguments import parse_args
from .utils import Timers
_GLOBAL_ARGS = None
_GLOBAL_TOKENIZER = None
_GLOBAL_TENSORBOARD_WRITER = None
_GLOBAL_ADLR_AUTORESUME = None
_GLOBAL_TIMERS = None
def get_args():
"""Return arguments."""
_ensure_var_is_initialized(_GLOBAL_ARGS, 'args')
return _GLOBAL_ARGS
def get_tokenizer():
"""Return tokenizer."""
_ensure_var_is_initialized(_GLOBAL_TOKENIZER, 'tokenizer')
return _GLOBAL_TOKENIZER
def get_tensorboard_writer():
"""Return tensorboard writer. It can be None so no need
to check if it is initialized."""
return _GLOBAL_TENSORBOARD_WRITER
def get_adlr_autoresume():
"""ADLR autoresume object. It can be None so no need
to check if it is initialized."""
return _GLOBAL_ADLR_AUTORESUME
def get_timers():
"""Return timers."""
_ensure_var_is_initialized(_GLOBAL_TIMERS, 'timers')
return _GLOBAL_TIMERS
def set_global_variables(extra_args_provider=None):
"""Set args, tokenizer, tensorboard-writer, adlr-autoresume, and timers."""
_parse_args(extra_args_provider=extra_args_provider)
_build_tokenizer()
_set_tensorboard_writer()
_set_adlr_autoresume()
_set_timers()
def _parse_args(extra_args_provider=None):
"""Parse entire arguments."""
global _GLOBAL_ARGS
_ensure_var_is_not_initialized(_GLOBAL_ARGS, 'args')
_GLOBAL_ARGS = parse_args(extra_args_provider=extra_args_provider)
def _build_tokenizer():
"""Initialize tokenizer."""
global _GLOBAL_TOKENIZER
_ensure_var_is_not_initialized(_GLOBAL_TOKENIZER, 'tokenizer')
_GLOBAL_TOKENIZER = build_tokenizer()
def _set_tensorboard_writer():
"""Set tensorboard writer."""
global _GLOBAL_TENSORBOARD_WRITER
_ensure_var_is_not_initialized(_GLOBAL_TENSORBOARD_WRITER,
'tensorboard writer')
args = get_args()
if hasattr(args, 'tensorboard_dir') and \
args.tensorboard_dir and args.rank == 0:
try:
from torch.utils.tensorboard import SummaryWriter
print('> setting tensorboard ...')
_GLOBAL_TENSORBOARD_WRITER = SummaryWriter(
log_dir=args.tensorboard_dir)
except ModuleNotFoundError:
print('WARNING: TensorBoard writing requested but is not '
'available (are you using PyTorch 1.1.0 or later?), '
'no TensorBoard logs will be written.', flush=True)
def _set_adlr_autoresume():
"""Initialize ADLR autoresume."""
global _GLOBAL_ADLR_AUTORESUME
_ensure_var_is_not_initialized(_GLOBAL_ADLR_AUTORESUME, 'adlr autoresume')
args = get_args()
if args.adlr_autoresume:
if args.rank == 0:
print('enabling autoresume ...', flush=True)
sys.path.append(os.environ.get('SUBMIT_SCRIPTS', '.'))
try:
from userlib.auto_resume import AutoResume
except:
print('ADLR autoresume is not available, exiting ...')
sys.exit()
_GLOBAL_ADLR_AUTORESUME = AutoResume
def _set_timers():
"""Initialize timers."""
global _GLOBAL_TIMERS
_ensure_var_is_not_initialized(_GLOBAL_TIMERS, 'timers')
_GLOBAL_TIMERS = Timers()
def _ensure_var_is_initialized(var, name):
"""Make sure the input variable is not None."""
assert var is not None, '{} is not initialized.'.format(name)
def _ensure_var_is_not_initialized(var, name):
"""Make sure the input variable is not None."""
assert var is None, '{} is already initialized.'.format(name)
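A minimal usage sketch of the new global-state module. The import path below is an assumption (this view does not show where the new file lives in the package), and the entry-point scaffolding is illustrative; only set_global_variables, get_args, get_tokenizer, args.rank, args.padded_vocab_size, and tokenizer.vocab_size come from the diff itself:

    # NOTE: the module path is assumed; the commit view does not show the
    # new file's location inside the megatron package.
    from megatron.global_vars import (set_global_variables, get_args,
                                      get_tokenizer)

    def main():
        # Parse arguments, build the tokenizer, and create the other globals
        # exactly once at program start.
        set_global_variables()

        # Any later code can now fetch shared state instead of threading
        # `args` through every call.
        args = get_args()
        tokenizer = get_tokenizer()
        if args.rank == 0:
            print('raw vocab size:    {}'.format(tokenizer.vocab_size))
            print('padded vocab size: {}'.format(args.padded_vocab_size))

    if __name__ == '__main__':
        main()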
@@ -22,7 +22,7 @@ import torch
 from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
 from apex.optimizers import FusedAdam as Adam

-from arguments import get_args
+from megatron.arguments import get_args
 from megatron import mpu
 from megatron.fp16 import FP16_Module
 from megatron.fp16 import FP16_Optimizer

@@ -129,7 +129,7 @@ def initialize_megatron(message, args):
     initialize_distributed(args)
     if torch.distributed.get_rank() == 0:
         print(message, flush=True)
         print_args(args, writer)

     # Autoresume.
     torch.distributed.barrier()
......
@@ -150,13 +150,15 @@ def check_adlr_autoresume_termination(iteration, model, optimizer,

 def print_args(args, writer=None):
     """Print arguments."""
-    print('arguments:', flush=True)
+    print_rank_0('arguments:')
+    str_list = []
     for arg in vars(args):
         dots = '.' * (29 - len(arg))
-        print(' {} {} {}'.format(arg, dots, getattr(args, arg)), flush=True)
+        str_list.append(' {} {} {}'.format(arg, dots, getattr(args, arg)))
         if writer:
             writer.add_text(arg, str(getattr(args, arg)))
+    for arg in sorted(str_list, key=lambda a: a.lower()):
+        print_rank_0(arg)


 def print_params_min_max_norm(optimizer, iteration):

@@ -290,6 +292,7 @@ def initialize_distributed(args):
     device = args.rank % torch.cuda.device_count()
     if args.local_rank is not None:
         device = args.local_rank
     torch.cuda.set_device(device)

     # Call the init process
     init_method = 'tcp://'
......
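The print_args change above buffers the formatted lines and then emits them in case-insensitive alphabetical order through print_rank_0, so only rank 0 prints the dump and the listing is easy to scan. A standalone sketch of the same idea, with print_rank_0 stubbed out since its definition is outside this diff:

    from types import SimpleNamespace

    def print_rank_0(message):
        # Stand-in for Megatron's rank-0-only print helper.
        print(message, flush=True)

    def print_args_sketch(args):
        print_rank_0('arguments:')
        str_list = []
        for arg in vars(args):
            dots = '.' * (29 - len(arg))
            str_list.append(' {} {} {}'.format(arg, dots, getattr(args, arg)))
        # Case-insensitive sort keeps the listing alphabetical regardless of
        # how the flag names are capitalized.
        for line in sorted(str_list, key=lambda s: s.lower()):
            print_rank_0(line)

    print_args_sketch(SimpleNamespace(seed=1234, rank=0, DDP_impl='torch'))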