Commit be5300b7 authored by Neel Kant

Merge branch 'staging_optional_args' into 'staging'

Made size arguments optional so they can be set from the input function

See merge request ADLR/megatron-lm!53
parents 7d75b3b5 8e8e4548
"""
example usage:
python scripts/run_gpt2_eval.py \
--model-parallel-size 1 \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--model-path <gpt2_117_path> \
--data-path <wikitext_tokens_test_path> \
--batch-size 16 \
--cache-dir <cache dir path>
"""
import argparse
import subprocess
parser = argparse.ArgumentParser('run zero shot GPT2 eval')
parser.add_argument('--model-path', type=str, required=True,
                    help='Saved model path for evaluation')
parser.add_argument('--batch-size', type=int, default=4,
                    help='batch size to use for evaluation')
parser.add_argument('--num-attention-heads', type=int, default=12,
                    help='number of transformer attention heads')
parser.add_argument('--hidden-size', type=int, default=768,
                    help='transformer hidden size')
parser.add_argument('--num-layers', type=int, default=12,
                    help='number of decoder layers')
parser.add_argument('--data-path', type=str, required=True,
                    help='Data path for evaluation data')
parser.add_argument('--cache-dir', type=str, default=None,
                    help='cache directory path (accepted so the example usage '
                         'above parses; not forwarded to main.py)')
parser.add_argument('--cloze-eval', action='store_true',
                    help='Run lambada cloze eval instead of perplexity eval.')
parser.add_argument('--easy-lambada', action='store_true',
                    help='use easier formulation of lambada')
parser.add_argument('--model-parallel-size', type=int, default=1,
                    help='model parallel size to use')
args = parser.parse_args()
multinode_args = ''
# Use torch.distributed.launch to spawn one process per model-parallel rank.
if args.model_parallel_size > 1:
    multinode_args += ' -m torch.distributed.launch --nproc_per_node {} '.format(args.model_parallel_size)
CMD = ' --model-parallel-size {model_par} \
       --num-layers {nlayers} \
       --hidden-size {hidden} \
       --log-interval 100 \
       --load {model} \
       --batch-size {batch} \
       --num-attention-heads {natt} \
       --seq-length 1024 \
       --max-position-embeddings 1024 \
       --tokenizer-type GPT2BPETokenizer \
       --distributed-backend nccl \
       --hidden-dropout 0.1 \
       --attention-dropout 0.1 \
       --fp16 \
       --lr 1 --no-load-optim --no-load-rng --epochs 0 \
       --overlapping-eval 32 \
       --merge-file /home/universal-lm-data.cosmos549/repos/megatron_latest/vocab_cache/merges.txt \
       --vocab-file /home/universal-lm-data.cosmos549/repos/megatron_latest/vocab_cache/vocab.json'.format(
           model_par=args.model_parallel_size,
           nlayers=args.num_layers,
           hidden=args.hidden_size,
           model=args.model_path,
           batch=args.batch_size,
           natt=args.num_attention_heads)
if args.cloze_eval:
    CMD += ' --valid-data {} '.format(args.data_path)
    CMD += ' --task LAMBADA '
    if not args.easy_lambada:
        CMD += ' --strict-lambada '
    CMD = 'main.py' + CMD
    print('Running Lambada Eval Command:', flush=True)
else:
    CMD += ' --valid-data {} '.format(args.data_path)
    CMD += ' --task WIKITEXT103 '
    CMD = 'main.py' + CMD
    print('Running PPL Eval Command:', flush=True)

CMD = 'python3 ' + multinode_args + CMD
print(CMD, flush=True)
subprocess.call(CMD.split())
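Note that the script assembles the whole invocation as a single string and then calls subprocess.call(CMD.split()); this works because none of the flag values above contain spaces. For illustration only, a minimal list-based sketch of the same launch pattern (hypothetical checkpoint and data paths) that avoids the whitespace assumption:

import subprocess

# Sketch of the same launch as an argument list (hypothetical values); unlike
# CMD.split(), this stays correct even if a path contains spaces.
model_parallel_size = 2  # example value
argv = ['python3']
if model_parallel_size > 1:
    argv += ['-m', 'torch.distributed.launch',
             '--nproc_per_node', str(model_parallel_size)]
argv += ['main.py',
         '--model-parallel-size', str(model_parallel_size),
         '--num-layers', '12',
         '--hidden-size', '768',
         '--num-attention-heads', '12',
         '--load', '/path/to/gpt2_117',              # hypothetical checkpoint dir
         '--valid-data', '/path/to/wikitext_test',   # hypothetical data path
         '--task', 'WIKITEXT103']
subprocess.call(argv)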
@@ -35,8 +35,6 @@ def parse_args(extra_args_provider=None, defaults={}):
     parser = _add_validation_args(parser)
     parser = _add_data_args(parser)
     parser = _add_autoresume_args(parser)
-    # TODO: Refactor
-    parser = _add_gpt2_args(parser)
 
     # Custom arguments.
     if extra_args_provider is not None:
@@ -54,6 +52,12 @@ def parse_args(extra_args_provider=None, defaults={}):
             'defaults can only be overwritten for args with None values.'
         setattr(args, key, defaults[key])
 
+    # Check required arguments.
+    required_args = ['num_layers', 'hidden_size', 'num_attention_heads',
+                     'max_position_embeddings']
+    for req_arg in required_args:
+        _check_arg_is_not_none(args, req_arg)
+
     # Distributed args.
     args.rank = int(os.getenv('RANK', '0'))
     args.world_size = int(os.getenv("WORLD_SIZE", '1'))
@@ -93,16 +97,20 @@ def _print_args(args):
     print('---------------- end of arguments ----------------', flush=True)
 
 
+def _check_arg_is_not_none(args, arg):
+    assert getattr(args, arg) is not None, '{} argument is None'.format(arg)
+
+
 def _add_network_size_args(parser):
     group = parser.add_argument_group(title='network size')
 
-    group.add_argument('--num-layers', type=int, required=True,
+    group.add_argument('--num-layers', type=int, default=None,
                        help='Number of transformer layers.')
-    group.add_argument('--hidden-size', type=int, required=True,
+    group.add_argument('--hidden-size', type=int, default=None,
                        help='Tansformer hidden size.')
-    group.add_argument('--num-attention-heads', type=int, required=True,
+    group.add_argument('--num-attention-heads', type=int, default=None,
                        help='Number of transformer attention heads.')
-    group.add_argument('--max-position-embeddings', type=int, required=True,
+    group.add_argument('--max-position-embeddings', type=int, default=None,
                        help='Maximum number of position embeddings to use. '
                             'This is the size of position embedding.')
     group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
@@ -342,28 +350,3 @@ def _add_autoresume_args(parser):
                            'termination signal')
 
     return parser
-
-
-########################################################################
-
-
-def _add_gpt2_args(parser):
-    group = parser.add_argument_group(title='gpt2')
-
-    group.add_argument('--input-data-sizes-file', type=str, default='sizes.txt',
-                       help='The filename containing all the shards '
-                            'sizes for numpy data loader')
-
-    return parser
-
-
-def add_data_args_(parser):
-    """Train/valid/test data arguments."""
-
-    group = parser.add_argument_group('data', 'data configurations')
-
-    group.add_argument('--data-loader', type=str, default=None,
-                       choices=['raw', 'lazy', 'tfrecords', 'numpy', 'binary'],
-                       help='Which data loader to use. Default varies by model.')
-
-    return parser
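
The net effect of the change, per the hunks above: the four network-size arguments now default to None, a caller can fill them in through parse_args's defaults dict (which only overwrites args that are still None), and the new required-arguments check then asserts that they ended up set. A minimal sketch of how a calling script might use this, assuming parse_args is importable as shown (module path assumed) and that the size flags were not given on the command line:

# Sketch only: assumes the module path below and that --num-layers etc. were
# left unset on the command line, so they are still None after parsing.
from megatron.arguments import parse_args

args = parse_args(defaults={'num_layers': 12,
                            'hidden_size': 768,
                            'num_attention_heads': 12,
                            'max_position_embeddings': 1024})

# If any of the four size arguments were still None at this point, the new
# check would fail with, e.g., "num_layers argument is None".
print(args.num_layers, args.hidden_size)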