Commit ba2264ab authored by Raul Puri

verified zeroshot tasks works

parent 159fcfeb
@@ -30,19 +30,8 @@ parser.add_argument('--cloze-eval', action='store_true',
                     help='Run lambada cloze eval instead of perplexity eval.')
 parser.add_argument('--easy-lambada', action='store_true',
                     help='use easier formulation of lambada')
-parser.add_argument('--webtext-eval', action='store_true',
-                    help='Run webtext PPL eval instead of wikitext PPL eval.')
-parser.add_argument('--eval-iters', default=5000, type=int,
-                    help='number of iterations to run webtext evaluation')
 parser.add_argument('--model-parallel-size', type=int, default=1,
                     help='model parallel size to use')
-parser.add_argument('--load-openai', action='store_true',
-                    help='Load weights from saved openai/hf checkpoints')
-parser.add_argument('--cache-dir', type=str, default='cache',
-                    help='directory to cache gpt2 tokenizers')
-parser.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
-                    help='Pad the vocab size to be divisible by this value.'
-                         'This is added for computational efficieny reasons.')
 args = parser.parse_args()
 multinode_args = ''
@@ -54,43 +43,36 @@ CMD = ' --model-parallel-size {model_par} \
        --hidden-size {hidden} \
        --log-interval 100 \
        --load {model} \
-       --eval-batch-size {batch} \
+       --batch-size {batch} \
        --num-attention-heads {natt} \
        --seq-length 1024 \
        --max-position-embeddings 1024 \
        --tokenizer-type GPT2BPETokenizer \
-       --text-key text \
        --distributed-backend nccl \
        --hidden-dropout 0.1 \
        --attention-dropout 0.1 \
        --fp16 \
+       --lr 1 --no-load-optim --no-load-rng --epochs 0 \
        --overlapping-eval 32 \
-       --make-vocab-size-divisible-by {make_vocab_size_divisible_by} \
-       --cache-dir {cache} '.format(model_par=args.model_parallel_size,
+       --merge-file /home/universal-lm-data.cosmos549/repos/megatron_latest/vocab_cache/merges.txt \
+       --vocab-file /home/universal-lm-data.cosmos549/repos/megatron_latest/vocab_cache/vocab.json'.format(model_par=args.model_parallel_size,
             nlayers=args.num_layers,
             hidden=args.hidden_size,
             model=args.model_path,
             batch=args.batch_size,
-            natt=args.num_attention_heads,
-            make_vocab_size_divisible_by=args.make_vocab_size_divisible_by,
-            cache=args.cache_dir)
-if args.load_openai:
-    CMD += ' --load-openai '
+            natt=args.num_attention_heads,)
 if args.cloze_eval:
     CMD += ' --valid-data {} '.format(args.data_path)
-    CMD += ' --cloze-eval '
+    CMD += ' --task LAMBADA '
     if not args.easy_lambada:
         CMD += ' --strict-lambada '
-    CMD = 'evaluate_gpt2.py' + CMD
+    CMD = 'main.py' + CMD
     print('Running Lambada Eval Command:', flush=True)
-elif args.webtext_eval:
-    CMD += '--train-iters 0 --eval-iters {} --test-data {} --loose-json '.format(args.eval_iters, args.data_path)
-    CMD = 'pretrain_gpt2.py' + CMD
-    print('Running Webtext Eval Command:', flush=True)
 else:
     CMD += ' --valid-data {} '.format(args.data_path)
-    CMD = 'evaluate_gpt2.py' + CMD
+    CMD += ' --task WIKITEXT103 '
+    CMD = 'main.py' + CMD
     print('Running PPL Eval Command:', flush=True)
 CMD = 'python3 '+multinode_args+CMD
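
Taken together, the two hunks above retire the `evaluate_gpt2.py` / `pretrain_gpt2.py` entry points (along with the webtext, OpenAI-checkpoint, and cache-dir plumbing that only they needed): both zeroshot evaluations now run through `main.py` and are selected with `--task LAMBADA` or `--task WIKITEXT103`, with the tokenizer supplied via `--vocab-file`/`--merge-file` instead of a cache directory. A minimal sketch of the new dispatch logic; the model sizes, checkpoint path, and vocab paths below are placeholders, not the values hard-coded in the script:

```python
# Sketch of the updated launch logic; GPT2_ARGS and all paths are illustrative.
GPT2_ARGS = (' --model-parallel-size 1 --num-layers 24 --hidden-size 1024'
             ' --num-attention-heads 16 --batch-size 8 --seq-length 1024'
             ' --max-position-embeddings 1024 --tokenizer-type GPT2BPETokenizer'
             ' --fp16 --overlapping-eval 32 --load checkpoints/gpt2-345m'
             ' --merge-file vocab_cache/merges.txt --vocab-file vocab_cache/vocab.json'
             ' --lr 1 --no-load-optim --no-load-rng --epochs 0')

def build_eval_command(data_path, cloze_eval=False, easy_lambada=False):
    cmd = GPT2_ARGS + ' --valid-data {} '.format(data_path)
    if cloze_eval:
        # LAMBADA last-word accuracy; --strict-lambada keeps the harder,
        # whole-word formulation of the task.
        cmd += ' --task LAMBADA '
        if not easy_lambada:
            cmd += ' --strict-lambada '
    else:
        # WikiText-103 perplexity with overlapping evaluation windows.
        cmd += ' --task WIKITEXT103 '
    return 'python3 main.py' + cmd

print(build_eval_command('lambada_test.jsonl', cloze_eval=True))
```
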
@@ -132,7 +132,7 @@ def _build_lambada_dataset():
     tokenizer = get_tokenizer()
     assert len(args.valid_data) == 1
-    val_dataset = _LambadaDataset(args.valid_data, tokenizer.eod, tokenizer,
+    val_dataset = _LambadaDataset(args.valid_data[0], tokenizer.eod, tokenizer,
                                   args.seq_length, args.strict_lambada)
     print_rank_0(' > found {} samples.'.format(len(val_dataset)))
@@ -145,10 +145,10 @@ def _build_wikitext103_dataset():
     tokenizer = get_tokenizer()
     assert len(args.valid_data) == 1
-    with open(args.valid_data, "rb") as reader:
+    with open(args.valid_data[0], "rb") as reader:
         entire_data = reader.read().decode('utf-8')
     num_original_tokens = len(entire_data.strip().split(" "))
-    entire_data = get_detokenizer(args.valid_data)(entire_data)
+    entire_data = get_detokenizer(args.valid_data[0])(entire_data)
     tokenized_data = tokenizer.tokenize(entire_data)
     num_tokenized_tokens = len(tokenized_data)
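
Both dataset builders need the same one-character fix for the same reason: `--valid-data` comes out of the argument parser as a list of paths (which is exactly what the `assert len(args.valid_data) == 1` is guarding), so the single path has to be indexed out before it reaches `open()` or the dataset constructor. A small illustration; the `nargs='*'` here is an assumption about how the option is declared, not something shown in this diff:

```python
import argparse

parser = argparse.ArgumentParser()
# Assumed to mirror Megatron's --valid-data option, which can take several paths.
parser.add_argument('--valid-data', nargs='*', default=None)

args = parser.parse_args(['--valid-data', 'lambada_test.jsonl'])
print(args.valid_data)     # ['lambada_test.jsonl']  -- a list, not a path
print(args.valid_data[0])  # 'lambada_test.jsonl'    -- what open() and _LambadaDataset expect
```
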
@@ -19,7 +19,7 @@ import re
 def ptb_detokenizer(string):
     string = string.replace(" '", "'")
     string = string.replace(" \n", "\n")
     string = string.replace("\n ", "\n")
     string = string.replace(" n't", "n't")
@@ -75,7 +75,7 @@ _DETOKENIZERS = {
 def get_detokenizer(path):
-    for key in DETOKENIZERS.keys():
+    for key in _DETOKENIZERS.keys():
         if key in path:
             print(key)
             return _DETOKENIZERS[key]
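
The second hunk in the detokenizer module is a straight NameError fix: the registry is named `_DETOKENIZERS` (with the leading underscore), so iterating over `DETOKENIZERS.keys()` would crash the WikiText-103 path before any detokenization happened. The lookup itself is just a substring match of known dataset names against the file path; a self-contained sketch of that pattern, with illustrative rather than the module's actual dictionary entries:

```python
def ptb_detokenizer(string):
    # Undo a few PTB-style tokenization artifacts (subset of the real function).
    string = string.replace(" '", "'")
    string = string.replace(" n't", "n't")
    return string

def wikitext_detokenizer(string):
    # Illustrative stand-in for the real WikiText detokenizer.
    return string.replace(" @-@ ", "-")

_DETOKENIZERS = {
    'ptb': ptb_detokenizer,
    'wikitext': wikitext_detokenizer,
}

def get_detokenizer(path):
    # Pick a detokenizer by matching a known key against the dataset path.
    for key in _DETOKENIZERS.keys():   # the commit's fix: _DETOKENIZERS, not DETOKENIZERS
        if key in path:
            return _DETOKENIZERS[key]
    return None

print(get_detokenizer('/data/wikitext-103/wiki.valid.tokens').__name__)  # wikitext_detokenizer
```
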
@@ -29,7 +29,7 @@ from megatron.training import get_model
 from megatron.utils import get_ltor_masks_and_position_ids
 from tasks.finetune_utils import build_data_loader
-from .dataset import build_dataset
+from .datasets import build_dataset
 def get_model_provider(eval_metric):
@@ -71,7 +71,8 @@ def process_batch(batch):
         tokenizer.eod,
         args.reset_position_ids,
         args.reset_attention_mask,
-        args.eod_mask_loss)
+        args.eod_mask_loss,
+        args.fp16)
     return tokens, labels, attention_mask, position_ids, loss_mask
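
The last hunk threads `args.fp16` into `get_ltor_masks_and_position_ids`, which builds the causal (left-to-right) attention mask, the loss mask, and the position ids for a batch of token ids; presumably the extra flag controls the precision of the returned mask. A heavily simplified stand-in for what such a helper computes (this is not Megatron's implementation, and the `fp16` handling is an assumption):

```python
import torch

def get_ltor_masks_and_position_ids(tokens, eod_token, eod_mask_loss=False, fp16=False):
    """Simplified sketch: causal mask, loss mask, and position ids for
    `tokens` of shape [batch, seq_len]."""
    batch_size, seq_len = tokens.size()

    # Lower-triangular causal mask: position i may only attend to positions <= i.
    attention_mask = torch.tril(
        torch.ones(seq_len, seq_len, device=tokens.device)).unsqueeze(0)
    attention_mask = attention_mask.expand(batch_size, -1, -1)
    if fp16:
        # Assumed purpose of the new argument: hand the mask back in half precision.
        attention_mask = attention_mask.half()

    # Loss mask: optionally skip the loss on end-of-document tokens.
    loss_mask = torch.ones_like(tokens, dtype=torch.float)
    if eod_mask_loss:
        loss_mask[tokens == eod_token] = 0.0

    # Position ids 0..seq_len-1 for every sample in the batch.
    position_ids = torch.arange(seq_len, device=tokens.device)
    position_ids = position_ids.unsqueeze(0).expand_as(tokens)

    return attention_mask, loss_mask, position_ids

tokens = torch.randint(0, 100, (2, 8))
mask, loss_mask, pos = get_ltor_masks_and_position_ids(tokens, eod_token=0)
print(mask.shape, loss_mask.shape, pos.shape)  # (2, 8, 8) (2, 8) (2, 8)
```
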