refactored evaluate_gpt2

8fae1cdd · Mohammad · bf3ce751 · bf3ce751 · 8fae1cdd · 8fae1cdd
Commit 8fae1cdd authored Apr 01, 2020 by Mohammad
8 changed files
--- a/evaluate_gpt2.py
+++ b/evaluate_gpt2.py
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -355,35 +355,7 @@ def _add_gpt2_args(parser):
    return parser


-def add_evaluation_args(parser):
-    """Evaluation arguments."""
-
-    group = parser.add_argument_group('validation', 'validation configurations')
-
-    group.add_argument('--eval-batch-size', type=int, default=None,
-                       help='Data Loader batch size for evaluation datasets.'
-                       'Defaults to `--batch-size`')
-    group.add_argument('--eval-seq-length', type=int, default=None,
-                       help='Maximum sequence length to process for '
-                       'evaluation. Defaults to `--seq-length`')
-    group.add_argument('--eval-max-preds-per-seq', type=int, default=None,
-                       help='Maximum number of predictions to use for '
-                       'evaluation. Defaults to '
-                       'math.ceil(`--eval-seq-length`*.15/10)*10')
-    group.add_argument('--overlapping-eval', type=int, default=32,
-                       help='sliding window for overlapping eval ')
-    group.add_argument('--cloze-eval', action='store_true',
-                       help='Evaluation dataset from `--valid-data` is a cloze task')
-    group.add_argument('--strict-lambada', action='store_true',
-                       help='use more difficult formulation of lambada')
-    group.add_argument('--eval-hf', action='store_true',
-                       help='perform evaluation with huggingface openai model.'
-                       'use `--load` to specify weights path to be loaded')
-    group.add_argument('--load-openai', action='store_true',
-                       help='load openai weights into our model. Use `--load` '
-                       'to specify weights path to be loaded')

-    return parser

 def add_text_generate_args(parser):
    """Text generate arguments."""

--- a/megatron/utils.py
+++ b/megatron/utils.py
@@ -119,7 +119,8 @@ def get_ltor_masks_and_position_ids(data,
                                    eod_token,
                                    reset_position_ids,
                                    reset_attention_mask,
-                                    eod_mask_loss):
+                                    eod_mask_loss,
+                                    fp16):
    """Build masks and position id for left to right model."""

    # Extract batch size and sequence length.
@@ -169,4 +170,8 @@ def get_ltor_masks_and_position_ids(data,
                    position_ids[b, (i+1):] -= (i + 1 - prev_index)
                    prev_index = i + 1

+    # Convert
+    if fp16:
+        attention_mask = attention_mask.half()
+
    return attention_mask, loss_mask, position_ids
--- a/pretrain_gpt2.py
+++ b/pretrain_gpt2.py
@@ -45,6 +45,7 @@ def model_provider():
 def get_batch(data_iterator):
    """Generate a batch"""
    args = get_args()
+    tokenizer = get_tokenizer()

    # Items and their type.
    keys = ['text']
@@ -65,13 +66,11 @@ def get_batch(data_iterator):
    # Get the masks and postition ids.
    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
        tokens,
-        args.eod_token,
+        tokenizer.eod,
        args.reset_position_ids,
        args.reset_attention_mask,
-        args.eod_mask_loss)
-    # Convert
-    if args.fp16:
-        attention_mask = attention_mask.half()
+        args.eod_mask_loss,
+        args.fp16)

    return tokens, labels, loss_mask, attention_mask, position_ids

@@ -160,9 +159,6 @@ def get_train_val_test_data():
    args.do_valid = flags[1].item()
    args.do_test = flags[2].item()

-    tokenizer = get_tokenizer()
-    args.eod_token = tokenizer.eod_id
-
    return train_data, val_data, test_data



--- a/tasks/main.py
+++ b/tasks/main.py
@@ -43,6 +43,10 @@ def get_tasks_args(parser):
                       'for training.')
    group.add_argument('--valid-data', nargs='*', default=None,
                       help='path(s) to the validation data.')
+    group.add_argument('--overlapping-eval', type=int, default=32,
+                       help='Sliding window for overlapping evaluation.')
+    group.add_argument('--strict-lambada', action='store_true',
+                       help='Use more difficult formulation of lambada.')    

    return parser


--- a/tasks/zeroshot_gpt2/datasets.py
+++ b/tasks/zeroshot_gpt2/datasets.py
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Zero-shot datasets."""
+
+import json
+import math
+
+import numpy as np
+import torch
+
+from megatron import get_args
+from megatron import get_tokenizer
+from megatron import print_rank_0
+from .detokenizer import get_detokenizer
+
+
+def build_dataset(task):
+    """Build the zero-shot evaluation dataset that belongs to `task`.
+
+    'LAMBADA' and 'WIKITEXT103' are the only task names handled here; any
+    other value raises NotImplementedError.
+    """
+
+    # Reject unknown tasks up front so the happy path below stays flat.
+    if task not in ('LAMBADA', 'WIKITEXT103'):
+        raise NotImplementedError('dataset for {} task is not '
+                                  'implemented.'.format(task))
+
+    builder = (_build_lambada_dataset if task == 'LAMBADA'
+               else _build_wikitext103_dataset)
+    return builder()
+
+
+class _LMDataset(torch.utils.data.Dataset):
+    """Sliding-window samples over one long token sequence.
+
+    Sample `idx` starts at `idx * overalapping_eval` and holds
+    `seq_len + 1` tokens (padded with `pad_idx` when the sequence runs
+    out).  Each sample also carries a `pad_mask` of length `seq_len`.
+    """
+
+    def __init__(self, tokens, seq_len, pad_idx, num_original_tokens,
+                 num_tokenized_tokens, overalapping_eval=None):
+        self.tokens = tokens
+        self.seq_len = seq_len
+        self.pad_idx = pad_idx
+        # Window stride: falls back to seq_len when not given, never below 1.
+        stride = seq_len if overalapping_eval is None else overalapping_eval
+        self.overalapping_eval = max(1, stride)
+        self.num_original_tokens = num_original_tokens
+        self.num_tokenized_tokens = num_tokenized_tokens
+        self.total_targets = len(self.tokens) - 1
+        # Targets that remain after the first stride's worth is set aside;
+        # the "+ 1" below accounts for that first window.
+        remaining = max(self.total_targets - self.overalapping_eval, 0)
+        self.total_sequences = max(
+            math.ceil(remaining / self.overalapping_eval) + 1, 1)
+
+    def __len__(self):
+        return self.total_sequences
+
+    def __getitem__(self, idx):
+        window = self.seq_len + 1
+        first = idx * self.overalapping_eval
+        tokens = self.tokens[first:first + window]
+        num_real = len(tokens)
+        num_pad = window - num_real
+        pad_mask = [1] * num_real
+        if num_pad > 0:
+            # Short final window: extend both the mask and the tokens.
+            pad_mask += [0] * num_pad
+            tokens += [self.pad_idx] * num_pad
+        # Drop the first entry so the mask has seq_len positions.
+        pad_mask = np.array(pad_mask[1:])
+        if idx != 0 and self.overalapping_eval != self.seq_len:
+            # For every window but the first, keep only the trailing
+            # `overalapping_eval` positions unmasked.
+            pad_mask[:-self.overalapping_eval] = 0
+
+        return {'text': np.array(tokens), 'pad_mask': pad_mask}
+
+
+class _LambadaDataset(torch.utils.data.Dataset):
+    """Cloze-style samples read from a JSON-lines file.
+
+    Every line of `path` is parsed as JSON and its 'text' field is split
+    into context tokens and label tokens (see `get_tokens`).
+    """
+
+    def __init__(self, path, pad_idx, tokenizer, seq_len, strict=False):
+        print_rank_0('> building lambada dataset from {} ...'.format(path))
+        self.seq_len = seq_len
+        self.pad_idx = pad_idx
+        self.tokenizer = tokenizer
+        self.strict = strict
+
+        self.tokens = []
+        self.labels = []
+        with open(path, 'r') as f:
+            for line in f:
+                context, target = self.get_tokens(json.loads(line)['text'])
+                self.tokens.append(context)
+                self.labels.append(target)
+
+    def get_tokens(self, text):
+        """Return (context tokens, label tokens) for one passage."""
+        if self.strict:
+            # Strict mode: the label is the tokenization of the final
+            # whitespace-separated word (with a leading space); the context
+            # is everything before its last occurrence.
+            final_word = text.split()[-1]
+            cut = text.rfind(final_word)
+            context = self.tokenizer.tokenize(text[:cut].strip())
+            target = self.tokenizer.tokenize(' ' + final_word)
+            return context, target
+        # Default mode: the label is simply the last token.
+        token_ids = self.tokenizer.tokenize(text)
+        return token_ids[:-1], [token_ids[-1]]
+
+    def __len__(self):
+        return len(self.tokens)
+
+    def __getitem__(self, idx):
+        context = self.tokens[idx]
+        target = self.labels[idx]
+        # Mask is 0 over the context and 1 over the label tokens.
+        pad_mask = [0] * len(context) + [1] * len(target)
+        tokens = context + target
+        num_pad = self.seq_len + 1 - len(tokens)
+        if num_pad > 0:
+            pad_mask += [0] * num_pad
+            tokens += [self.pad_idx] * num_pad
+
+        # Drop the first mask entry so it has one position fewer than text.
+        return {'text': np.array(tokens), 'pad_mask': np.array(pad_mask[1:])}
+
+
+def _build_lambada_dataset():
+    """Build the LAMBADA dataset from the single `--valid-data` file.
+
+    The tokenizer's `eod` id is used as the padding index, and
+    `--strict-lambada` selects the strict label formulation.
+    """
+    args = get_args()
+    tokenizer = get_tokenizer()
+
+    # `--valid-data` is declared with nargs='*', so it arrives as a list.
+    # The dataset opens one file, so hand it the path, not the list.
+    assert len(args.valid_data) == 1
+    val_dataset = _LambadaDataset(args.valid_data[0], tokenizer.eod,
+                                  tokenizer, args.seq_length,
+                                  args.strict_lambada)
+    print_rank_0(' > found {} samples.'.format(len(val_dataset)))
+
+    return val_dataset
+
+
+def _build_wikitext103_dataset():
+    """Build the WikiText-103 language-modeling dataset.
+
+    Reads the single `--valid-data` file, counts its space-separated
+    tokens, detokenizes and re-tokenizes the text, and wraps the result
+    in an `_LMDataset` whose stride is `--overlapping-eval`.
+    """
+    args = get_args()
+    tokenizer = get_tokenizer()
+
+    # `--valid-data` is declared with nargs='*', so it arrives as a list.
+    # Both open() and the substring match inside get_detokenizer() need
+    # the path string itself.
+    assert len(args.valid_data) == 1
+    path = args.valid_data[0]
+    with open(path, "rb") as reader:
+        entire_data = reader.read().decode('utf-8')
+    num_original_tokens = len(entire_data.strip().split(" "))
+    entire_data = get_detokenizer(path)(entire_data)
+    tokenized_data = tokenizer.tokenize(entire_data)
+    num_tokenized_tokens = len(tokenized_data)
+
+    val_dataset = _LMDataset(tokenized_data, args.seq_length, tokenizer.eod,
+                             num_original_tokens, num_tokenized_tokens,
+                             args.overlapping_eval)
+    print_rank_0(' > number of original tokens: {}, number of detokenized '
+                 'tokens: {}'.format(num_original_tokens, num_tokenized_tokens))
+
+    return val_dataset
--- a/detokenizer.py
+++ b/detokenizer.py
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Detokenization."""
+
 import re

+
 def ptb_detokenizer(string):
-	string = string.replace(" '", "'")
+        string = string.replace(" '", "'")
 	string = string.replace(" \n", "\n")
 	string = string.replace("\n ", "\n")
 	string = string.replace(" n't", "n't")
@@ -44,17 +62,22 @@ def wikitext_detokenizer(string):

 	return string

+
 def lambada_detokenizer(string):
 	return string

-def get_detokenizer(path):
-	for key in DETOKENIZERS.keys():
-		if key in path:
-			print(key)
-			return DETOKENIZERS[key]

-DETOKENIZERS = {
+_DETOKENIZERS = {
 	'ptb': ptb_detokenizer,
 	'wikitext': wikitext_detokenizer,
 	'lambada': lambada_detokenizer,
 }
+
+
+def get_detokenizer(path):
+	"""Return the detokenizer whose key is contained in `path`.
+
+	Keys are tried in table order and the first match wins.  Falls
+	through (returning None) when no key matches.
+	"""
+	# The table is named `_DETOKENIZERS`; the un-prefixed name no longer
+	# exists, so looking it up would raise NameError.
+	for key in _DETOKENIZERS:
+		if key in path:
+			print(key)
+			return _DETOKENIZERS[key]
+
+
--- a/tasks/zeroshot_gpt2/evaluate.py
+++ b/tasks/zeroshot_gpt2/evaluate.py
+# coding=utf-8
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""GPT2 zero-shot evaluation."""
+
+import math
+
+import torch
+
+from megatron import get_args
+from megatron import get_tokenizer
+from megatron import mpu
+from megatron import print_rank_0
+from megatron.checkpointing import load_checkpoint
+from megatron.model import GPT2Model
+from megatron.training import get_model
+from megatron.utils import get_ltor_masks_and_position_ids
+from tasks.finetune_utils import build_data_loader
+
+from .datasets import build_dataset
+
+
+def get_model_provider(eval_metric):
+    """Return a zero-argument model builder configured for `eval_metric`.
+
+    The metric decides the `parallel_output` flag passed to GPT2Model:
+    True for 'loss', False for 'accuracy'.
+    """
+
+    def model_provider():
+        """Build the GPT2 model; raise for an unsupported metric."""
+
+        if eval_metric not in ('loss', 'accuracy'):
+            raise NotImplementedError('output type for {} evaluation metric '
+                                      'is not supported.'.format(eval_metric))
+        wants_parallel_output = (eval_metric == 'loss')
+
+        print_rank_0('building GPT2 model ...')
+        return GPT2Model(num_tokentypes=0,
+                         parallel_output=wants_parallel_output)
+
+    return model_provider
+
+
+def process_batch(batch):
+    """Move a batch to the GPU and derive the model inputs from it.
+
+    Reads `batch['text']` and `batch['pad_mask']`.  The text is shifted by
+    one position to form the inputs (all but the last column) and the
+    labels (all but the first column).
+
+    Returns:
+        (tokens, labels, attention_mask, position_ids, loss_mask)
+    """
+    args = get_args()
+    tokenizer = get_tokenizer()
+
+    loss_mask = batch['pad_mask'].long().cuda().contiguous().byte()
+    tokens_ = batch['text'].long().cuda().contiguous()
+    labels = tokens_[:, 1:].contiguous()
+    tokens = tokens_[:, :-1].contiguous()
+
+    # Get the masks and position ids.  The helper takes the fp16 flag as
+    # a required last argument and does the half-precision conversion of
+    # the attention mask itself, so the flag must be forwarded.
+    attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
+        tokens,
+        tokenizer.eod,
+        args.reset_position_ids,
+        args.reset_attention_mask,
+        args.eod_mask_loss,
+        args.fp16)
+
+    return tokens, labels, attention_mask, position_ids, loss_mask
+
+
+def forward_step(batch, model, eval_metric):
+    """Run one forward pass and reduce it to a scalar for `eval_metric`.
+
+    'loss' yields the sum of the per-token losses weighted by the loss
+    mask; 'accuracy' yields the number of samples whose masked-in
+    positions are all predicted correctly.  Other metrics raise
+    NotImplementedError (after the forward pass has run).
+    """
+
+    tokens, labels, attention_mask, position_ids, loss_mask = process_batch(
+        batch)
+    output = model(tokens, position_ids, attention_mask)
+
+    if eval_metric == 'loss':
+        # Unreduced cross entropy; masked-out positions contribute zero.
+        per_token = mpu.vocab_parallel_cross_entropy(
+            output.contiguous().float(), labels.contiguous())
+        flat_mask = loss_mask.contiguous().view(-1).float()
+        return torch.sum(per_token.view(-1) * flat_mask)
+
+    if eval_metric == 'accuracy':
+        predictions = torch.argmax(output, -1)
+        hits = (predictions == labels).float()
+        # Positions outside the loss mask count as correct, so the
+        # product along the last dimension only reflects masked-in ones.
+        hits[(1 - loss_mask).bool()] = 1
+        return hits.prod(-1).sum()
+
+    raise NotImplementedError('forward method for evaluation metric {} '
+                              'is not implemented.'.format(eval_metric))
+
+
+def evaluate(data_loader, model, eval_metric):
+    """Accumulate the per-batch metric over the whole data loader.
+
+    Each batch result is all-reduced over the data-parallel group before
+    being added to the running total, which is returned.
+    """
+    args = get_args()
+
+    # Evaluation mode disables dropout.
+    model.eval()
+
+    running_total = 0.0
+    with torch.no_grad():
+        for step, batch in enumerate(data_loader):
+            if step % args.log_interval == 0:
+                print_rank_0('> working on iteration: {}'.format(step))
+
+            batch_result = forward_step(batch, model, eval_metric)
+            torch.distributed.all_reduce(
+                batch_result, group=mpu.get_data_parallel_group())
+            running_total += batch_result
+
+    return running_total
+
+
+def evaluate_and_print_results(task, data_loader, model, eval_metric):
+    """Run the evaluation and print a one-line summary between rules.
+
+    For 'loss' the summary holds the average loss, the perplexity, the
+    perplexity adjusted by the tokenized/original token ratio, and that
+    ratio; the exponent is capped at 20 in both perplexities.  For
+    'accuracy' it holds the correct count, the dataset size and their
+    quotient.
+    """
+
+    output = evaluate(data_loader, model, eval_metric)
+
+    pieces = [' validation results on {} | '.format(task)]
+    if eval_metric == 'loss':
+        dataset = data_loader.dataset
+        num_tokenized_tokens = dataset.num_tokenized_tokens
+        num_original_tokens = dataset.num_original_tokens
+        val_loss = output / (num_tokenized_tokens - 1)
+        ppl = math.exp(min(20, val_loss))
+        token_ratio = (num_tokenized_tokens - 1) / (num_original_tokens - 1)
+        adjusted_ppl = math.exp(min(20, val_loss * token_ratio))
+        pieces += [
+            'avg loss: {:.4E} | '.format(val_loss),
+            'ppl: {:.4E} | '.format(ppl),
+            'adjusted ppl: {:.4E} | '.format(adjusted_ppl),
+            'token ratio: {} |'.format(token_ratio),
+        ]
+    elif eval_metric == 'accuracy':
+        num_examples = len(data_loader.dataset)
+        acc = output / num_examples
+        pieces += [
+            'number correct: {:.4E} | '.format(output),
+            'total examples: {:.4E} | '.format(num_examples),
+            'avg accuracy: {:.4E}'.format(acc),
+        ]
+    else:
+        raise NotImplementedError('evaluation method for {} metric is not '
+                                  'implemented yet.'.format(eval_metric))
+
+    string = ''.join(pieces)
+    rule = '-' * (len(string) + 1)
+    print_rank_0(rule)
+    print_rank_0(string)
+    print_rank_0(rule)
+
+
+def main():
+    """Entry point: pick the metric for the task, then build the model,
+    the dataset and the data loader, and run the evaluation."""
+    args = get_args()
+
+    # Each supported task is scored with exactly one metric.
+    metric_for_task = {'LAMBADA': 'accuracy', 'WIKITEXT103': 'loss'}
+    eval_metric = metric_for_task.get(args.task)
+    if eval_metric is None:
+        raise NotImplementedError('{} task is not implemented.'.format(
+            args.task))
+
+    # Model first; the checkpoint is only loaded when `--load` is given.
+    model = get_model(get_model_provider(eval_metric))
+    if args.load is not None:
+        load_checkpoint(model, None, None)
+
+    # Dataset and loader; the last partial batch is kept.
+    dataset = build_dataset(args.task)
+    dataloader = build_data_loader(dataset, args.batch_size,
+                                   args.num_workers, drop_last=False)
+
+    evaluate_and_print_results(args.task, dataloader, model, eval_metric)
+
+    print_rank_0('done :-)')