OpenDAS / Megatron-LM · commit 8fae1cdd

Commit 8fae1cdd ("refactored evaluate_gpt2"), authored Apr 01, 2020 by Mohammad.
Parent: bf3ce751

Showing 8 changed files with 400 additions and 619 deletions (+400, -619).
evaluate_gpt2.py                      +0    -575
megatron/arguments.py                 +0    -28
megatron/utils.py                     +6    -1
pretrain_gpt2.py                      +4    -8
tasks/main.py                         +4    -0
tasks/zeroshot_gpt2/datasets.py       +161  -0
tasks/zeroshot_gpt2/detokenizer.py    +30   -7
tasks/zeroshot_gpt2/evaluate.py       +195  -0
evaluate_gpt2.py  ·  deleted (100755 → 0)  ·  view file @ bf3ce751
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Pretrain BERT"""
import
os
import
json
import
math
import
random
import
numpy
as
np
import
torch
from
arguments
import
get_args
from
configure_data
import
configure_data
from
megatron.fp16
import
FP16_Module
from
megatron.fp16
import
FP16_Optimizer
from
megatron.learning_rates
import
AnnealingLR
from
megatron.model
import
GPT2Model
from
megatron.model
import
DistributedDataParallel
as
DDP
from
megatron
import
mpu
from
apex.optimizers
import
FusedAdam
as
Adam
from
megatron.utils
import
Timers
from
megatron.utils
import
load_checkpoint
from
megatron.utils
import
report_memory
from
megatron.utils
import
print_params_min_max_norm
from
megatron
import
print_rank_0
from
megatron.data_utils
import
make_tokenizer
from
detokenizer
import
*
def get_model(args):
    """Build the model."""

    print_rank_0('building GPT2 model ...')
    model = GPT2Model(num_layers=args.num_layers,
                      vocab_size=args.vocab_size,
                      hidden_size=args.hidden_size,
                      num_attention_heads=args.num_attention_heads,
                      embedding_dropout_prob=args.hidden_dropout,
                      attention_dropout_prob=args.attention_dropout,
                      output_dropout_prob=args.hidden_dropout,
                      max_sequence_length=args.max_position_embeddings,
                      checkpoint_activations=args.checkpoint_activations,
                      checkpoint_num_layers=args.checkpoint_num_layers,
                      parallel_output=not args.cloze_eval)

    print_rank_0(' > number of parameters: {}'.format(
        sum([p.nelement() for p in model.parameters()])))

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    model = DDP(model)

    return model


def setup_model(args):
    """Setup model and optimizer."""

    model = get_model(args)

    if args.load is not None:
        _ = load_checkpoint(model, None, None, args)

    return model
def get_masks_and_position_ids(data,
                               eod_token,
                               reset_position_ids,
                               reset_attention_mask):

    # Extract batch size and sequence length.
    batch_size, seq_length = data.size()

    # Attention mask (lower triangular).
    if reset_attention_mask:
        att_mask_batch = batch_size
    else:
        att_mask_batch = 1
    attention_mask = torch.tril(torch.ones(
        (att_mask_batch, seq_length, seq_length), device=data.device)).view(
            att_mask_batch, 1, seq_length, seq_length)

    # Loss mask.
    loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device)
    loss_mask[data == eod_token] = 0.0

    # Position ids.
    position_ids = torch.arange(seq_length, dtype=torch.long,
                                device=data.device)
    position_ids = position_ids.unsqueeze(0).expand_as(data)
    # We need to clone as the ids will be modified based on batch index.
    if reset_position_ids:
        position_ids = position_ids.clone()

    if reset_position_ids or reset_attention_mask:
        # Loop through the batches:
        for b in range(batch_size):

            # Find indices where EOD token is.
            eod_index = position_ids[b, data[b] == eod_token]
            # Detach indices from positions if going to modify positions.
            if reset_position_ids:
                eod_index = eod_index.clone()

            # Loop through EOD indices:
            prev_index = 0
            for j in range(eod_index.size()[0]):
                i = eod_index[j]
                # Mask attention loss.
                if reset_attention_mask:
                    attention_mask[b, 0, (i + 1):, :(i + 1)] = 0
                # Reset positions.
                if reset_position_ids:
                    position_ids[b, (i + 1):] -= (i + 1 - prev_index)
                    prev_index = i + 1

    return attention_mask, loss_mask, position_ids
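
A quick illustration (not part of the original file), assuming get_masks_and_position_ids as reconstructed above and using token id 0 as the EOD token:

    import torch

    data = torch.tensor([[5, 3, 0, 7, 2]])
    attention_mask, loss_mask, position_ids = get_masks_and_position_ids(
        data, eod_token=0, reset_position_ids=True, reset_attention_mask=True)
    # Positions restart after the EOD token: tensor([[0, 1, 2, 0, 1]])
    print(position_ids)
    # No loss is taken on the EOD position itself: tensor([[1., 1., 0., 1., 1.]])
    print(loss_mask)
    # Causal mask whose rows 3-4 are zeroed in columns 0-2, so tokens after the
    # EOD cannot attend across it.
    print(attention_mask[0, 0])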
def get_batch(data_iterator, args, timers):
    ''' get_batch subdivides the source data into chunks of
    length args.seq_length. If source is equal to the example
    output of the data loading example, with a seq_length limit
    of 2, we'd get the following two Variables for i = 0:
    ┌ a g m s ┐ ┌ b h n t ┐
    └ b h n t ┘ └ c i o u ┘
    Note that despite the name of the function, the subdivision of data is not
    done along the batch dimension (i.e. dimension 1), since that was handled
    by the data loader. The chunks are along dimension 0, corresponding
    to the seq_len dimension in the LSTM. A Variable representing an appropriate
    shard reset mask of the same dimensions is also returned.
    '''
    # Items and their type.
    keys = ['text', 'pad_mask']
    datatype = torch.int64

    # Broadcast data.
    timers('data loader').start()
    if data_iterator is not None:
        data = next(data_iterator)
    else:
        data = None
    timers('data loader').stop()
    data_b = mpu.broadcast_data(keys, data, datatype)

    # Unpack.
    tokens_ = data_b['text'].long()
    lm_labels = tokens_[:, 1:].contiguous()
    tokens = tokens_[:, :-1].contiguous()
    padding_mask = data_b['pad_mask'].byte()

    # Get the masks and position ids.
    attention_mask, loss_mask, position_ids = get_masks_and_position_ids(
        tokens,
        args.eod_token,
        args.reset_position_ids,
        args.reset_attention_mask)

    # Convert
    if args.fp16:
        attention_mask = attention_mask.half()

    return tokens, lm_labels, attention_mask, position_ids, padding_mask
def forward_step(data_iterator, model, args, timers):
    """Forward step."""

    # Get the batch.
    timers('batch generator').start()
    batch = get_batch(data_iterator, args, timers)
    if batch is None:
        return None
    tokens, lm_labels, attention_mask, position_ids, loss_mask = batch
    timers('batch generator').stop()

    # Forward model.
    if args.eval_hf:
        output, _ = model(tokens)
    else:
        output = model(tokens, position_ids, attention_mask)

    if not args.cloze_eval:
        # losses = torch.nn.CrossEntropyLoss(reduce=False)(
        losses = mpu.vocab_parallel_cross_entropy(
            output.contiguous().float(), lm_labels.contiguous())
        loss_mask = loss_mask.contiguous()
        loss_mask = loss_mask.view(-1)
        lm_loss = torch.sum(losses.view(-1) * loss_mask.float())
    else:
        outputs = torch.argmax(output, -1)
        correct = (outputs == lm_labels).float()
        correct[(1 - loss_mask).bool()] = 1
        correct = correct.prod(-1)
        lm_loss = correct.sum()
        # loss_mask = loss_mask.contiguous().view(-1).float()
        # lm_loss = torch.sum(acc * loss_mask)

    return lm_loss
def evaluate(data_loader, model, args, timers, num_iterations=None):
    """Evaluation."""

    # Turn on evaluation mode which disables dropout.
    model.eval()

    total_lm_loss = 0

    if num_iterations is not None:
        max_iters = num_iterations
    else:
        if mpu.get_model_parallel_rank() == 0:
            max_iters_gpu = torch.cuda.LongTensor([len(data_loader)])
        else:
            max_iters_gpu = torch.cuda.LongTensor([0])
        torch.distributed.broadcast(max_iters_gpu,
                                    mpu.get_model_parallel_src_rank(),
                                    group=mpu.get_model_parallel_group())
        max_iters = max_iters_gpu[0].item()
        print_rank_0('global rank: {} | max iters: {}'.format(
            torch.distributed.get_rank(), max_iters))

    if data_loader is not None:
        data_iterator = iter(data_loader)
    else:
        data_iterator = None

    with torch.no_grad():
        iteration = 0
        while iteration < max_iters:
            if iteration % args.log_interval == 0:
                print_rank_0('global rank: {} | iteration: {}'.format(
                    torch.distributed.get_rank(), iteration))
            # Forward evaluation.
            lm_loss = forward_step(data_iterator, model, args, timers)
            if lm_loss is None:
                break
            # Reduce across processes.
            if isinstance(model, DDP):
                torch.distributed.all_reduce(lm_loss.data)
                if args.cloze_eval:
                    lm_loss.data = lm_loss.data / args.world_size
                else:
                    lm_loss.data = lm_loss.data / args.model_parallel_size

            if not args.cloze_eval:
                total_lm_loss += lm_loss.data.detach().float().item() / (
                    args.num_tokenized_tokens - 1)
            else:
                total_lm_loss += lm_loss.data.detach().float().item()

            iteration += 1

    # Move model back to the train mode.
    model.train()

    return total_lm_loss
def evaluate_and_print_results(prefix, data_iterator, model,
                               args, timers, num_iterations=None):
    """Helper function to evaluate and dump results on screen."""

    if not args.cloze_eval:
        lm_loss = evaluate(data_iterator, model, args, timers, num_iterations)
        val_loss = lm_loss
        ppl = math.exp(min(20, val_loss))
        token_ratio = (args.num_tokenized_tokens - 1) / (
            args.num_original_tokens - 1)
        adjusted_ppl = math.exp(min(20, val_loss * token_ratio))
        print_rank_0('-' * 100)
        string = ' validation results on {} | '.format(prefix)
        string += 'avg loss: {:.4E} | '.format(val_loss)
        string += 'ppl: {:.4E} | '.format(ppl)
        string += 'adjusted ppl: {:.4E} | '.format(adjusted_ppl)
        string += 'token ratio: {} |'.format(token_ratio)
        length = len(string) + 1
        print_rank_0('-' * length)
        print_rank_0(string)
        print_rank_0('-' * length)
        return val_loss
    else:
        num_correct = evaluate(data_iterator, model, args, timers,
                               num_iterations)
        acc = num_correct / args.num_examples
        print_rank_0('-' * 100)
        string = ' validation results on {} | '.format(prefix)
        string += 'number correct: {:.4E} | '.format(num_correct)
        string += 'total examples: {:.4E} | '.format(args.num_examples)
        string += 'avg accuracy: {:.4E}'.format(acc)
        length = len(string) + 1
        print_rank_0('-' * length)
        print_rank_0(string)
        print_rank_0('-' * length)
        return acc
def initialize_distributed(args):
    """Initialize torch.distributed."""

    # Manually set the device ids.
    device = args.rank % torch.cuda.device_count()
    if args.local_rank is not None:
        device = args.local_rank
    torch.cuda.set_device(device)
    # Call the init process
    init_method = 'tcp://'
    master_ip = os.getenv('MASTER_ADDR', 'localhost')
    master_port = os.getenv('MASTER_PORT', '6000')
    init_method += master_ip + ':' + master_port
    torch.distributed.init_process_group(
        backend=args.distributed_backend,
        world_size=args.world_size,
        rank=args.rank,
        init_method=init_method)

    # Set the model-parallel / data-parallel communicators.
    mpu.initialize_model_parallel(args.model_parallel_size)


def set_random_seed(seed):
    """Set random seed for reproducibility."""
    if seed is not None and seed > 0:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        mpu.model_parallel_cuda_manual_seed(seed)
class LM_Eval_Dataset(torch.utils.data.Dataset):

    def __init__(self, tokens, seq_len, pad_idx, overalapping_eval=None,
                 **kwargs):
        self.tokens = tokens
        self.seq_len = seq_len
        self.pad_idx = pad_idx
        self.overalapping_eval = overalapping_eval
        if self.overalapping_eval is None:
            self.overalapping_eval = self.seq_len
        self.overalapping_eval = max(1, self.overalapping_eval)
        self.total_targets = len(self.tokens) - 1
        # remove first sequence tokens
        targets = max(self.total_targets - self.overalapping_eval, 0)
        self.total_sequences = max(
            math.ceil(targets / self.overalapping_eval) + 1, 1)

    def __len__(self):
        return self.total_sequences

    def __getitem__(self, idx):
        start_idx = idx * self.overalapping_eval
        end_idx = start_idx + self.seq_len
        tokens = self.tokens[start_idx:end_idx + 1]
        num_tokens = len(tokens)
        pad_mask = [1] * num_tokens
        if num_tokens < self.seq_len + 1:
            num_pad = (self.seq_len + 1 - num_tokens)
            pad_mask += [0] * (num_pad)
            tokens += [self.pad_idx] * num_pad
        pad_mask = np.array(pad_mask[1:])
        if self.overalapping_eval != self.seq_len and idx != 0:
            pad_mask[:-self.overalapping_eval] *= 0

        return {'text': np.array(tokens), 'pad_mask': pad_mask}
class Lambada_Eval_Dataset(torch.utils.data.Dataset):

    def __init__(self, path, tokenizer, seq_len, strict=False, **kwargs):
        self.seq_len = seq_len
        self.pad_idx = tokenizer.get_command('pad').Id
        self.tokenizer = tokenizer
        self.strict = strict

        self.tokens = []
        self.labels = []
        with open(path, 'r') as f:
            for line in f.readlines():
                text = json.loads(line)['text']
                tokens, labels = self.get_tokens(text)
                self.tokens.append(tokens)
                self.labels.append(labels)

    def get_tokens(self, text):
        if not self.strict:
            tokens = self.tokenizer.EncodeAsIds(text).tokenization
            return tokens[:-1], [tokens[-1]]
        last_token = text.split()[-1]
        start_idx = text.rfind(last_token)
        beginning_tokens = self.tokenizer.EncodeAsIds(
            text[:start_idx].strip()).tokenization
        last_token = self.tokenizer.EncodeAsIds(' ' + last_token).tokenization
        return beginning_tokens, last_token

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
        tokens = self.tokens[idx]
        num_tokens = len(tokens)
        pad_mask = [0] * num_tokens
        labels = self.labels[idx]
        pad_mask += [1] * len(labels)
        tokens = tokens + labels
        num_tokens = len(tokens)
        if num_tokens < self.seq_len + 1:
            num_pad = (self.seq_len + 1 - num_tokens)
            pad_mask += [0] * (num_pad)
            tokens += [self.pad_idx] * num_pad
        pad_mask = np.array(pad_mask[1:])

        return {'text': np.array(tokens), 'pad_mask': pad_mask}


def get_tokenizer(args):
    tokenizer_args = {
        'tokenizer_type': args.tokenizer_type,
        'corpus': None,
        'model_path': args.tokenizer_path,
        'vocab_size': args.vocab_size,
        'model_type': args.tokenizer_model_type,
        'cache_dir': args.cache_dir}
    return make_tokenizer(**tokenizer_args)
def get_eval_data(args):
    val_dataloader = None
    if mpu.get_model_parallel_rank() == 0:
        eval_batch_size = args.eval_batch_size
        eval_batch_size = args.batch_size if eval_batch_size is None \
            else eval_batch_size
        seq_len = args.seq_length
        valid_data = args.valid_data
        valid_data = valid_data[0] if isinstance(valid_data, list) \
            else valid_data

        tokenizer = get_tokenizer(args)

        if not args.cloze_eval:
            with open(valid_data, "rb") as reader:
                entire_data = reader.read().decode('utf-8')
            num_original_tokens = len(entire_data.strip().split(" "))
            entire_data = get_detokenizer(valid_data)(entire_data)
            tokenized_data = tokenizer.EncodeAsIds(entire_data).tokenization
            num_tokenized_tokens = len(tokenized_data)
            string = 'Original Tokens: %d, Detokenized tokens: %d' % (
                num_tokenized_tokens, num_original_tokens)
            print_rank_0(string)

            eod_token = tokenizer.get_command('pad').Id
            val_dataset = LM_Eval_Dataset(tokenized_data, seq_len, eod_token,
                                          args.overlapping_eval)
        else:
            val_dataset = Lambada_Eval_Dataset(valid_data, tokenizer, seq_len,
                                               args.strict_lambada)
            num_tokenized_tokens = 0
            num_original_tokens = 0

        val_dataloader = torch.utils.data.DataLoader(
            val_dataset, batch_size=eval_batch_size, drop_last=False)

        before = tokenizer.num_tokens
        after = before
        multiple = args.make_vocab_size_divisible_by * \
            mpu.get_model_parallel_world_size()
        while (after % multiple) != 0:
            after += 1
        print_rank_0(
            '> padded vocab (size: {}) with {} dummy tokens (new size: {})'.format(
                before, after - before, after))

        eod_token = tokenizer.get_command('pad').Id
        num_examples = len(val_dataset)
        token_counts = torch.cuda.LongTensor([after, eod_token, num_examples,
                                              num_original_tokens,
                                              num_tokenized_tokens])
    else:
        token_counts = torch.cuda.LongTensor([0, 0, 0, 0, 0])

    torch.distributed.broadcast(token_counts,
                                mpu.get_model_parallel_src_rank(),
                                group=mpu.get_model_parallel_group())
    args.vocab_size = token_counts[0].item()
    args.eod_token = token_counts[1].item()
    args.num_examples = token_counts[2].item()
    args.num_original_tokens = token_counts[3].item()
    args.num_tokenized_tokens = token_counts[4].item()
    print('global rank: {} | vocab size: {} | eod token: {} | '
          'num_examples: {} | num_original_tokens: {} | '
          'num_tokenized_tokens: {}'.format(
              torch.distributed.get_rank(), args.vocab_size, args.eod_token,
              args.num_examples, args.num_original_tokens,
              args.num_tokenized_tokens))

    return val_dataloader
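
The padding loop above rounds the tokenizer size up to the next multiple of make-vocab-size-divisible-by times the model-parallel world size. A rough sketch with hypothetical numbers (GPT-2's 50257-entry vocabulary, a divisor of 128, 2-way model parallelism):

    before = 50257          # hypothetical tokenizer.num_tokens
    multiple = 128 * 2      # assumed divisor and model-parallel world size
    after = before
    while (after % multiple) != 0:
        after += 1
    print(before, after - before, after)   # 50257 175 50432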
def main():
    """Main evaluation program."""

    print('Evaluate GPT2 model')

    # Disable CuDNN.
    torch.backends.cudnn.enabled = False

    # Timer.
    timers = Timers()

    # Arguments.
    args = get_args()

    # Pytorch distributed.
    initialize_distributed(args)

    # Random seeds for reproducibility.
    set_random_seed(args.seed)

    # Data stuff.
    eval_data = get_eval_data(args)

    # Model, optimizer, and learning rate.
    if args.eval_hf:
        from pytorch_pretrained_bert import GPT2LMHeadModel
        from pytorch_pretrained_bert import GPT2Model as HFGPT2Model
        if args.num_layers == 24:
            model_path = args.load
            # model_path = '/home/universal-lm-data.cosmos549/repos/gpt2_mp/models/345M'
            hfmodel = HFGPT2Model.from_pretrained(model_path,
                                                  cache_dir='gpt2_weights',
                                                  from_tf=True).cuda()
            model = GPT2LMHeadModel(hfmodel.config)
            model.transformer.load_state_dict(hfmodel.state_dict())
            model.cuda()
        else:
            model = GPT2LMHeadModel.from_pretrained(
                'gpt2', cache_dir='gpt2_weights').cuda()
    else:
        if args.load_openai:
            from megatron.utils import move_weights
            model_path = args.load
            args.load = None
            model = setup_model(args)
            from pytorch_pretrained_bert import GPT2LMHeadModel
            from pytorch_pretrained_bert import GPT2Model as HFGPT2Model
            model_path = 'gpt2'
            from_tf = False
            print('loading openai weights')
            model.cpu()
            if args.num_layers == 24:
                # model_path = '/home/universal-lm-data.cosmos549/repos/gpt2_mp/models/345M'
                hfmodel = HFGPT2Model.from_pretrained(model_path,
                                                      cache_dir='gpt2_weights',
                                                      from_tf=True)
                gpt2model = GPT2LMHeadModel(hfmodel.config)
                gpt2model.transformer.load_state_dict(hfmodel.state_dict())
                gpt2model
            else:
                gpt2model = GPT2LMHeadModel.from_pretrained(
                    'gpt2', cache_dir='gpt2_weights')
            model2fill = model
            while isinstance(model2fill, (DDP, FP16_Module)):
                model2fill = model2fill.module
            move_weights(model2fill, gpt2model)
            model.cuda()
        else:
            model = setup_model(args)

    # Run on test data.
    prefix = "wiki"  # os.path.basename(args.valid_data)
    evaluate_and_print_results(prefix, eval_data, model, args, timers)


if __name__ == "__main__":
    main()
megatron/arguments.py  ·  view file @ 8fae1cdd

@@ -355,35 +355,7 @@ def _add_gpt2_args(parser):
     return parser
 
 
-def add_evaluation_args(parser):
-    """Evaluation arguments."""
-
-    group = parser.add_argument_group('validation', 'validation configurations')
-    group.add_argument('--eval-batch-size', type=int, default=None,
-                       help='Data Loader batch size for evaluation datasets.'
-                            'Defaults to `--batch-size`')
-    group.add_argument('--eval-seq-length', type=int, default=None,
-                       help='Maximum sequence length to process for '
-                            'evaluation. Defaults to `--seq-length`')
-    group.add_argument('--eval-max-preds-per-seq', type=int, default=None,
-                       help='Maximum number of predictions to use for '
-                            'evaluation. Defaults to '
-                            'math.ceil(`--eval-seq-length`*.15/10)*10')
-    group.add_argument('--overlapping-eval', type=int, default=32,
-                       help='sliding window for overlapping eval ')
-    group.add_argument('--cloze-eval', action='store_true',
-                       help='Evaluation dataset from `--valid-data` is a cloze task')
-    group.add_argument('--strict-lambada', action='store_true',
-                       help='use more difficult formulation of lambada')
-    group.add_argument('--eval-hf', action='store_true',
-                       help='perform evaluation with huggingface openai model.'
-                            'use `--load` to specify weights path to be loaded')
-    group.add_argument('--load-openai', action='store_true',
-                       help='load openai weights into our model. Use `--load` '
-                            'to specify weights path to be loaded')
-
-    return parser
-
-
 def add_text_generate_args(parser):
     """Text generate arguments."""
megatron/utils.py  ·  view file @ 8fae1cdd

@@ -119,7 +119,8 @@ def get_ltor_masks_and_position_ids(data,
                                     eod_token,
                                     reset_position_ids,
                                     reset_attention_mask,
-                                    eod_mask_loss):
+                                    eod_mask_loss,
+                                    fp16):
     """Build masks and position id for left to right model."""
 
     # Extract batch size and sequence length.

@@ -169,4 +170,8 @@ def get_ltor_masks_and_position_ids(data,
                 position_ids[b, (i + 1):] -= (i + 1 - prev_index)
                 prev_index = i + 1
 
+    # Convert
+    if fp16:
+        attention_mask = attention_mask.half()
+
     return attention_mask, loss_mask, position_ids
pretrain_gpt2.py  ·  view file @ 8fae1cdd

@@ -45,6 +45,7 @@ def model_provider():
 def get_batch(data_iterator):
     """Generate a batch"""
     args = get_args()
+    tokenizer = get_tokenizer()
 
     # Items and their type.
     keys = ['text']

@@ -65,13 +66,11 @@ def get_batch(data_iterator):
 
     # Get the masks and postition ids.
     attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
         tokens,
-        args.eod_token,
+        tokenizer.eod,
         args.reset_position_ids,
         args.reset_attention_mask,
-        args.eod_mask_loss)
-    # Convert
-    if args.fp16:
-        attention_mask = attention_mask.half()
+        args.eod_mask_loss,
+        args.fp16)
 
     return tokens, labels, loss_mask, attention_mask, position_ids

@@ -160,9 +159,6 @@ def get_train_val_test_data():
     args.do_valid = flags[1].item()
     args.do_test = flags[2].item()
 
-    tokenizer = get_tokenizer()
-    args.eod_token = tokenizer.eod_id
-
     return train_data, val_data, test_data
tasks/main.py  ·  view file @ 8fae1cdd

@@ -43,6 +43,10 @@ def get_tasks_args(parser):
                             'for training.')
     group.add_argument('--valid-data', nargs='*', default=None,
                        help='path(s) to the validation data.')
+    group.add_argument('--overlapping-eval', type=int, default=32,
+                       help='Sliding window for overlapping evaluation.')
+    group.add_argument('--strict-lambada', action='store_true',
+                       help='Use more difficult formulation of lambada.')
 
     return parser
tasks/zeroshot_gpt2/datasets.py  ·  new file (0 → 100644)  ·  view file @ 8fae1cdd
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Zero-shot datasets."""
import json
import math

import numpy as np
import torch

from megatron import get_args
from megatron import get_tokenizer
from megatron import print_rank_0
from .detokenizer import get_detokenizer


def build_dataset(task):
    """Helper function to select and build dataset."""

    if task == 'LAMBADA':
        return _build_lambada_dataset()
    if task == 'WIKITEXT103':
        return _build_wikitext103_dataset()

    raise NotImplementedError('dataset for {} task is not '
                              'implemented.'.format(task))
class _LMDataset(torch.utils.data.Dataset):

    def __init__(self, tokens, seq_len, pad_idx, num_original_tokens,
                 num_tokenized_tokens, overalapping_eval=None):
        self.tokens = tokens
        self.seq_len = seq_len
        self.pad_idx = pad_idx
        self.overalapping_eval = overalapping_eval
        if self.overalapping_eval is None:
            self.overalapping_eval = self.seq_len
        self.overalapping_eval = max(1, self.overalapping_eval)
        self.num_original_tokens = num_original_tokens
        self.num_tokenized_tokens = num_tokenized_tokens
        self.total_targets = len(self.tokens) - 1
        # remove first sequence tokens
        targets = max(self.total_targets - self.overalapping_eval, 0)
        self.total_sequences = max(
            math.ceil(targets / self.overalapping_eval) + 1, 1)

    def __len__(self):
        return self.total_sequences

    def __getitem__(self, idx):
        start_idx = idx * self.overalapping_eval
        end_idx = start_idx + self.seq_len
        tokens = self.tokens[start_idx:end_idx + 1]
        num_tokens = len(tokens)
        pad_mask = [1] * num_tokens
        if num_tokens < self.seq_len + 1:
            num_pad = (self.seq_len + 1 - num_tokens)
            pad_mask += [0] * (num_pad)
            tokens += [self.pad_idx] * num_pad
        pad_mask = np.array(pad_mask[1:])
        if self.overalapping_eval != self.seq_len and idx != 0:
            pad_mask[:-self.overalapping_eval] *= 0

        return {'text': np.array(tokens), 'pad_mask': pad_mask}
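
A quick usage sketch (not part of the new file), assuming _LMDataset as defined above; the toy token ids and window sizes are made up for illustration:

    # 11 token ids give 10 next-token targets; windows of seq_len 4 slide by 2.
    toy_tokens = list(range(11))
    ds = _LMDataset(toy_tokens, seq_len=4, pad_idx=0,
                    num_original_tokens=11, num_tokenized_tokens=11,
                    overalapping_eval=2)
    for i in range(len(ds)):
        sample = ds[i]
        print(sample['text'], sample['pad_mask'])
    # Every window after the first zeroes the leading seq_len - overalapping_eval
    # entries of its pad_mask, so in this toy setup each target token is scored
    # exactly once across the windows.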
class _LambadaDataset(torch.utils.data.Dataset):

    def __init__(self, path, pad_idx, tokenizer, seq_len, strict=False):
        print_rank_0('> building lambada dataset from {} ...'.format(path))
        self.seq_len = seq_len
        self.pad_idx = pad_idx
        self.tokenizer = tokenizer
        self.strict = strict

        self.tokens = []
        self.labels = []
        with open(path, 'r') as f:
            for line in f.readlines():
                text = json.loads(line)['text']
                tokens, labels = self.get_tokens(text)
                self.tokens.append(tokens)
                self.labels.append(labels)

    def get_tokens(self, text):
        if not self.strict:
            tokens = self.tokenizer.tokenize(text)
            return tokens[:-1], [tokens[-1]]
        last_token = text.split()[-1]
        start_idx = text.rfind(last_token)
        beginning_tokens = self.tokenizer.tokenize(text[:start_idx].strip())
        last_token = self.tokenizer.tokenize(' ' + last_token)
        return beginning_tokens, last_token

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
        tokens = self.tokens[idx]
        num_tokens = len(tokens)
        pad_mask = [0] * num_tokens
        labels = self.labels[idx]
        pad_mask += [1] * len(labels)
        tokens = tokens + labels
        num_tokens = len(tokens)
        if num_tokens < self.seq_len + 1:
            num_pad = (self.seq_len + 1 - num_tokens)
            pad_mask += [0] * (num_pad)
            tokens += [self.pad_idx] * num_pad
        pad_mask = np.array(pad_mask[1:])

        return {'text': np.array(tokens), 'pad_mask': pad_mask}
def _build_lambada_dataset():
    """Build lambada dataset."""
    args = get_args()
    tokenizer = get_tokenizer()

    assert len(args.valid_data) == 1
    val_dataset = _LambadaDataset(args.valid_data, tokenizer.eod, tokenizer,
                                  args.seq_length, args.strict_lambada)
    print_rank_0(' > found {} samples.'.format(len(val_dataset)))

    return val_dataset


def _build_wikitext103_dataset():
    """Build wikitext 103 dataset."""
    args = get_args()
    tokenizer = get_tokenizer()

    assert len(args.valid_data) == 1
    with open(args.valid_data, "rb") as reader:
        entire_data = reader.read().decode('utf-8')
    num_original_tokens = len(entire_data.strip().split(" "))
    entire_data = get_detokenizer(args.valid_data)(entire_data)
    tokenized_data = tokenizer.tokenize(entire_data)
    num_tokenized_tokens = len(tokenized_data)

    val_dataset = _LMDataset(tokenized_data, args.seq_length, tokenizer.eod,
                             num_original_tokens, num_tokenized_tokens,
                             args.overlapping_eval)
    print_rank_0(' > number of original tokens: {}, number of detokenized '
                 'tokens: {}'.format(num_original_tokens, num_tokenized_tokens))

    return val_dataset
detokenizer.py → tasks/zeroshot_gpt2/detokenizer.py  ·  moved (100755 → 100644)  ·  view file @ 8fae1cdd
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Detokenization."""
import re


def ptb_detokenizer(string):
    string = string.replace(" '", "'")
    string = string.replace(" \n", "\n")
    string = string.replace("\n ", "\n")
    string = string.replace(" n't", "n't")
...
@@ -44,17 +62,22 @@ def wikitext_detokenizer(string):
     return string
 
 
 def lambada_detokenizer(string):
     return string
 
 
-def get_detokenizer(path):
-    for key in DETOKENIZERS.keys():
-        if key in path:
-            print(key)
-            return DETOKENIZERS[key]
-
-
-DETOKENIZERS = {
+_DETOKENIZERS = {
     'ptb': ptb_detokenizer,
     'wikitext': wikitext_detokenizer,
     'lambada': lambada_detokenizer,
 }
+
+
+def get_detokenizer(path):
+    for key in _DETOKENIZERS.keys():
+        if key in path:
+            print(key)
+            return _DETOKENIZERS[key]
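
A small illustration (not in the file) of how the detokenizer is chosen: get_detokenizer matches the dictionary keys against the dataset path, so a hypothetical WikiText-103 file name selects wikitext_detokenizer.

    detok = get_detokenizer('/data/wikitext-103/wiki.test.tokens')  # hypothetical path
    assert detok is _DETOKENIZERS['wikitext']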
tasks/zeroshot_gpt2/evaluate.py  ·  new file (0 → 100644)  ·  view file @ 8fae1cdd
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""GPT2 zero-shot evaluation."""
import math

import torch

from megatron import get_args
from megatron import get_tokenizer
from megatron import mpu
from megatron import print_rank_0
from megatron.checkpointing import load_checkpoint
from megatron.model import GPT2Model
from megatron.training import get_model
from megatron.utils import get_ltor_masks_and_position_ids
from tasks.finetune_utils import build_data_loader

from .datasets import build_dataset
def get_model_provider(eval_metric):
    """Based on evaluation metric set the parallel-output flag and
    return the model provider."""

    def model_provider():
        """Build the model."""

        if eval_metric == 'loss':
            parallel_output = True
        elif eval_metric == 'accuracy':
            parallel_output = False
        else:
            raise NotImplementedError('output type for {} evaluation metric '
                                      'is not supported.'.format(eval_metric))

        print_rank_0('building GPT2 model ...')
        model = GPT2Model(num_tokentypes=0, parallel_output=parallel_output)

        return model

    return model_provider
def process_batch(batch):
    """Process batch and produce inputs for the model."""

    args = get_args()
    tokenizer = get_tokenizer()

    loss_mask = batch['pad_mask'].long().cuda().contiguous().byte()
    tokens_ = batch['text'].long().cuda().contiguous()
    labels = tokens_[:, 1:].contiguous()
    tokens = tokens_[:, :-1].contiguous()

    # Get the masks and position ids.
    attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
        tokens,
        tokenizer.eod,
        args.reset_position_ids,
        args.reset_attention_mask,
        args.eod_mask_loss,
        args.fp16)

    return tokens, labels, attention_mask, position_ids, loss_mask
def forward_step(batch, model, eval_metric):
    """Forward step."""

    # Get the batch.
    tokens, labels, attention_mask, position_ids, loss_mask = process_batch(
        batch)

    # Forward model.
    output = model(tokens, position_ids, attention_mask)

    # For loss, return the unreduced loss.
    if eval_metric == 'loss':
        losses = mpu.vocab_parallel_cross_entropy(
            output.contiguous().float(), labels.contiguous())
        loss = torch.sum(
            losses.view(-1) * loss_mask.contiguous().view(-1).float())
        return loss

    # For accuracy, return the number of correctly predicted samples.
    if eval_metric == 'accuracy':
        outputs = torch.argmax(output, -1)
        correct = (outputs == labels).float()
        correct[(1 - loss_mask).bool()] = 1
        correct = correct.prod(-1)
        return correct.sum()

    raise NotImplementedError('forward method for evaluation metric {} '
                              'is not implemented.'.format(eval_metric))
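
A minimal sketch (not part of the file) of the accuracy reduction above: a sample counts as correct only when every unmasked target position is predicted correctly, which matters when the LAMBADA last word spans several BPE tokens.

    import torch

    predictions = torch.tensor([[7, 2, 9], [7, 2, 9]])
    labels      = torch.tensor([[7, 2, 9], [7, 5, 9]])
    loss_mask   = torch.tensor([[0, 1, 1], [0, 1, 1]])  # only last-word tokens count
    correct = (predictions == labels).float()
    correct[(1 - loss_mask).bool()] = 1   # positions outside the target always "count"
    print(correct.prod(-1))               # tensor([1., 0.]) -> 1 of 2 samples correct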
def evaluate(data_loader, model, eval_metric):
    """Evaluation."""
    args = get_args()

    # Turn on evaluation mode which disables dropout.
    model.eval()

    total_output = 0.0
    with torch.no_grad():
        # For all the batches in the dataset.
        for iteration, batch in enumerate(data_loader):
            if iteration % args.log_interval == 0:
                print_rank_0('> working on iteration: {}'.format(iteration))
            # Forward evaluation.
            output = forward_step(batch, model, eval_metric)

            # Reduce across processes.
            torch.distributed.all_reduce(output,
                                         group=mpu.get_data_parallel_group())

            total_output += output

    return total_output
def evaluate_and_print_results(task, data_loader, model, eval_metric):
    """Evaluate and print results on screen."""

    # Evaluate and get results.
    output = evaluate(data_loader, model, eval_metric)

    string = ' validation results on {} | '.format(task)
    if eval_metric == 'loss':
        num_tokenized_tokens = data_loader.dataset.num_tokenized_tokens
        num_original_tokens = data_loader.dataset.num_original_tokens
        val_loss = output / (num_tokenized_tokens - 1)
        ppl = math.exp(min(20, val_loss))
        token_ratio = (num_tokenized_tokens - 1) / (num_original_tokens - 1)
        adjusted_ppl = math.exp(min(20, val_loss * token_ratio))
        string += 'avg loss: {:.4E} | '.format(val_loss)
        string += 'ppl: {:.4E} | '.format(ppl)
        string += 'adjusted ppl: {:.4E} | '.format(adjusted_ppl)
        string += 'token ratio: {} |'.format(token_ratio)

    elif eval_metric == 'accuracy':
        num_examples = len(data_loader.dataset)
        acc = output / num_examples
        string += 'number correct: {:.4E} | '.format(output)
        string += 'total examples: {:.4E} | '.format(num_examples)
        string += 'avg accuracy: {:.4E}'.format(acc)

    else:
        raise NotImplementedError('evaluation method for {} metric is not '
                                  'implemented yet.'.format(eval_metric))

    length = len(string) + 1
    print_rank_0('-' * length)
    print_rank_0(string)
    print_rank_0('-' * length)
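
The adjusted perplexity above rescales the average loss from BPE tokens back to the whitespace token count of the raw file, so numbers remain comparable with perplexities reported per original token. A small arithmetic sketch with made-up counts:

    import math

    total_loss = 350000.0          # sum of per-token losses (hypothetical)
    num_tokenized_tokens = 120000  # BPE tokens after detokenization (hypothetical)
    num_original_tokens = 100000   # whitespace tokens in the raw file (hypothetical)

    val_loss = total_loss / (num_tokenized_tokens - 1)           # ~2.92 nats per BPE token
    token_ratio = (num_tokenized_tokens - 1) / (num_original_tokens - 1)
    ppl = math.exp(min(20, val_loss))                            # ~18.5
    adjusted_ppl = math.exp(min(20, val_loss * token_ratio))     # ~33.1, per original token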
def main():
    """Main program."""
    args = get_args()

    if args.task == 'LAMBADA':
        eval_metric = 'accuracy'
    elif args.task == 'WIKITEXT103':
        eval_metric = 'loss'
    else:
        raise NotImplementedError('{} task is not implemented.'.format(
            args.task))

    # Set up model and load checkpoint.
    model = get_model(get_model_provider(eval_metric))
    if args.load is not None:
        _ = load_checkpoint(model, None, None)

    # Data stuff.
    dataset = build_dataset(args.task)
    dataloader = build_data_loader(dataset, args.batch_size,
                                   args.num_workers, drop_last=False)

    # Run evaluation.
    evaluate_and_print_results(args.task, dataloader, model, eval_metric)

    print_rank_0('done :-)')