Commit 9e8a8c05 authored by jerrrrry
Initial commit
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import argparse
import unittest
import torch
from fairseq.sequence_scorer import SequenceScorer
import tests.utils as test_utils
class TestSequenceScorer(unittest.TestCase):
def test_sequence_scorer(self):
# construct dummy dictionary
d = test_utils.dummy_dictionary(vocab_size=2)
self.assertEqual(d.pad(), 1)
self.assertEqual(d.eos(), 2)
self.assertEqual(d.unk(), 3)
eos = d.eos()
w1 = 4
w2 = 5
# construct dataloader
data = [
{
'source': torch.LongTensor([w1, w2, eos]),
'target': torch.LongTensor([w1, w2, w1, eos]),
},
{
'source': torch.LongTensor([w2, eos]),
'target': torch.LongTensor([w2, w1, eos]),
},
{
'source': torch.LongTensor([w2, eos]),
'target': torch.LongTensor([w2, eos]),
},
]
data_itr = test_utils.dummy_dataloader(data)
# specify expected output probabilities
args = argparse.Namespace()
unk = 0.
args.beam_probs = [
# step 0:
torch.FloatTensor([
# eos  unk  w1   w2
[0.0, unk, 0.6, 0.4], # sentence 1
[0.0, unk, 0.4, 0.6], # sentence 2
[0.0, unk, 0.7, 0.3], # sentence 3
]),
# step 1:
torch.FloatTensor([
# eos  unk  w1   w2
[0.0, unk, 0.2, 0.7], # sentence 1
[0.0, unk, 0.8, 0.2], # sentence 2
[0.7, unk, 0.1, 0.2], # sentence 3
]),
# step 2:
torch.FloatTensor([
# eos  unk  w1   w2
[0.10, unk, 0.50, 0.4], # sentence 1
[0.15, unk, 0.15, 0.7], # sentence 2
[0.00, unk, 0.00, 0.0], # sentence 3
]),
# step 3:
torch.FloatTensor([
# eos  unk  w1   w2
[0.9, unk, 0.05, 0.05], # sentence 1
[0.0, unk, 0.00, 0.0], # sentence 2
[0.0, unk, 0.00, 0.0], # sentence 3
]),
]
expected_scores = [
[0.6, 0.7, 0.5, 0.9], # sentence 1
[0.6, 0.8, 0.15], # sentence 2
[0.3, 0.7], # sentence 3
]
task = test_utils.TestTranslationTask.setup_task(args, d, d)
model = task.build_model(args)
scorer = SequenceScorer([model], task.target_dictionary)
for id, _src, _ref, hypos in scorer.score_batched_itr(data_itr):
self.assertHypoTokens(hypos[0], data[id]['target'])
self.assertHypoScore(hypos[0], expected_scores[id])
def assertHypoTokens(self, hypo, tokens):
self.assertTensorEqual(hypo['tokens'], torch.LongTensor(tokens))
def assertHypoScore(self, hypo, pos_probs, normalized=True, lenpen=1.):
pos_scores = torch.FloatTensor(pos_probs).log()
self.assertAlmostEqual(hypo['positional_scores'], pos_scores)
self.assertEqual(pos_scores.numel(), hypo['tokens'].numel())
score = pos_scores.sum()
if normalized:
score /= pos_scores.numel()**lenpen
self.assertLess(abs(score - hypo['score']), 1e-6)
def assertAlmostEqual(self, t1, t2):
self.assertEqual(t1.size(), t2.size(), "size mismatch")
self.assertLess((t1 - t2).abs().max(), 1e-4)
def assertTensorEqual(self, t1, t2):
self.assertEqual(t1.size(), t2.size(), "size mismatch")
self.assertEqual(t1.ne(t2).long().sum(), 0)
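# Illustrative sketch (not part of the original test): the normalized score
# checked by assertHypoScore above is sum(log p) / num_tokens**lenpen.
# The probabilities below are hypothetical toy values.
def _example_normalized_score():
    pos_probs = torch.FloatTensor([0.5, 0.25])
    pos_scores = pos_probs.log()                # tensor([-0.6931, -1.3863])
    lenpen = 1.
    score = pos_scores.sum() / pos_scores.numel() ** lenpen
    return score.item()                         # roughly -1.0397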
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import contextlib
from io import StringIO
import unittest
from unittest.mock import MagicMock, patch
import torch
from fairseq import data
import train
def mock_trainer(epoch, num_updates, iterations_in_epoch):
trainer = MagicMock()
trainer.load_checkpoint.return_value = {
'train_iterator': {
'epoch': epoch,
'iterations_in_epoch': iterations_in_epoch,
'shuffle': False,
},
}
trainer.get_num_updates.return_value = num_updates
return trainer
def mock_dict():
d = MagicMock()
d.pad.return_value = 1
d.eos.return_value = 2
d.unk.return_value = 3
return d
def get_trainer_and_epoch_itr(epoch, epoch_size, num_updates, iterations_in_epoch):
tokens = torch.LongTensor(list(range(epoch_size)))
tokens_ds = data.TokenBlockDataset(tokens, [len(tokens)], 1, include_targets=False)
trainer = mock_trainer(epoch, num_updates, iterations_in_epoch)
epoch_itr = data.EpochBatchIterator(
dataset=data.LanguagePairDataset(tokens_ds, tokens_ds.sizes, mock_dict(), shuffle=False),
max_tokens=1,
)
return trainer, epoch_itr
class TestLoadCheckpoint(unittest.TestCase):
def setUp(self):
self.patches = {
'os.makedirs': MagicMock(),
'os.path.join': MagicMock(),
'os.path.isfile': MagicMock(return_value=True),
}
self.applied_patches = [patch(p, d) for p, d in self.patches.items()]
[p.start() for p in self.applied_patches]
def test_load_partial_checkpoint(self):
with contextlib.redirect_stdout(StringIO()):
trainer, epoch_itr = get_trainer_and_epoch_itr(2, 150, 200, 50)
train.load_checkpoint(MagicMock(), trainer, epoch_itr)
self.assertEqual(epoch_itr.epoch, 2)
self.assertEqual(epoch_itr.iterations_in_epoch, 50)
itr = epoch_itr.next_epoch_itr(shuffle=False)
self.assertEqual(epoch_itr.epoch, 2)
self.assertEqual(epoch_itr.iterations_in_epoch, 50)
self.assertEqual(next(itr)['net_input']['src_tokens'][0].item(), 50)
self.assertEqual(epoch_itr.iterations_in_epoch, 51)
def test_load_full_checkpoint(self):
with contextlib.redirect_stdout(StringIO()):
trainer, epoch_itr = get_trainer_and_epoch_itr(2, 150, 300, 150)
train.load_checkpoint(MagicMock(), trainer, epoch_itr)
itr = epoch_itr.next_epoch_itr(shuffle=False)
self.assertEqual(epoch_itr.epoch, 3)
self.assertEqual(epoch_itr.iterations_in_epoch, 0)
self.assertEqual(next(itr)['net_input']['src_tokens'][0].item(), 0)
def test_load_no_checkpoint(self):
with contextlib.redirect_stdout(StringIO()):
trainer, epoch_itr = get_trainer_and_epoch_itr(0, 150, 0, 0)
self.patches['os.path.isfile'].return_value = False
train.load_checkpoint(MagicMock(), trainer, epoch_itr)
itr = epoch_itr.next_epoch_itr(shuffle=False)
self.assertEqual(epoch_itr.epoch, 1)
self.assertEqual(epoch_itr.iterations_in_epoch, 0)
self.assertEqual(next(itr)['net_input']['src_tokens'][0].item(), 0)
def tearDown(self):
patch.stopall()
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import unittest
import torch
from fairseq import utils
class TestUtils(unittest.TestCase):
def test_convert_padding_direction(self):
pad = 1
left_pad = torch.LongTensor([
[2, 3, 4, 5, 6],
[1, 7, 8, 9, 10],
[1, 1, 1, 11, 12],
])
right_pad = torch.LongTensor([
[2, 3, 4, 5, 6],
[7, 8, 9, 10, 1],
[11, 12, 1, 1, 1],
])
self.assertAlmostEqual(
right_pad,
utils.convert_padding_direction(
left_pad,
pad,
left_to_right=True,
),
)
self.assertAlmostEqual(
left_pad,
utils.convert_padding_direction(
right_pad,
pad,
right_to_left=True,
),
)
def test_make_positions(self):
pad = 1
left_pad_input = torch.LongTensor([
[9, 9, 9, 9, 9],
[1, 9, 9, 9, 9],
[1, 1, 1, 9, 9],
])
left_pad_output = torch.LongTensor([
[2, 3, 4, 5, 6],
[1, 2, 3, 4, 5],
[1, 1, 1, 2, 3],
])
right_pad_input = torch.LongTensor([
[9, 9, 9, 9, 9],
[9, 9, 9, 9, 1],
[9, 9, 1, 1, 1],
])
right_pad_output = torch.LongTensor([
[2, 3, 4, 5, 6],
[2, 3, 4, 5, 1],
[2, 3, 1, 1, 1],
])
self.assertAlmostEqual(
left_pad_output,
utils.make_positions(left_pad_input, pad, left_pad=True),
)
self.assertAlmostEqual(
right_pad_output,
utils.make_positions(right_pad_input, pad, left_pad=False),
)
def assertAlmostEqual(self, t1, t2):
self.assertEqual(t1.size(), t2.size(), "size mismatch")
self.assertLess(utils.item((t1 - t2).abs().max()), 1e-4)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import torch
from fairseq import utils
from fairseq.data import Dictionary
from fairseq.data.language_pair_dataset import collate
from fairseq.models import (
FairseqEncoder,
FairseqIncrementalDecoder,
FairseqModel,
)
from fairseq.tasks import FairseqTask
def dummy_dictionary(vocab_size, prefix='token_'):
d = Dictionary()
for i in range(vocab_size):
token = prefix + str(i)
d.add_symbol(token)
d.finalize(padding_factor=1) # don't add extra padding symbols
return d
def dummy_dataloader(
samples,
padding_idx=1,
eos_idx=2,
batch_size=None,
):
if batch_size is None:
batch_size = len(samples)
# add any missing data to samples
for i, sample in enumerate(samples):
if 'id' not in sample:
sample['id'] = i
# create dataloader
dataset = TestDataset(samples)
dataloader = torch.utils.data.DataLoader(
dataset,
batch_size=batch_size,
collate_fn=(lambda samples: collate(samples, padding_idx, eos_idx)),
)
return iter(dataloader)
class TestDataset(torch.utils.data.Dataset):
def __init__(self, data):
super().__init__()
self.data = data
def __getitem__(self, index):
return self.data[index]
def __len__(self):
return len(self.data)
class TestTranslationTask(FairseqTask):
def __init__(self, args, src_dict, tgt_dict, model):
super().__init__(args)
self.src_dict = src_dict
self.tgt_dict = tgt_dict
self.model = model
@classmethod
def setup_task(cls, args, src_dict=None, tgt_dict=None, model=None):
return cls(args, src_dict, tgt_dict, model)
def build_model(self, args):
return TestModel.build_model(args, self)
@property
def source_dictionary(self):
return self.src_dict
@property
def target_dictionary(self):
return self.tgt_dict
class TestModel(FairseqModel):
def __init__(self, encoder, decoder):
super().__init__(encoder, decoder)
@classmethod
def build_model(cls, args, task):
encoder = TestEncoder(args, task.source_dictionary)
decoder = TestIncrementalDecoder(args, task.target_dictionary)
return cls(encoder, decoder)
class TestEncoder(FairseqEncoder):
def __init__(self, args, dictionary):
super().__init__(dictionary)
self.args = args
def forward(self, src_tokens, src_lengths):
return src_tokens
def reorder_encoder_out(self, encoder_out, new_order):
return encoder_out.index_select(0, new_order)
class TestIncrementalDecoder(FairseqIncrementalDecoder):
def __init__(self, args, dictionary):
super().__init__(dictionary)
assert hasattr(args, 'beam_probs') or hasattr(args, 'probs')
args.max_decoder_positions = getattr(args, 'max_decoder_positions', 100)
self.args = args
def forward(self, prev_output_tokens, encoder_out, incremental_state=None):
if incremental_state is not None:
prev_output_tokens = prev_output_tokens[:, -1:]
bbsz = prev_output_tokens.size(0)
vocab = len(self.dictionary)
src_len = encoder_out.size(1)
tgt_len = prev_output_tokens.size(1)
# determine number of steps
if incremental_state is not None:
# cache step number
step = utils.get_incremental_state(self, incremental_state, 'step')
if step is None:
step = 0
utils.set_incremental_state(self, incremental_state, 'step', step + 1)
steps = [step]
else:
steps = list(range(tgt_len))
# define output in terms of raw probs
if hasattr(self.args, 'probs'):
assert self.args.probs.dim() == 3, \
'expected probs to have size bsz*steps*vocab'
probs = self.args.probs.index_select(1, torch.LongTensor(steps))
else:
probs = torch.FloatTensor(bbsz, len(steps), vocab).zero_()
for i, step in enumerate(steps):
# args.beam_probs gives the probability for every vocab element,
# starting with eos, then unknown, and then the rest of the vocab
if step < len(self.args.beam_probs):
probs[:, i, self.dictionary.eos():] = self.args.beam_probs[step]
else:
probs[:, i, self.dictionary.eos()] = 1.0
# random attention
attn = torch.rand(bbsz, tgt_len, src_len)
return probs, attn
def get_normalized_probs(self, net_output, log_probs, _):
# the decoder returns probabilities directly
probs = net_output[0]
if log_probs:
return probs.log()
else:
return probs
def max_positions(self):
return self.args.max_decoder_positions
#!/usr/bin/env python3 -u
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
import collections
import itertools
import os
import math
import torch
import time
import ctypes
import random
import sys
import unicodedata
import six
import re
import gc
from copy import deepcopy
from functools import reduce
from six.moves import xrange
import numpy as np
from fairseq import data, distributed_utils, options, progress_bar, tasks, utils, tokenizer
from fairseq.trainer import Trainer
from fairseq.meters import AverageMeter, StopwatchMeter, TimeMeter
from fairseq.sequence_generator import SequenceGenerator
from fairseq.data import dictionary
from fairseq.data import language_pair_dataset
from mlperf_log_utils import log_start, log_end, log_event, barrier
from mlperf_logging.mllog import constants
from mlperf_logging import mllog
def generate_seeds(rng, size):
"""
Generate list of random seeds
:param rng: random number generator
:param size: length of the returned list
"""
seeds = [rng.randint(0, 2**32 - 1) for _ in range(size)]
return seeds
def broadcast_seeds(seeds, device):
"""
Broadcasts random seeds to all distributed workers.
Returns the list of random seeds (broadcast from the worker with rank 0).
:param seeds: list of seeds (integers)
:param device: torch.device
"""
if torch.distributed.is_available() and torch.distributed.is_initialized():
seeds_tensor = torch.LongTensor(seeds).to(device)
torch.distributed.broadcast(seeds_tensor, 0)
seeds = seeds_tensor.tolist()
return seeds
def setup_seeds(master_seed, epochs, device, rank, world_size):
"""
Generates seeds from one master_seed.
Returns (worker_seeds, shuffling_seeds): worker_seeds are later used to
initialize per-worker random number generators (mostly for dropouts), and
shuffling_seeds seed the RNGs responsible for reshuffling the dataset
before each epoch.
Seeds are generated on the worker with rank 0 and broadcast to all other
workers.
:param master_seed: master RNG seed used to initialize other generators
:param epochs: number of epochs
:param device: torch.device (used for distributed.broadcast)
:param rank: distributed rank of this worker
:param world_size: number of distributed workers
"""
if master_seed is None:
# random master seed, random.SystemRandom() uses /dev/urandom on Unix
master_seed = random.SystemRandom().randint(0, 2**32 - 1)
if rank == 0:
# the master seed is reported only from the rank=0 worker to avoid
# confusion; seeds generated on rank=0 are later broadcast to the
# other workers
print(f'Using random master seed: {master_seed}')
else:
# master seed was specified from command line
print(f'Using master seed from command line: {master_seed}')
# initialize seeding RNG
seeding_rng = random.Random(master_seed)
# generate worker seeds, one seed for every distributed worker
worker_seeds = generate_seeds(seeding_rng, world_size)
# generate seeds for data shuffling, one seed for every epoch
shuffling_seeds = generate_seeds(seeding_rng, epochs)
# broadcast seeds from rank=0 to other workers
worker_seeds = broadcast_seeds(worker_seeds, device)
shuffling_seeds = broadcast_seeds(shuffling_seeds, device)
return worker_seeds, shuffling_seeds
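# Illustrative sketch (hypothetical helper, not called by the training code):
# shows what setup_seeds() yields in a single-process, non-distributed setup,
# where broadcast_seeds() is a no-op and both lists come straight from the
# master-seeded RNG.
def _example_setup_seeds():
    worker_seeds, shuffling_seeds = setup_seeds(
        master_seed=1234, epochs=3, device=torch.device('cpu'),
        rank=0, world_size=1)
    # one seed per worker, one shuffling seed per epoch
    assert len(worker_seeds) == 1 and len(shuffling_seeds) == 3
    return worker_seeds, shuffling_seeds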
def main(args):
if not torch.cuda.is_available():
raise NotImplementedError('Training on CPU is not supported')
torch.cuda.set_device(args.device_id)
mllog.config(filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'transformer.log'))
mllogger = mllog.get_mllogger()
mllogger.logger.propagate = False
log_start(key=constants.INIT_START, log_all_ranks=True)
# preinit and warmup streams/groups for allreduce communicators
allreduce_communicators=None
if args.distributed_world_size > 1 and args.enable_parallel_backward_allred_opt:
allreduce_groups = [torch.distributed.new_group() for _ in range(args.parallel_backward_allred_cuda_nstreams)]
allreduce_streams = [torch.cuda.Stream() for _ in range(args.parallel_backward_allred_cuda_nstreams)]
for group, stream in zip(allreduce_groups,allreduce_streams):
with torch.cuda.stream(stream):
torch.distributed.all_reduce(torch.cuda.FloatTensor(1), group=group)
allreduce_communicators=(allreduce_groups,allreduce_streams)
if args.max_tokens is None:
args.max_tokens = 6000
print(args)
log_event(key=constants.GLOBAL_BATCH_SIZE, value=args.max_tokens*args.distributed_world_size)
log_event(key=constants.OPT_NAME, value=args.optimizer)
assert(len(args.lr) == 1)
log_event(key=constants.OPT_BASE_LR, value=args.lr[0] if len(args.lr) == 1 else args.lr)
log_event(key=constants.OPT_LR_WARMUP_STEPS, value=args.warmup_updates)
assert(args.max_source_positions == args.max_target_positions)
log_event(key=constants.MAX_SEQUENCE_LENGTH, value=args.max_target_positions, metadata={'method': 'discard'})
log_event(key=constants.OPT_ADAM_BETA_1, value=eval(args.adam_betas)[0])
log_event(key=constants.OPT_ADAM_BETA_2, value=eval(args.adam_betas)[1])
log_event(key=constants.OPT_ADAM_EPSILON, value=args.adam_eps)
log_event(key=constants.SEED, value=args.seed)
# L2 Sector Promotion
pValue = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int))
#result = ctypes.CDLL('/opt/dtk-24.04.1/cuda/targets/x86_64-linux/lib/libcudart.so').cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_int(128))
#result = ctypes.CDLL('/opt/dtk-24.04.1/cuda/targets/x86_64-linux/lib/libcudart.so').cudaDeviceGetLimit(pValue, ctypes.c_int(0x05))
worker_seeds, shuffling_seeds = setup_seeds(args.seed, args.max_epoch + 1,
torch.device('cuda'),
args.distributed_rank,
args.distributed_world_size,
)
worker_seed = worker_seeds[args.distributed_rank]
print(f'Worker {args.distributed_rank} is using worker seed: {worker_seed}')
torch.manual_seed(worker_seed)
# Setup task, e.g., translation, language modeling, etc.
task = tasks.setup_task(args)
# Build model and criterion
model = task.build_model(args)
criterion = task.build_criterion(args)
print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
print('| num. model params: {}'.format(sum(p.numel() for p in model.parameters())))
# Build trainer
if args.fp16:
if args.distributed_weight_update != 0:
from fairseq.fp16_trainer import DistributedFP16Trainer
trainer = DistributedFP16Trainer(args, task, model, criterion, allreduce_communicators=allreduce_communicators)
else:
from fairseq.fp16_trainer import FP16Trainer
trainer = FP16Trainer(args, task, model, criterion, allreduce_communicators=allreduce_communicators)
else:
if torch.cuda.get_device_capability(0)[0] >= 7:
print('| NOTICE: your device may support faster training with --fp16')
trainer = Trainer(args, task, model, criterion, allreduce_communicators=None)
#if (args.online_eval or args.target_bleu) and not args.remove_bpe:
# args.remove_bpe='@@ '
print('| training on {} GPUs'.format(args.distributed_world_size))
print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(args.max_tokens, args.max_sentences, ))
# Initialize dataloader
max_positions = trainer.get_model().max_positions()
# Send a dummy batch to warm the caching allocator
dummy_batch = language_pair_dataset.get_dummy_batch_isolated(args.max_tokens, max_positions, 8)
trainer.dummy_train_step(dummy_batch)
# Train until the learning rate gets too small or model reaches target score
max_epoch = args.max_epoch if args.max_epoch >= 0 else math.inf
max_update = args.max_update or math.inf
tgt_bleu = args.target_bleu or math.inf
current_bleu = 0.0
lr = trainer.get_lr()
train_meter = StopwatchMeter()
train_meter.start()
valid_losses = [None]
# mlperf compliance synchronization
if args.distributed_world_size > 1:
assert(torch.distributed.is_initialized())
torch.distributed.all_reduce(torch.cuda.FloatTensor(1))
torch.cuda.synchronize()
log_end(key=constants.INIT_STOP, sync=False)
log_start(key=constants.RUN_START, sync=True)
# second sync after RUN_START tag is printed.
# this ensures no rank touches data until after RUN_START tag is printed.
barrier()
# Load dataset splits
load_dataset_splits(task, ['train', 'test'])
log_event(key=constants.TRAIN_SAMPLES,
value=len(task.dataset(args.train_subset)),
sync=False)
log_event(key=constants.EVAL_SAMPLES,
value=len(task.dataset(args.gen_subset)),
sync=False)
ctr = 0
start = time.time()
epoch_itr = data.EpochBatchIterator(
dataset=task.dataset(args.train_subset),
dataloader_num_workers=args.dataloader_num_workers,
dataloader_pin_memory=args.enable_dataloader_pin_memory,
max_tokens=args.max_tokens,
max_sentences=args.max_sentences_valid,
max_positions=max_positions,
ignore_invalid_inputs=True,
required_batch_size_multiple=8,
seeds=shuffling_seeds,
num_shards=args.distributed_world_size,
shard_id=args.distributed_rank,
epoch=epoch_itr.epoch if ctr != 0 else 0,
bucket_growth_factor=args.bucket_growth_factor,
seq_len_multiple=args.seq_len_multiple,
batching_scheme=args.batching_scheme,
batch_multiple_strategy=args.batch_multiple_strategy,
)
print("got epoch iterator", time.time() - start)
# Main training loop
while lr >= args.min_lr and epoch_itr.epoch < max_epoch and trainer.get_num_updates() < max_update and current_bleu < tgt_bleu:
first_epoch = epoch_itr.epoch+1
log_start(key=constants.BLOCK_START,
metadata={'first_epoch_num': first_epoch, 'epoch_count': 1},
sync=False)
log_start(key=constants.EPOCH_START, metadata={'epoch_num': first_epoch}, sync=False)
gc.disable()
# Load the latest checkpoint if one is available
if ctr == 0:
load_checkpoint(args, trainer, epoch_itr)
# train for one epoch
start = time.time()
#exit(1)
train(args, trainer, task, epoch_itr, shuffling_seeds)
print("epoch time ", time.time() - start)
start = time.time()
log_end(key=constants.EPOCH_STOP, metadata={'epoch_num': first_epoch}, sync=False)
# Eval BLEU score
if args.online_eval or tgt_bleu is not math.inf:
current_bleu = score(args, trainer, task, epoch_itr, args.gen_subset)
log_event(key=constants.EVAL_ACCURACY,
value=float(current_bleu) / 100.0,
metadata={'epoch_num': first_epoch})
gc.enable()
# Only use first validation loss to update the learning rate
#lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])
# Save checkpoint
#if epoch_itr.epoch % args.save_interval == 0:
# save_checkpoint(args, trainer, epoch_itr, valid_losses[0])
ctr = ctr + 1
print("validation and scoring ", time.time() - start)
log_end(key=constants.BLOCK_STOP,
metadata={'first_epoch_num': first_epoch},
sync=False)
train_meter.stop()
status = 'success' if current_bleu >= tgt_bleu else 'aborted'
log_end(key=constants.RUN_STOP,
metadata={'status': status})
print('| done training in {:.1f} seconds'.format(train_meter.sum))
def train(args, trainer, task, epoch_itr, shuffling_seeds):
"""Train the model for one epoch."""
# Initialize data iterator
itr = epoch_itr.next_epoch_itr()
progress = progress_bar.build_progress_bar(args, itr, epoch_itr.epoch, no_progress_bar='simple')
# update parameters every N batches
if epoch_itr.epoch <= len(args.update_freq):
update_freq = args.update_freq[epoch_itr.epoch - 1]
else:
update_freq = args.update_freq[-1]
if args.enable_parallel_backward_allred_opt and update_freq > 1:
raise RuntimeError('--enable-parallel-backward-allred-opt is incompatible with --update-freq > 1')
extra_meters = collections.defaultdict(lambda: AverageMeter())
first_valid = args.valid_subset.split(',')[0]
max_update = args.max_update or math.inf
num_batches = len(epoch_itr)
if args.time_step:
begin = time.time()
end = time.time()
count = 0
#profile_count = 13
profile_count = 10000000000
for i, sample in enumerate(progress, start=epoch_itr.iterations_in_epoch):
if args.time_step:
start_step = time.time()
if i < num_batches - 1 and (i + 1) % update_freq > 0:
# buffer updates according to --update-freq
trainer.train_step(sample, update_params=False, last_step=(i == len(itr)-1))
continue
else:
log_output = trainer.train_step(sample, update_params=True, last_step=(i == len(itr)-1))
if args.time_step:
end_step = time.time()
#if count > 10 and sample['target'].size(0) > 248 :
seqs = sample['target'].size(0)
srclen = sample['net_input']['src_tokens'].size(1)
tgtlen = sample['target'].size(1)
srcbatch = srclen * seqs
tgtbatch = tgtlen * seqs
#print("ITER {}> Seqs: {} SrcLen: {} TgtLen: {} Src Batch: {} Tgt Batch {}".format( count, seqs, srclen, tgtlen, srcbatch, tgtbatch))
print("ITER {}> Seqs: {} SrcLen: {} TgtLen: {} Total Time: {:.3} Step Time: {:.3} Load Time: {:.3}".format( \
count, \
sample['target'].size(0), \
sample['net_input']['src_tokens'].size(1), \
sample['target'].size(1), \
(end_step-begin)*1000.0, \
(end_step-start_step)*1000.0, \
(start_step-end)*1000.0))
count += 1
begin = time.time()
# log mid-epoch stats
stats = get_training_stats(trainer)
for k, v in log_output.items():
if k in ['loss', 'nll_loss', 'sample_size']:
continue # these are already logged above
if 'loss' in k:
extra_meters[k].update(v, log_output['sample_size'])
else:
extra_meters[k].update(v)
stats[k] = extra_meters[k].avg
progress.log(stats)
# ignore the first mini-batch in words-per-second calculation
if i == 0:
trainer.get_meter('wps').reset()
if args.profile is not None and i == args.profile:
import sys
sys.exit()
num_updates = trainer.get_num_updates()
if args.save_interval_updates > 0 and num_updates % args.save_interval_updates == 0:
valid_losses = validate(args, trainer, task, epoch_itr,
[first_valid], shuffling_seeds)
save_checkpoint(args, trainer, epoch_itr, valid_losses[0])
if num_updates >= max_update:
break
if args.time_step:
end = time.time()
# log end-of-epoch stats
stats = get_training_stats(trainer)
for k, meter in extra_meters.items():
stats[k] = meter.avg
progress.print(stats)
# reset training meters
for k in ['train_loss', 'train_nll_loss', 'wps', 'ups', 'wpb', 'bsz', 'clip']:
meter = trainer.get_meter(k)
if meter is not None:
meter.reset()
def get_training_stats(trainer):
stats = collections.OrderedDict()
stats['loss'] = '{:.3f}'.format(trainer.get_meter('train_loss').avg)
if trainer.get_meter('train_nll_loss').count > 0:
nll_loss = trainer.get_meter('train_nll_loss').avg
stats['nll_loss'] = '{:.3f}'.format(nll_loss)
else:
nll_loss = trainer.get_meter('train_loss').avg
stats['ppl'] = get_perplexity(nll_loss)
stats['wps'] = round(trainer.get_meter('wps').avg)
stats['ups'] = '{:.1f}'.format(trainer.get_meter('ups').avg)
stats['wpb'] = round(trainer.get_meter('wpb').avg)
stats['bsz'] = round(trainer.get_meter('bsz').avg)
stats['num_updates'] = trainer.get_num_updates()
stats['lr'] = trainer.get_lr()
stats['gnorm'] = '{:.3f}'.format(trainer.get_meter('gnorm').avg)
stats['clip'] = '{:.0%}'.format(trainer.get_meter('clip').avg)
stats['oom'] = trainer.get_meter('oom').avg
if trainer.get_meter('loss_scale') is not None:
stats['loss_scale'] = '{:.3f}'.format(trainer.get_meter('loss_scale').avg)
stats['wall'] = round(trainer.get_meter('wall').elapsed_time)
return stats
def validate(args, trainer, task, epoch_itr, subsets, shuffling_seeds):
"""Evaluate the model on the validation set(s) and return the losses."""
valid_losses = []
for subset in subsets:
# Initialize data iterator
itr = data.EpochBatchIterator(
dataset=task.dataset(subset),
max_tokens=args.max_tokens,
max_sentences=args.max_sentences_valid,
max_positions=trainer.get_model().max_positions(),
ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
required_batch_size_multiple=8,
seeds=shuffling_seeds,
num_shards=args.distributed_world_size,
shard_id=args.distributed_rank,
bucket_growth_factor=args.bucket_growth_factor,
seq_len_multiple=args.seq_len_multiple,
batching_scheme=args.batching_scheme,
batch_multiple_strategy=args.batch_multiple_strategy,
).next_epoch_itr(shuffle=False)
progress = progress_bar.build_progress_bar(
args, itr, epoch_itr.epoch,
prefix='valid on \'{}\' subset'.format(subset),
no_progress_bar='simple'
)
# reset validation loss meters
for k in ['valid_loss', 'valid_nll_loss']:
meter = trainer.get_meter(k)
if meter is not None:
meter.reset()
extra_meters = collections.defaultdict(lambda: AverageMeter())
for sample in progress:
log_output = trainer.valid_step(sample)
for k, v in log_output.items():
if k in ['loss', 'nll_loss', 'sample_size']:
continue
extra_meters[k].update(v)
# log validation stats
stats = get_valid_stats(trainer)
for k, meter in extra_meters.items():
stats[k] = meter.avg
progress.print(stats)
valid_losses.append(stats['valid_loss'])
return valid_losses
def _get_ngrams_with_counter(segment, max_order):
"""Extracts all n-grams up to a given maximum order from an input segment.
Args:
segment: text segment from which n-grams will be extracted.
max_order: maximum length in tokens of the n-grams returned by this
method.
Returns:
A Counter containing all n-grams up to max_order in segment,
with a count of how many times each n-gram occurred.
"""
ngram_counts = collections.Counter()
for order in xrange(1, max_order + 1):
for i in xrange(0, len(segment) - order + 1):
ngram = tuple(segment[i:i + order])
ngram_counts[ngram] += 1
return ngram_counts
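# Illustrative sketch (hypothetical tokens): what _get_ngrams_with_counter()
# returns for a tiny segment with max_order=2.
def _example_ngram_counts():
    counts = _get_ngrams_with_counter(['the', 'cat', 'the', 'cat'], max_order=2)
    # unigrams: ('the',) -> 2, ('cat',) -> 2
    # bigrams:  ('the', 'cat') -> 2, ('cat', 'the') -> 1
    assert counts[('the', 'cat')] == 2
    return counts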
class RefBleuStats:
def __init__(self, matches_by_order, possible_matches_by_order, reference_length, translation_length):
self.matches_by_order = matches_by_order
self.possible_matches_by_order = possible_matches_by_order
self.reference_length = reference_length
self.translation_length = translation_length
def __add__(self, other):
return RefBleuStats(
[a+b for a,b in zip(self.matches_by_order, other.matches_by_order)],
[a+b for a,b in zip(self.possible_matches_by_order, other.possible_matches_by_order)],
self.reference_length + other.reference_length,
self.translation_length + other.translation_length)
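# Illustrative sketch (toy numbers): how per-worker BLEU statistics are merged.
# In training, the per-worker RefBleuStats objects come from
# distributed_utils.all_gather_list() and are summed with reduce() inside
# compute_bleu(); here two hand-made objects stand in for two workers.
def _example_reduce_ref_bleu_stats():
    a = RefBleuStats([3, 2, 1, 0], [4, 3, 2, 1], 5, 4)
    b = RefBleuStats([2, 1, 0, 0], [3, 2, 1, 0], 4, 3)
    total = a + b  # element-wise sums of the n-gram counts plus summed lengths
    assert total.matches_by_order == [5, 3, 1, 0]
    assert total.reference_length == 9 and total.translation_length == 7
    return total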
def compute_bleu(reference_corpus, translation_corpus, args, max_order=4, use_bp=True):
"""Computes BLEU score of translated segments against one or more references.
Args:
reference_corpus: list of references for each translation. Each
reference should be tokenized into a list of tokens.
translation_corpus: list of translations to score. Each translation
should be tokenized into a list of tokens.
args: CLI arguments
max_order: Maximum n-gram order to use when computing BLEU score.
use_bp: boolean, whether to apply brevity penalty.
Returns:
BLEU score.
"""
reference_length = 0
translation_length = 0
bp = 1.0
geo_mean = 0
matches_by_order = [0] * max_order
possible_matches_by_order = [0] * max_order
precisions = []
for (references, translations) in zip(reference_corpus, translation_corpus):
reference_length += len(references)
translation_length += len(translations)
ref_ngram_counts = _get_ngrams_with_counter(references, max_order)
translation_ngram_counts = _get_ngrams_with_counter(translations, max_order)
overlap = dict((ngram, min(count, translation_ngram_counts[ngram])) for ngram, count in ref_ngram_counts.items())
for ngram in overlap:
matches_by_order[len(ngram) - 1] += overlap[ngram]
for ngram in translation_ngram_counts:
possible_matches_by_order[len(ngram) - 1] += translation_ngram_counts[ngram]
precisions = [0] * max_order
smooth = 1.0
# do reductions of matches_by_order and possible_matches_by_order
if args.distributed_world_size > 1:
stats = RefBleuStats(matches_by_order, possible_matches_by_order, reference_length, translation_length)
all_stats = distributed_utils.all_gather_list(stats)
stats = reduce(lambda a,b : a+b, all_stats)
matches_by_order = stats.matches_by_order
possible_matches_by_order = stats.possible_matches_by_order
reference_length = stats.reference_length
translation_length = stats.translation_length
for i in xrange(0, max_order):
if possible_matches_by_order[i] > 0:
precisions[i] = float(matches_by_order[i]) / possible_matches_by_order[i]
if matches_by_order[i] > 0:
precisions[i] = float(matches_by_order[i]) / possible_matches_by_order[i]
else:
smooth *= 2
precisions[i] = 1.0 / (smooth * possible_matches_by_order[i])
else:
precisions[i] = 0.0
if max(precisions) > 0:
p_log_sum = sum(math.log(p) for p in precisions if p)
geo_mean = math.exp(p_log_sum / max_order)
if use_bp:
if reference_length > 0:
ratio = translation_length / reference_length
bp = math.exp(1 - 1. / ratio) if ratio < 1.0 else 1.0
else:
bp = 1.0
bleu = geo_mean * bp
return np.float32(bleu)*100.0
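# Illustrative sketch (hypothetical sentences): single-process BLEU via
# compute_bleu(). Only args.distributed_world_size is consulted on this path,
# so a minimal namespace is enough; a perfect match scores 100.0.
def _example_compute_bleu():
    import argparse
    toy_args = argparse.Namespace(distributed_world_size=1)
    refs = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
    hyps = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
    return compute_bleu(refs, hyps, toy_args)  # -> 100.0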
def detokenize_subtokenized_sentence(subtokenized_sentence):
l1 = ' '.join(''.join(subtokenized_sentence.strip().split()).split('_'))
l1 = l1.replace(' ,',',')
l1 = l1.replace(' .','.')
l1 = l1.replace(' !','!')
l1 = l1.replace(' ?','?')
l1 = l1.replace(' \' ','\'')
l1 = l1.replace(' - ','-')
l1 = l1.strip()
return l1
class UnicodeRegex(object):
"""Ad-hoc hack to recognize all punctuation and symbols."""
def __init__(self):
punctuation = self.property_chars("P")
self.nondigit_punct_re = re.compile(r"([^\d])([" + punctuation + r"])")
self.punct_nondigit_re = re.compile(r"([" + punctuation + r"])([^\d])")
self.symbol_re = re.compile("([" + self.property_chars("S") + "])")
def property_chars(self, prefix):
return "".join(six.unichr(x) for x in range(sys.maxunicode)
if unicodedata.category(six.unichr(x)).startswith(prefix))
uregex = UnicodeRegex()
def bleu_tokenize(string):
r"""Tokenize a string following the official BLEU implementation.
See https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L954-L983
In our case, the input string is expected to be just one line
and no HTML entities de-escaping is needed.
So we just tokenize on punctuation and symbols,
except when a punctuation is preceded and followed by a digit
(e.g. a comma/dot as a thousand/decimal separator).
Note that a number (e.g. a year) followed by a dot at the end of a sentence
is NOT tokenized,
i.e. the dot stays with the number because `s/(\p{P})(\P{N})/ $1 $2/g`
does not match this case (unless we add a space after each sentence).
However, this error is already in the original mteval-v14.pl
and we want to be consistent with it.
Args:
string: the input string
Returns:
a list of tokens
"""
string = uregex.nondigit_punct_re.sub(r"\1 \2 ", string)
string = uregex.punct_nondigit_re.sub(r" \1 \2", string)
string = uregex.symbol_re.sub(r" \1 ", string)
return string.split()
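# Illustrative sketch (hypothetical input): bleu_tokenize() splits punctuation
# into separate tokens, except when it sits between digits or trails a number
# at the end of the string, as described in the docstring above.
def _example_bleu_tokenize():
    toks = bleu_tokenize("Hello, world! It costs 1,000.")
    assert toks == ['Hello', ',', 'world', '!', 'It', 'costs', '1,000.']
    return toks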
def score(args, trainer, task, epoch_itr, subset):
log_start(key=constants.EVAL_START, metadata={'epoch_num': epoch_itr.epoch}, sync=False)
begin = time.time()
if subset not in task.datasets.keys():
task.load_dataset(subset)
src_dict = deepcopy(task.source_dictionary) # deep copies are necessary: generating translations
tgt_dict = deepcopy(task.target_dictionary) # alters the target dictionary, which would interfere with the rest of training
model = trainer.get_model()
# Initialize data iterator
itr = data.EpochBatchIterator(
dataset=task.dataset(subset),
max_tokens=min(2560,args.max_tokens),
max_sentences=max(8,min((math.ceil(1024/args.distributed_world_size) // 4) * 4,128)),
max_positions=(256,256),
ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
required_batch_size_multiple=8,
num_shards=args.distributed_world_size,
shard_id=args.distributed_rank,
seq_len_multiple=args.seq_len_multiple,
# Use a large growth factor to get fewer buckets.
# Fewer buckets yield faster eval since batches are filled from single bucket
# and eval dataset is small.
bucket_growth_factor=1.2,
batching_scheme=args.batching_scheme,
batch_multiple_strategy=args.batch_multiple_strategy,
).next_epoch_itr(shuffle=False)
# Initialize generator
gen_timer = StopwatchMeter()
translator = SequenceGenerator(
[model], tgt_dict, beam_size=args.beam,
stop_early=(not args.no_early_stop), normalize_scores=(not args.unnormalized),
len_penalty=args.lenpen,
sampling=args.sampling, sampling_topk=args.sampling_topk, minlen=args.min_len,
)
# Generate and compute BLEU
ref_toks = []
sys_toks = []
num_sentences = 0
has_target = True
if args.log_translations:
log = open(os.path.join(args.save_dir, 'translations_epoch{}_{}'.format(epoch_itr.epoch, args.distributed_rank)), 'w+')
with progress_bar.build_progress_bar(args, itr) as progress:
translations = translator.generate_batched_itr(
progress, maxlen_a=args.max_len_a, maxlen_b=args.max_len_b,
cuda=True, timer=gen_timer, prefix_size=args.prefix_size,
)
wps_meter = TimeMeter()
for sample_id, src_tokens, target_tokens, hypos in translations:
# Process input and ground truth
has_target = target_tokens is not None
target_tokens = target_tokens.int().cpu() if has_target else None
src_str = src_dict.string(src_tokens, args.remove_bpe)
if has_target:
target_str = tgt_dict.string(target_tokens, args.remove_bpe)
if args.log_translations:
log.write('S-{}\t{}\n'.format(sample_id, src_str))
if has_target:
log.write('T-{}\t{}\n'.format(sample_id, target_str))
# Process top predictions
for i, hypo in enumerate(hypos[:min(len(hypos), args.nbest)]):
hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
hypo_tokens=hypo['tokens'].int().cpu(),
src_str=src_str,
alignment=hypo['alignment'].int().cpu() if hypo['alignment'] is not None else None,
align_dict = None,
tgt_dict=tgt_dict,
remove_bpe=args.remove_bpe
)
if args.log_translations:
log.write('H-{}\t{}\t{}\n'.format(sample_id, hypo['score'], hypo_str))
# log.write(str(hypo_tokens))
log.write('P-{}\t{}\n'.format(
sample_id,
' '.join(map(
lambda x: '{:.4f}'.format(x),
hypo['positional_scores'].tolist(),
))
))
# Score only the top hypothesis
if has_target and i==0:
src_str = detokenize_subtokenized_sentence(src_str)
target_str = detokenize_subtokenized_sentence(target_str)
hypo_str = detokenize_subtokenized_sentence(hypo_str)
sys_tok = bleu_tokenize((hypo_str.lower() if args.ignore_case else hypo_str))
ref_tok = bleu_tokenize((target_str.lower() if args.ignore_case else target_str))
sys_toks.append(sys_tok)
ref_toks.append(ref_tok)
wps_meter.update(src_tokens.size(0))
progress.log({'wps':round(wps_meter.avg)})
num_sentences += 1
bleu_score_reference = compute_bleu(ref_toks, sys_toks, args)
bleu_score_reference_str = '{:.4f}'.format(bleu_score_reference)
if args.log_translations:
log.close()
if gen_timer.sum != 0:
print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'.format(
num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1./gen_timer.avg))
if has_target:
print('| Generate {} with beam={}: bleu_score={}'.format(subset, args.beam, bleu_score_reference_str))
print('| Eval completed in: {:.2f}s'.format(time.time()-begin))
log_end(key=constants.EVAL_STOP, metadata={'epoch_num': epoch_itr.epoch}, sync=False)
return bleu_score_reference
def get_valid_stats(trainer):
stats = collections.OrderedDict()
stats['valid_loss'] = trainer.get_meter('valid_loss').avg
if trainer.get_meter('valid_nll_loss').count > 0:
nll_loss = trainer.get_meter('valid_nll_loss').avg
stats['valid_nll_loss'] = nll_loss
else:
nll_loss = trainer.get_meter('valid_loss').avg
stats['valid_ppl'] = get_perplexity(nll_loss)
stats['num_updates'] = trainer.get_num_updates()
if hasattr(save_checkpoint, 'best'):
stats['best'] = min(save_checkpoint.best, stats['valid_loss'])
return stats
def get_perplexity(loss):
try:
return '{:.2f}'.format(math.pow(2, loss))
except OverflowError:
return float('inf')
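# Illustrative sketch: get_perplexity() treats the loss as a base-2
# cross-entropy, so a loss of 3.0 corresponds to a perplexity of 2**3 = 8.
def _example_perplexity():
    assert get_perplexity(3.0) == '8.00'
    return get_perplexity(3.0)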
def save_checkpoint(args, trainer, epoch_itr, val_loss):
if args.no_save or not distributed_utils.is_master(args):
return
epoch = epoch_itr.epoch
end_of_epoch = epoch_itr.end_of_epoch()
updates = trainer.get_num_updates()
checkpoint_conds = collections.OrderedDict()
checkpoint_conds['checkpoint{}.pt'.format(epoch)] = (
end_of_epoch and not args.no_epoch_checkpoints and
epoch % args.save_interval == 0
)
checkpoint_conds['checkpoint_{}_{}.pt'.format(epoch, updates)] = (
not end_of_epoch and args.save_interval_updates > 0 and
updates % args.save_interval_updates == 0
)
checkpoint_conds['checkpoint_best.pt'] = (
val_loss is not None and
(not hasattr(save_checkpoint, 'best') or val_loss < save_checkpoint.best)
)
checkpoint_conds['checkpoint_last.pt'] = True # keep this last so that it's a symlink
prev_best = getattr(save_checkpoint, 'best', val_loss)
if val_loss is not None:
save_checkpoint.best = min(val_loss, prev_best)
extra_state = {
'best': save_checkpoint.best,
'train_iterator': epoch_itr.state_dict(),
'val_loss': val_loss,
}
checkpoints = [os.path.join(args.save_dir, fn) for fn, cond in checkpoint_conds.items() if cond]
if len(checkpoints) > 0:
for cp in checkpoints:
trainer.save_checkpoint(cp, extra_state)
if not end_of_epoch and args.keep_interval_updates > 0:
# remove old checkpoints; checkpoints are sorted in descending order
checkpoints = utils.checkpoint_paths(args.save_dir, pattern=r'checkpoint_\d+_(\d+)\.pt')
for old_chk in checkpoints[args.keep_interval_updates:]:
os.remove(old_chk)
def load_checkpoint(args, trainer, epoch_itr):
"""Load a checkpoint and replay dataloader to match."""
os.makedirs(args.save_dir, exist_ok=True)
checkpoint_path = os.path.join(args.save_dir, args.restore_file)
if os.path.isfile(checkpoint_path):
extra_state = trainer.load_checkpoint(checkpoint_path)
if extra_state is not None:
# replay train iterator to match checkpoint
epoch_itr.load_state_dict(extra_state['train_iterator'])
print('| loaded checkpoint {} (epoch {} @ {} updates)'.format(
checkpoint_path, epoch_itr.epoch, trainer.get_num_updates()))
trainer.lr_step(epoch_itr.epoch)
trainer.lr_step_update(trainer.get_num_updates())
if 'best' in extra_state:
save_checkpoint.best = extra_state['best']
def load_dataset_splits(task, splits):
for split in splits:
if split == 'train':
task.load_dataset(split, combine=True)
else:
for k in itertools.count():
split_k = split + (str(k) if k > 0 else '')
try:
task.load_dataset(split_k, combine=False)
except FileNotFoundError as e:
if k > 0:
break
raise e
if __name__ == '__main__':
parser = options.get_training_parser()
args = options.parse_args_and_arch(parser)
if args.distributed_port > 0 or args.distributed_init_method is not None:
from distributed_train import main as distributed_main
distributed_main(args)
elif args.distributed_world_size > 1:
from multiprocessing_train import main as multiprocessing_main
multiprocessing_main(args)
else:
main(args)
:::MLLOG {"namespace": "", "time_ms": 1728444225641, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728444225642, "event_type": "POINT_IN_TIME", "key": "global_batch_size", "value": 6000, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 133}}
:::MLLOG {"namespace": "", "time_ms": 1728444225642, "event_type": "POINT_IN_TIME", "key": "opt_name", "value": "adam", "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 134}}
:::MLLOG {"namespace": "", "time_ms": 1728444225643, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.25, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 136}}
:::MLLOG {"namespace": "", "time_ms": 1728444225643, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_steps", "value": 0, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 137}}
:::MLLOG {"namespace": "", "time_ms": 1728445229773, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728445229775, "event_type": "POINT_IN_TIME", "key": "global_batch_size", "value": 6000, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 133}}
:::MLLOG {"namespace": "", "time_ms": 1728445229775, "event_type": "POINT_IN_TIME", "key": "opt_name", "value": "adam", "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 134}}
:::MLLOG {"namespace": "", "time_ms": 1728445229775, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.25, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 136}}
:::MLLOG {"namespace": "", "time_ms": 1728445229775, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_steps", "value": 0, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 137}}
:::MLLOG {"namespace": "", "time_ms": 1728445371286, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728445371287, "event_type": "POINT_IN_TIME", "key": "global_batch_size", "value": 4096, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 133}}
:::MLLOG {"namespace": "", "time_ms": 1728445371287, "event_type": "POINT_IN_TIME", "key": "opt_name", "value": "adam", "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 134}}
:::MLLOG {"namespace": "", "time_ms": 1728445371287, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.25, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 136}}
:::MLLOG {"namespace": "", "time_ms": 1728445371287, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_steps", "value": 0, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 137}}
:::MLLOG {"namespace": "", "time_ms": 1728452615867, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728452615869, "event_type": "POINT_IN_TIME", "key": "global_batch_size", "value": 6000, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 133}}
:::MLLOG {"namespace": "", "time_ms": 1728452615869, "event_type": "POINT_IN_TIME", "key": "opt_name", "value": "adam", "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 134}}
:::MLLOG {"namespace": "", "time_ms": 1728452615869, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.25, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 136}}
:::MLLOG {"namespace": "", "time_ms": 1728452615869, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_steps", "value": 0, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 137}}
:::MLLOG {"namespace": "", "time_ms": 1728452615869, "event_type": "POINT_IN_TIME", "key": "max_sequence_length", "value": 1024, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 139, "method": "discard"}}
:::MLLOG {"namespace": "", "time_ms": 1728452615869, "event_type": "POINT_IN_TIME", "key": "opt_adam_beta_1", "value": 0.9, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 140}}
:::MLLOG {"namespace": "", "time_ms": 1728452615870, "event_type": "POINT_IN_TIME", "key": "opt_adam_beta_2", "value": 0.999, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 141}}
:::MLLOG {"namespace": "", "time_ms": 1728452615870, "event_type": "POINT_IN_TIME", "key": "opt_adam_epsilon", "value": 1e-08, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 142}}
:::MLLOG {"namespace": "", "time_ms": 1728452615870, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1234, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 143}}
:::MLLOG {"namespace": "", "time_ms": 1728452980089, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728452980091, "event_type": "POINT_IN_TIME", "key": "global_batch_size", "value": 6000, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 133}}
:::MLLOG {"namespace": "", "time_ms": 1728452980091, "event_type": "POINT_IN_TIME", "key": "opt_name", "value": "adam", "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 134}}
:::MLLOG {"namespace": "", "time_ms": 1728452980091, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.25, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 136}}
:::MLLOG {"namespace": "", "time_ms": 1728452980091, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_steps", "value": 0, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 137}}
:::MLLOG {"namespace": "", "time_ms": 1728452980091, "event_type": "POINT_IN_TIME", "key": "max_sequence_length", "value": 1024, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 139, "method": "discard"}}
:::MLLOG {"namespace": "", "time_ms": 1728452980091, "event_type": "POINT_IN_TIME", "key": "opt_adam_beta_1", "value": 0.9, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 140}}
:::MLLOG {"namespace": "", "time_ms": 1728452980092, "event_type": "POINT_IN_TIME", "key": "opt_adam_beta_2", "value": 0.999, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 141}}
:::MLLOG {"namespace": "", "time_ms": 1728452980092, "event_type": "POINT_IN_TIME", "key": "opt_adam_epsilon", "value": 1e-08, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 142}}
:::MLLOG {"namespace": "", "time_ms": 1728452980092, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1234, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 143}}
:::MLLOG {"namespace": "", "time_ms": 1728453530231, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728453530233, "event_type": "POINT_IN_TIME", "key": "global_batch_size", "value": 6000, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 133}}
:::MLLOG {"namespace": "", "time_ms": 1728453530233, "event_type": "POINT_IN_TIME", "key": "opt_name", "value": "adam", "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 134}}
:::MLLOG {"namespace": "", "time_ms": 1728453530233, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.25, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 136}}
:::MLLOG {"namespace": "", "time_ms": 1728453530233, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_steps", "value": 0, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 137}}
:::MLLOG {"namespace": "", "time_ms": 1728453530233, "event_type": "POINT_IN_TIME", "key": "max_sequence_length", "value": 1024, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 139, "method": "discard"}}
:::MLLOG {"namespace": "", "time_ms": 1728453530234, "event_type": "POINT_IN_TIME", "key": "opt_adam_beta_1", "value": 0.9, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 140}}
:::MLLOG {"namespace": "", "time_ms": 1728453530234, "event_type": "POINT_IN_TIME", "key": "opt_adam_beta_2", "value": 0.999, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 141}}
:::MLLOG {"namespace": "", "time_ms": 1728453530234, "event_type": "POINT_IN_TIME", "key": "opt_adam_epsilon", "value": 1e-08, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 142}}
:::MLLOG {"namespace": "", "time_ms": 1728453530234, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1234, "metadata": {"file": "/mnt/fs/user/llama/custom_model/mlcommons/training_results_v0.7/NVIDIA/benchmarks/transformer/implementations/pytorch/train.py", "lineno": 143}}
:::MLLOG {"namespace": "", "time_ms": 1728889462232, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889462232, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889462243, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889462246, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889463182, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889463193, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889463210, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889463219, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889463220, "event_type": "POINT_IN_TIME", "key": "global_batch_size", "value": 81920, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 133}}
:::MLLOG {"namespace": "", "time_ms": 1728889463220, "event_type": "POINT_IN_TIME", "key": "opt_name", "value": "adam", "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 134}}
:::MLLOG {"namespace": "", "time_ms": 1728889463220, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.0019, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 136}}
:::MLLOG {"namespace": "", "time_ms": 1728889463220, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_steps", "value": 750, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 137}}
:::MLLOG {"namespace": "", "time_ms": 1728889463220, "event_type": "POINT_IN_TIME", "key": "max_sequence_length", "value": 64, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 139, "method": "discard"}}
:::MLLOG {"namespace": "", "time_ms": 1728889463220, "event_type": "POINT_IN_TIME", "key": "opt_adam_beta_1", "value": 0.9, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 140}}
:::MLLOG {"namespace": "", "time_ms": 1728889463221, "event_type": "POINT_IN_TIME", "key": "opt_adam_beta_2", "value": 0.98, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 141}}
:::MLLOG {"namespace": "", "time_ms": 1728889463221, "event_type": "POINT_IN_TIME", "key": "opt_adam_epsilon", "value": 1e-09, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 142}}
:::MLLOG {"namespace": "", "time_ms": 1728889463221, "event_type": "POINT_IN_TIME", "key": "seed", "value": 22078, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 143}}
:::MLLOG {"namespace": "", "time_ms": 1728889761577, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889762201, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889762201, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889762208, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889762264, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889762284, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889762285, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889762290, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728889762290, "event_type": "POINT_IN_TIME", "key": "global_batch_size", "value": 81920, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 133}}
:::MLLOG {"namespace": "", "time_ms": 1728889762290, "event_type": "POINT_IN_TIME", "key": "opt_name", "value": "adam", "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 134}}
:::MLLOG {"namespace": "", "time_ms": 1728889762291, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.0019, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 136}}
:::MLLOG {"namespace": "", "time_ms": 1728889762291, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_steps", "value": 750, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 137}}
:::MLLOG {"namespace": "", "time_ms": 1728889762291, "event_type": "POINT_IN_TIME", "key": "max_sequence_length", "value": 64, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 139, "method": "discard"}}
:::MLLOG {"namespace": "", "time_ms": 1728889762291, "event_type": "POINT_IN_TIME", "key": "opt_adam_beta_1", "value": 0.9, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 140}}
:::MLLOG {"namespace": "", "time_ms": 1728889762291, "event_type": "POINT_IN_TIME", "key": "opt_adam_beta_2", "value": 0.98, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 141}}
:::MLLOG {"namespace": "", "time_ms": 1728889762291, "event_type": "POINT_IN_TIME", "key": "opt_adam_epsilon", "value": 1e-09, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 142}}
:::MLLOG {"namespace": "", "time_ms": 1728889762291, "event_type": "POINT_IN_TIME", "key": "seed", "value": 17315, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 143}}
:::MLLOG {"namespace": "", "time_ms": 1728889771351, "event_type": "INTERVAL_END", "key": "init_stop", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 212}}
:::MLLOG {"namespace": "", "time_ms": 1728889771352, "event_type": "INTERVAL_START", "key": "run_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 214}}
:::MLLOG {"namespace": "", "time_ms": 1728889771904, "event_type": "POINT_IN_TIME", "key": "train_samples", "value": 4590101, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 222}}
:::MLLOG {"namespace": "", "time_ms": 1728889771904, "event_type": "POINT_IN_TIME", "key": "eval_samples", "value": 3003, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 225}}
:::MLLOG {"namespace": "", "time_ms": 1728889773125, "event_type": "INTERVAL_START", "key": "block_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 255, "first_epoch_num": 1, "epoch_count": 1}}
:::MLLOG {"namespace": "", "time_ms": 1728889773126, "event_type": "INTERVAL_START", "key": "epoch_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 258, "epoch_num": 1}}
:::MLLOG {"namespace": "", "time_ms": 1728890366314, "event_type": "INTERVAL_END", "key": "epoch_stop", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 273, "epoch_num": 1}}
:::MLLOG {"namespace": "", "time_ms": 1728890366315, "event_type": "INTERVAL_START", "key": "eval_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 640, "epoch_num": 1}}
:::MLLOG {"namespace": "", "time_ms": 1728890627512, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728890627674, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728890627712, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728890627725, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728890628433, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728890628454, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728890628462, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728890628464, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 116}}
:::MLLOG {"namespace": "", "time_ms": 1728890628464, "event_type": "POINT_IN_TIME", "key": "global_batch_size", "value": 81920, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 133}}
:::MLLOG {"namespace": "", "time_ms": 1728890628464, "event_type": "POINT_IN_TIME", "key": "opt_name", "value": "adam", "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 134}}
:::MLLOG {"namespace": "", "time_ms": 1728890628465, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.0019, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 136}}
:::MLLOG {"namespace": "", "time_ms": 1728890628465, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_steps", "value": 750, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 137}}
:::MLLOG {"namespace": "", "time_ms": 1728890628465, "event_type": "POINT_IN_TIME", "key": "max_sequence_length", "value": 64, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 139, "method": "discard"}}
:::MLLOG {"namespace": "", "time_ms": 1728890628465, "event_type": "POINT_IN_TIME", "key": "opt_adam_beta_1", "value": 0.9, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 140}}
:::MLLOG {"namespace": "", "time_ms": 1728890628465, "event_type": "POINT_IN_TIME", "key": "opt_adam_beta_2", "value": 0.98, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 141}}
:::MLLOG {"namespace": "", "time_ms": 1728890628465, "event_type": "POINT_IN_TIME", "key": "opt_adam_epsilon", "value": 1e-09, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 142}}
:::MLLOG {"namespace": "", "time_ms": 1728890628465, "event_type": "POINT_IN_TIME", "key": "seed", "value": 9431, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 143}}
:::MLLOG {"namespace": "", "time_ms": 1728890637403, "event_type": "INTERVAL_END", "key": "init_stop", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 212}}
:::MLLOG {"namespace": "", "time_ms": 1728890637404, "event_type": "INTERVAL_START", "key": "run_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 214}}
:::MLLOG {"namespace": "", "time_ms": 1728890637971, "event_type": "POINT_IN_TIME", "key": "train_samples", "value": 4590101, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 222}}
:::MLLOG {"namespace": "", "time_ms": 1728890637971, "event_type": "POINT_IN_TIME", "key": "eval_samples", "value": 3003, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 225}}
:::MLLOG {"namespace": "", "time_ms": 1728890639238, "event_type": "INTERVAL_START", "key": "block_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 255, "first_epoch_num": 1, "epoch_count": 1}}
:::MLLOG {"namespace": "", "time_ms": 1728890639239, "event_type": "INTERVAL_START", "key": "epoch_start", "value": null, "metadata": {"file": "/mnt/fs/user/llama/custom_model/libo_test/mlperf_test/transformer/implementations/pytorch/train.py", "lineno": 258, "epoch_num": 1}}
# Copyright 2018 MLBenchmark Group. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Defines Subtokenizer class to encode and decode strings."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import os
import re
import sys
import unicodedata
import numpy as np
import six
from six.moves import xrange # pylint: disable=redefined-builtin
LUA = "<lua_index_compat>"
PAD = "<pad>_"
PAD_ID = 1
EOS = "<EOS>_"
EOS_ID = 2
UNK = "<bypass_unk>"
RESERVED_TOKENS = [LUA, PAD, EOS, UNK]
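# Illustrative note (added comment, not from the original source): because the
# reserved tokens sit at the front of every vocabulary built in this module,
# their list positions double as their ids, e.g.
#   RESERVED_TOKENS.index(PAD) == 1 == PAD_ID
#   RESERVED_TOKENS.index(EOS) == 2 == EOS_ID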
# Set of characters that will be used in the function _escape_token() (see func
# docstring for more details).
# This set is added to the alphabet list to ensure that all escaped tokens can
# be encoded.
_ESCAPE_CHARS = set(u"\\_u;0123456789")
# Regex for the function _unescape_token(), the inverse of _escape_token().
# This is used to find "\u", "\\", and "\###;" substrings in the token.
_UNESCAPE_REGEX = re.compile(r"\\u|\\\\|\\([0-9]+);")
_UNDEFINED_UNICODE = u"\u3013"
# Set contains all letter and number characters.
_ALPHANUMERIC_CHAR_SET = set(
six.unichr(i) for i in xrange(sys.maxunicode)
if (unicodedata.category(six.unichr(i)).startswith("L") or
unicodedata.category(six.unichr(i)).startswith("N")))
# min_count is the minimum number of times a subtoken must appear in the data
# before it is added to the vocabulary. The value is found using binary
# search to obtain the target vocabulary size.
_MIN_MIN_COUNT = 1 # min value to use when binary searching for min_count
_MAX_MIN_COUNT = 1000 # max value to use when binary searching for min_count
class Subtokenizer(object):
"""Encodes and decodes strings to/from integer IDs."""
def __init__(self, vocab_file, reserved_tokens=None):
"""Initializes class, creating a vocab file if data_files is provided."""
print("Initializing Subtokenizer from file %s." % vocab_file)
if reserved_tokens is None:
reserved_tokens = RESERVED_TOKENS
elif reserved_tokens == 'assumed_in_file':
reserved_tokens = []
self.subtoken_list = _load_vocab_file(vocab_file, reserved_tokens)
self.alphabet = _generate_alphabet_dict(self.subtoken_list, reserved_tokens)
self.subtoken_to_id_dict = _list_to_index_dict(self.subtoken_list)
self.max_subtoken_length = 0
for subtoken in self.subtoken_list:
self.max_subtoken_length = max(self.max_subtoken_length, len(subtoken))
# Create cache to speed up subtokenization
self._cache_size = 2 ** 20
self._cache = [(None, None)] * self._cache_size
@staticmethod
def init_from_files(
vocab_file, files, target_vocab_size, threshold, min_count=None,
file_byte_limit=1e6, reserved_tokens=None):
"""Create subtoken vocabulary based on files, and save vocab to file.
Args:
vocab_file: String name of vocab file to store subtoken vocabulary.
files: List of file paths that will be used to generate vocabulary.
target_vocab_size: target vocabulary size to generate.
threshold: int threshold of vocabulary size to accept.
min_count: int minimum count to use for generating the vocabulary. The min
count is the minimum number of times a subtoken should appear in the
files before it is added to the vocabulary. If set to none, this value
is found using binary search.
file_byte_limit: (Default 1e6) Maximum number of bytes of sample text that
will be drawn from the files.
reserved_tokens: List of string tokens that are guaranteed to be at the
beginning of the subtoken vocabulary list.
Returns:
Subtokenizer object
"""
if reserved_tokens is None:
reserved_tokens = RESERVED_TOKENS
if os.path.exists(vocab_file):
print("Vocab file already exists (%s)" % vocab_file)
else:
print("Begin steps to create subtoken vocabulary...")
token_counts = _count_tokens(files, file_byte_limit)
alphabet = _generate_alphabet_dict(token_counts)
subtoken_list = _generate_subtokens_with_target_vocab_size(
token_counts, alphabet, target_vocab_size, threshold, min_count,
reserved_tokens)
print("Generated vocabulary with %d subtokens." %
len(subtoken_list))
_save_vocab_file(vocab_file, subtoken_list)
return Subtokenizer(vocab_file)
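# Minimal usage sketch (hypothetical file names, added for illustration only):
#   subtokenizer = Subtokenizer.init_from_files(
#       "vocab.ende.32768", ["train.en", "train.de"],
#       target_vocab_size=32768, threshold=327)
# If "vocab.ende.32768" already exists it is reused; otherwise token counts
# from the listed files drive the subtoken generation functions below.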
@staticmethod
def init_from_existing_vocab_file(
vocab_file, files, target_vocab_size, threshold, min_count=None,
file_byte_limit=1e6, reserved_tokens=None):
"""Create subtoken vocabulary based on files, and save vocab to file.
Args:
vocab_file: String name of vocab file to store subtoken vocabulary.
files: List of file paths that will be used to generate vocabulary.
target_vocab_size: target vocabulary size to generate.
threshold: int threshold of vocabulary size to accept.
min_count: int minimum count to use for generating the vocabulary. The min
count is the minimum number of times a subtoken should appear in the
files before it is added to the vocabulary. If set to none, this value
is found using binary search.
file_byte_limit: (Default 1e6) Maximum number of bytes of sample text that
will be drawn from the files.
reserved_tokens: List of string tokens that are guaranteed to be at the
beginning of the subtoken vocabulary list.
Returns:
Subtokenizer object
"""
if os.path.exists(vocab_file):
print("Vocab file exists (%s)" % vocab_file)
else:
print("Vocab file does not exist (%s)" % vocab_file)
return Subtokenizer(vocab_file, reserved_tokens='assumed_in_file')
def encode(self, raw_string, add_eos=False):
"""Encodes a string into a list of int subtoken ids."""
ret = []
tokens = _split_string_to_tokens(_native_to_unicode(raw_string))
for token in tokens:
ret.extend(self._token_to_subtoken_ids(token))
if add_eos:
ret.append(EOS_ID)
return ret
def _token_to_subtoken_ids(self, token):
"""Encode a single token into a list of subtoken ids."""
cache_location = hash(token) % self._cache_size
cache_key, cache_value = self._cache[cache_location]
if cache_key == token:
return cache_value
ret = _split_token_to_subtokens(
_escape_token(token, self.alphabet), self.subtoken_to_id_dict,
self.max_subtoken_length)
ret = [self.subtoken_to_id_dict[subtoken] for subtoken in ret]
self._cache[cache_location] = (token, ret)
return ret
def decode(self, subtokens):
"""Converts list of int subtokens ids into a string."""
if isinstance(subtokens, np.ndarray):
# Note that list(subtokens) converts subtokens to a python list, but the
# items remain as np.int32. This converts both the array and its items.
subtokens = subtokens.tolist()
if not subtokens:
return ""
assert isinstance(subtokens, list) and isinstance(subtokens[0], int), (
"Subtokens argument passed into decode() must be a list of integers.")
return _unicode_to_native(
_join_tokens_to_string(self._subtoken_ids_to_tokens(subtokens)))
def _subtoken_ids_to_tokens(self, subtokens):
"""Convert list of int subtoken ids to a list of string tokens."""
escaped_tokens = "".join([
self.subtoken_list[s] for s in subtokens
if s < len(self.subtoken_list)])
escaped_tokens = escaped_tokens.split("_")
# All tokens in the vocabulary list have been escaped (see _escape_token())
# so each token must be unescaped when decoding.
ret = []
for token in escaped_tokens:
if token:
ret.append(_unescape_token(token))
return ret
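# Rough end-to-end sketch of encode/decode (vocab path assumed, for illustration):
#   subtokenizer = Subtokenizer("vocab.ende.32768")
#   ids = subtokenizer.encode("Hello world", add_eos=True)  # last id is EOS_ID
#   text = subtokenizer.decode(ids[:-1])                    # drop EOS before decoding
#   # text == "Hello world" whenever the input is covered by the vocab alphabet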
def _save_vocab_file(vocab_file, subtoken_list):
"""Save subtokens to file."""
with open(vocab_file, mode='w', newline='\n') as f:
for subtoken in subtoken_list:
f.write("'%s'\n" % _unicode_to_native(subtoken))
def _load_vocab_file(vocab_file, reserved_tokens=None):
"""Load vocabulary while ensuring reserved tokens are at the top."""
if reserved_tokens is None:
reserved_tokens = RESERVED_TOKENS
subtoken_list = []
with open(vocab_file, mode='r', newline='\n') as f:
for line in f:
subtoken = _native_to_unicode(line.strip())
subtoken = subtoken[1:-1] # Remove surrounding single-quotes
if subtoken in reserved_tokens:
continue
subtoken_list.append(_native_to_unicode(subtoken))
return reserved_tokens + subtoken_list
def _native_to_unicode(s):
"""Convert string to unicode (required in Python 2)."""
if six.PY2:
return s if isinstance(s, unicode) else s.decode("utf-8")
else:
return s
def _unicode_to_native(s):
"""Convert string from unicode to native format (required in Python 2)."""
if six.PY2:
return s.encode("utf-8") if isinstance(s, unicode) else s
else:
return s
def _split_string_to_tokens(text):
"""Splits text to a list of string tokens."""
if not text:
return []
ret = []
token_start = 0
# Classify each character in the input string
is_alnum = [c in _ALPHANUMERIC_CHAR_SET for c in text]
for pos in xrange(1, len(text)):
if is_alnum[pos] != is_alnum[pos - 1]:
token = text[token_start:pos]
if token != u" " or token_start == 0:
ret.append(token)
token_start = pos
final_token = text[token_start:]
ret.append(final_token)
return ret
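# Worked example of the splitting rule (added for clarity; values assumed):
#   _split_string_to_tokens(u"Hello, world!") -> [u"Hello", u", ", u"world", u"!"]
#   _split_string_to_tokens(u"Hello world")   -> [u"Hello", u"world"]
# Tokens break at every alphanumeric/non-alphanumeric boundary, and a chunk that
# is exactly one space is dropped (it is re-inserted when joining).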
def _join_tokens_to_string(tokens):
"""Join a list of string tokens into a single string."""
token_is_alnum = [t[0] in _ALPHANUMERIC_CHAR_SET for t in tokens]
ret = []
for i, token in enumerate(tokens):
if i > 0 and token_is_alnum[i - 1] and token_is_alnum[i]:
ret.append(u" ")
ret.append(token)
return "".join(ret)
def _escape_token(token, alphabet):
r"""Replace characters that aren't in the alphabet and append "_" to token.
Apply three transformations to the token:
1. Replace underscore character "_" with "\u", and backslash "\" with "\\".
2. Replace characters outside of the alphabet with "\###;", where ### is the
character's Unicode code point.
3. Appends "_" to mark the end of a token.
Args:
token: unicode string to be escaped
alphabet: list of all known characters
Returns:
escaped string
"""
token = token.replace(u"\\", u"\\\\").replace(u"_", u"\\u")
ret = [c if c in alphabet and c != u"\n" else r"\%d;" % ord(c) for c in token]
return u"".join(ret) + "_"
def _unescape_token(token):
r"""Replaces escaped characters in the token with their unescaped versions.
Applies inverse transformations as _escape_token():
1. Replace "\u" with "_", and "\\" with "\".
2. Replace "\###;" with the unicode character the ### refers to.
Args:
token: escaped string
Returns:
unescaped string
"""
def match(m):
r"""Returns replacement string for matched object.
Matched objects contain one of the strings that matches the regex pattern:
r"\\u|\\\\|\\([0-9]+);"
The strings can be '\u', '\\', or '\###;' (### is any digit number).
m.group(0) refers to the entire matched string ('\u', '\\', or '\###;').
m.group(1) refers to the first parenthesized subgroup ('###').
m.group(0) exists for all match objects, while m.group(1) exists only for
the string '\###;'.
This function looks to see if m.group(1) exists. If it doesn't, then the
matched string must be '\u' or '\\'. In this case, the corresponding
replacement ('_' or '\') is returned. Note that in Python, a single
backslash is written as '\\', and a double backslash as '\\\\'.
If m.group(1) exists, then use the integer in m.group(1) to return a
unicode character.
Args:
m: match object
Returns:
String to replace matched object with.
"""
# Check if the matched strings are '\u' or '\\'.
if m.group(1) is None:
return u"_" if m.group(0) == u"\\u" else u"\\"
# If m.group(1) exists, try and return unicode character.
try:
return six.unichr(int(m.group(1)))
except (ValueError, OverflowError) as _:
return _UNDEFINED_UNICODE
# Use match function to replace escaped substrings in the token.
return _UNESCAPE_REGEX.sub(match, token)
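# Assumed inverse example: _unescape_token(u"a\\ub\\233;") -> u"a_b\u00e9"
# (the trailing "_" added by _escape_token is stripped by the caller before
# unescaping; see _subtoken_ids_to_tokens above).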
def _count_tokens(files, file_byte_limit=1e6):
"""Return token counts of words in the files.
Samples file_byte_limit bytes from each file, and counts the words that appear
in the samples. The samples are semi-evenly distributed across the file.
Args:
files: List of filepaths
file_byte_limit: Max number of bytes that will be read from each file.
Returns:
Dictionary mapping tokens to the number of times they appear in the sampled
lines from the files.
"""
token_counts = collections.defaultdict(int)
for filepath in files:
with open(filepath, mode='r', newline='\n') as reader:
file_byte_budget = file_byte_limit
counter = 0
# Plain file handles have no size() method; use the file size on disk instead.
lines_to_skip = int(os.path.getsize(filepath) / (file_byte_budget * 2))
for line in reader:
if counter < lines_to_skip:
counter += 1
else:
if file_byte_budget < 0:
break
line = line.strip()
file_byte_budget -= len(line)
counter = 0
# Add words to token counts
for token in _split_string_to_tokens(_native_to_unicode(line)):
token_counts[token] += 1
return token_counts
def _list_to_index_dict(lst):
"""Create dictionary mapping list items to their indices in the list."""
return {item: n for n, item in enumerate(lst)}
def _split_token_to_subtokens(token, subtoken_dict, max_subtoken_length):
"""Splits a token into subtokens defined in the subtoken dict."""
ret = []
start = 0
token_len = len(token)
while start < token_len:
# Find the longest subtoken, so iterate backwards.
for end in xrange(min(token_len, start + max_subtoken_length), start, -1):
subtoken = token[start:end]
if subtoken in subtoken_dict:
ret.append(subtoken)
start = end
break
else: # Did not break
# If there is no possible encoding of the escaped token then one of the
# characters in the token is not in the alphabet. This should be
# impossible and would be indicative of a bug.
raise ValueError("Was unable to split token \"%s\" into subtokens." %
token)
return ret
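# Greedy longest-match illustration (dictionary invented for this example):
#   subtoken_dict = {u"hell": 0, u"he": 1, u"o_": 2, u"h": 3, u"o": 4, u"_": 5}
#   _split_token_to_subtokens(u"hello_", subtoken_dict, 4) -> [u"hell", u"o_"]
# At each position the longest matching dictionary entry wins before shorter
# candidates are tried.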
def _generate_subtokens_with_target_vocab_size(
token_counts, alphabet, target_size, threshold, min_count=None,
reserved_tokens=None):
"""Generate subtoken vocabulary close to the target size."""
if reserved_tokens is None:
reserved_tokens = RESERVED_TOKENS
if min_count is not None:
print("Using min_count=%d to generate vocab with target size %d" %
(min_count, target_size))
return _generate_subtokens(
token_counts, alphabet, min_count, reserved_tokens=reserved_tokens)
def bisect(min_val, max_val):
"""Recursive function to binary search for subtoken vocabulary."""
cur_count = (min_val + max_val) // 2
print("Binary search: trying min_count=%d (%d %d)" %
(cur_count, min_val, max_val))
subtoken_list = _generate_subtokens(
token_counts, alphabet, cur_count, reserved_tokens=reserved_tokens)
val = len(subtoken_list)
print("Binary search: min_count=%d resulted in %d tokens" %
(cur_count, val))
within_threshold = abs(val - target_size) < threshold
if within_threshold or min_val >= max_val or cur_count < 2:
return subtoken_list
if val > target_size:
other_subtoken_list = bisect(cur_count + 1, max_val)
else:
other_subtoken_list = bisect(min_val, cur_count - 1)
# Return vocabulary dictionary with the closest number of tokens.
other_val = len(other_subtoken_list)
if abs(other_val - target_size) < abs(val - target_size):
return other_subtoken_list
return subtoken_list
print("Finding best min_count to get target size of %d" %
target_size)
return bisect(_MIN_MIN_COUNT, _MAX_MIN_COUNT)
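# Intuition for the search direction (illustrative, added here): a larger
# min_count filters out more subtokens, so the vocabulary shrinks as min_count
# grows. If a trial min_count yields more subtokens than target_size, the
# search moves to larger counts; if it yields fewer, it moves to smaller
# counts, stopping once the vocabulary size is within threshold of the target.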
def _generate_alphabet_dict(iterable, reserved_tokens=None):
"""Create set of characters that appear in any element in the iterable."""
if reserved_tokens is None:
reserved_tokens = RESERVED_TOKENS
elif reserved_tokens == 'assumed_in_file':
reserved_tokens = []
alphabet = {c for token in iterable for c in token}
alphabet |= {c for token in reserved_tokens for c in token}
alphabet |= _ESCAPE_CHARS # Add escape characters to alphabet set.
return alphabet
def _count_and_gen_subtokens(
token_counts, alphabet, subtoken_dict, max_subtoken_length):
"""Count number of times subtokens appear, and generate new subtokens.
Args:
token_counts: dict mapping tokens to the number of times they appear in the
original files.
alphabet: list of allowed characters. Used to escape the tokens, which
guarantees that all tokens can be split into subtokens.
subtoken_dict: dict mapping subtokens to ids.
max_subtoken_length: maximum length of subtoken in subtoken_dict.
Returns:
A defaultdict mapping subtokens to the number of times they appear in the
tokens. The dict may contain new subtokens.
"""
subtoken_counts = collections.defaultdict(int)
for token, count in six.iteritems(token_counts):
token = _escape_token(token, alphabet)
subtokens = _split_token_to_subtokens(
token, subtoken_dict, max_subtoken_length)
# Generate new subtokens by taking substrings from token.
start = 0
for subtoken in subtokens:
for end in xrange(start + 1, len(token) + 1):
new_subtoken = token[start:end]
subtoken_counts[new_subtoken] += count
start += len(subtoken)
return subtoken_counts
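# Small worked example (inputs assumed): with token_counts={u"abc": 5}, an
# alphabet covering a/b/c, a single-character subtoken_dict and
# max_subtoken_length=1, the escaped token is u"abc_" and every substring that
# starts at a current subtoken boundary is credited with count 5:
#   {u"a": 5, u"ab": 5, u"abc": 5, u"abc_": 5, u"b": 5, u"bc": 5, u"bc_": 5,
#    u"c": 5, u"c_": 5, u"_": 5}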
def _filter_and_bucket_subtokens(subtoken_counts, min_count):
"""Return a bucketed list of subtokens that are filtered by count.
Args:
subtoken_counts: defaultdict mapping subtokens to their counts
min_count: int count used to filter subtokens
Returns:
List of subtoken sets, where subtokens in set i have the same length=i.
"""
# Create list of buckets, where subtokens in bucket i have length i.
subtoken_buckets = []
for subtoken, count in six.iteritems(subtoken_counts):
if count < min_count: # Filter out subtokens that don't appear enough
continue
while len(subtoken_buckets) <= len(subtoken):
subtoken_buckets.append(set())
subtoken_buckets[len(subtoken)].add(subtoken)
return subtoken_buckets
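# Bucketing sketch (counts invented for illustration), with min_count=2:
#   _filter_and_bucket_subtokens({u"a": 10, u"ab": 3, u"abc": 1}, 2)
#     -> [set(), {u"a"}, {u"ab"}]
# Index i holds the surviving subtokens of length i; u"abc" is dropped because
# its count is below min_count.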
def _gen_new_subtoken_list(
subtoken_counts, min_count, alphabet, reserved_tokens=None):
"""Generate candidate subtokens ordered by count, and new max subtoken length.
Add subtokens to the candidate list in order of length (longest subtokens
first). When a subtoken is added, the counts of each of its prefixes are
decreased. Prefixes that don't appear much outside the subtoken are not added
to the candidate list.
For example:
subtoken being added to candidate list: 'translate'
subtoken_counts: {'translate':10, 't':40, 'tr':16, 'tra':12, ...}
min_count: 5
When 'translate' is added, subtoken_counts is updated to:
{'translate':0, 't':30, 'tr':6, 'tra': 2, ...}
The subtoken 'tra' will not be added to the candidate list, because it appears
twice (less than min_count) outside of 'translate'.
Args:
subtoken_counts: defaultdict mapping str subtokens to int counts
min_count: int minimum count requirement for subtokens
alphabet: set of characters. Each character is added to the subtoken list to
guarantee that all tokens can be encoded.
reserved_tokens: list of tokens that will be added to the beginning of the
returned subtoken list.
Returns:
List of candidate subtokens in decreasing count order, and maximum subtoken
length
"""
if reserved_tokens is None:
reserved_tokens = RESERVED_TOKENS
# Create a list of (count, subtoken) for each candidate subtoken.
subtoken_candidates = []
# Use bucketted list to iterate through subtokens in order of length.
# subtoken_buckets[i] = set(subtokens), where each subtoken has length i.
subtoken_buckets = _filter_and_bucket_subtokens(subtoken_counts, min_count)
max_subtoken_length = len(subtoken_buckets) - 1
# Go through the list in reverse order to consider longer subtokens first.
for subtoken_len in xrange(max_subtoken_length, 0, -1):
for subtoken in subtoken_buckets[subtoken_len]:
count = subtoken_counts[subtoken]
# Possible if this subtoken is a prefix of another token.
if count < min_count:
continue
# Ignore alphabet/reserved tokens, which will be added manually later.
if subtoken not in alphabet and subtoken not in reserved_tokens:
subtoken_candidates.append((count, subtoken))
# Decrement count of the subtoken's prefixes (if a longer subtoken is
# added, its prefixes lose priority to be added).
for end in xrange(1, subtoken_len):
subtoken_counts[subtoken[:end]] -= count
# Add alphabet subtokens (guarantees that all strings are encodable).
subtoken_candidates.extend((subtoken_counts.get(a, 0), a) for a in alphabet)
# Order subtoken candidates by decreasing count.
subtoken_list = [t for _, t in sorted(subtoken_candidates, reverse=True)]
# Add reserved tokens to beginning of the list.
subtoken_list = reserved_tokens + subtoken_list
return subtoken_list, max_subtoken_length
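# Note on the final ordering (observation added here, not from the source):
# candidates are sorted as (count, subtoken) tuples in reverse, so subtokens
# with equal counts fall back to reverse lexicographic order of the string.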
def _generate_subtokens(
token_counts, alphabet, min_count, num_iterations=4,
reserved_tokens=None):
"""Create a list of subtokens in decreasing order of frequency.
Args:
token_counts: dict mapping str tokens -> int count
alphabet: set of characters
min_count: int minimum number of times a subtoken must appear before it is
added to the vocabulary.
num_iterations: int number of iterations to generate new tokens.
reserved_tokens: list of tokens that will be added to the beginning of the
returned subtoken list.
Returns:
Sorted list of subtokens (most frequent first)
"""
if reserved_tokens is None:
reserved_tokens = RESERVED_TOKENS
# Use alphabet set to create initial list of subtokens
subtoken_list = reserved_tokens + list(alphabet)
max_subtoken_length = 1
# On each iteration, segment all words using the subtokens defined in
# subtoken_dict, count how often the resulting subtokens appear, and update
# the dictionary with subtokens w/ high enough counts.
for i in xrange(num_iterations):
print("\tGenerating subtokens: iteration %d" % i)
# Generate new subtoken->id dictionary using the new subtoken list.
subtoken_dict = _list_to_index_dict(subtoken_list)
# Create dict mapping subtoken->count, with additional subtokens created
# from substrings taken from the tokens.
subtoken_counts = _count_and_gen_subtokens(
token_counts, alphabet, subtoken_dict, max_subtoken_length)
# Generate new list of subtokens sorted by subtoken count.
subtoken_list, max_subtoken_length = _gen_new_subtoken_list(
subtoken_counts, min_count, alphabet, reserved_tokens)
print("\tVocab size: %d" % len(subtoken_list))
return subtoken_list