# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization classes for Transformer XL model.
    Adapted from https://github.com/kimiyoung/transformer-xl.
"""
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import glob
import logging
import os
import sys
from collections import Counter, OrderedDict
from io import open

import torch
import numpy as np

from .file_utils import cached_path
from .tokenization_utils import PreTrainedTokenizer

# Python 2 ships the C-accelerated pickle under a different module name.
if sys.version_info[0] == 2:
    import cPickle as pickle
else:
    import pickle


logger = logging.getLogger(__name__)

# File names used for the serialized (torch.save) vocabulary and the plain-text one.
VOCAB_FILES_NAMES = {'pretrained_vocab_file': 'vocab.bin', 'vocab_file': 'vocab.txt'}

# Shortcut name -> URL of the serialized vocabulary.
PRETRAINED_VOCAB_FILES_MAP = {
    'pretrained_vocab_file':
    {
        'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin",
    }
}

# Shortcut name -> maximum input size (None: no limit recorded here).
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'transfo-xl-wt103': None,
}

# Shortcut name -> URL of the pre-processed corpus archive.
PRETRAINED_CORPUS_ARCHIVE_MAP = {
    'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-corpus.bin",
}
# File name looked up inside a directory when loading a corpus from a local path.
CORPUS_NAME = 'corpus.bin'

class TransfoXLTokenizer(PreTrainedTokenizer):
    """
    Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl

    The vocabulary lives in two attributes that are created by
    :meth:`build_vocab` (or restored from a serialized vocabulary):
    ``idx2sym`` (list mapping id -> symbol) and ``sym2idx`` (OrderedDict
    mapping symbol -> id).
    """
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self, special=None, min_freq=0, max_size=None, lower_case=False,
                 delimiter=None, vocab_file=None, pretrained_vocab_file=None,
                 never_split=None, unk_token="<unk>", eos_token="<eos>",
                 additional_special_tokens=["<formula>"], **kwargs):
        """
        Args:
            special: list of symbols registered first via :meth:`add_special`
                when the vocabulary is built from counts (defaults to ``[]``).
            min_freq: symbols counted fewer times than this are left out of
                a count-based vocabulary.
            max_size: upper bound on the number of counted symbols kept
                (``None`` keeps all of them).
            lower_case: lower-case every line before splitting it.
            delimiter: separator handed to ``str.split``; ``None`` splits on
                whitespace, ``''`` splits into single characters.
            vocab_file: plain-text vocabulary, one symbol per line (first
                whitespace-separated field). Triggers :meth:`build_vocab`.
            pretrained_vocab_file: vocabulary serialized with ``torch.save``;
                its entries are copied onto this instance.
            never_split: defaults to ``self.all_special_tokens``.
            unk_token, eos_token, additional_special_tokens, **kwargs:
                forwarded to the parent constructor.
        """
        super(TransfoXLTokenizer, self).__init__(unk_token=unk_token, eos_token=eos_token,
                                                 additional_special_tokens=additional_special_tokens,
                                                 **kwargs)

        self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
        self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens

        if never_split is None:
            never_split = self.all_special_tokens
        if special is None:
            special = []
        self.counter = Counter()
        self.special = special
        self.min_freq = min_freq
        self.max_size = max_size
        self.lower_case = lower_case
        self.delimiter = delimiter
        self.vocab_file = vocab_file
        self.never_split = never_split

        if pretrained_vocab_file is not None:
            # Hack because, honestly this tokenizer was not made to be used
            # in a library like ours, at all.
            # NOTE(security): torch.load unpickles the file; only load
            # vocabularies that come from a trusted source.
            vocab_dict = torch.load(pretrained_vocab_file)
            for key, value in vocab_dict.items():
                self.__dict__[key] = value

        if vocab_file is not None:
            self.build_vocab()

    def count_file(self, path, verbose=False, add_eos=False):
        """Tokenize every line of ``path``, add the symbols to ``self.counter``
        and return the list of tokenized lines."""
        if verbose: logger.info('counting file {} ...'.format(path))
        assert os.path.exists(path)

        sents = []
        with open(path, 'r', encoding='utf-8') as f:
            for idx, line in enumerate(f):
                if verbose and idx > 0 and idx % 500000 == 0:
                    logger.info('    line {}'.format(idx))
                symbols = self.tokenize(line, add_eos=add_eos)
                self.counter.update(symbols)
                sents.append(symbols)

        return sents

    def count_sents(self, sents, verbose=False):
        """
            sents : a list of sentences, each a list of tokenized symbols
        """
        if verbose: logger.info('counting {} sents ...'.format(len(sents)))
        for idx, symbols in enumerate(sents):
            if verbose and idx > 0 and idx % 500000 == 0:
                logger.info('    line {}'.format(idx))
            self.counter.update(symbols)

    def _build_from_file(self, vocab_file):
        """Rebuild ``idx2sym``/``sym2idx`` from a text file and set ``unk_idx``.

        The first whitespace-separated field of each line is the symbol.

        Raises:
            ValueError: if neither ``<UNK>`` nor ``<unk>`` is in the file.
        """
        self.idx2sym = []
        self.sym2idx = OrderedDict()

        with open(vocab_file, 'r', encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split()
                if not fields:
                    # A blank line carries no symbol; indexing [0] would fail.
                    continue
                self.add_symbol(fields[0])
        if '<UNK>' in self.sym2idx:
            self.unk_idx = self.sym2idx['<UNK>']
        elif '<unk>' in self.sym2idx:
            self.unk_idx = self.sym2idx['<unk>']
        else:
            raise ValueError('No <unknown> token in vocabulary')

    def save_vocabulary(self, vocab_path):
        """Save the tokenizer vocabulary to a directory or file.

        If ``vocab_path`` is a directory the file is written inside it under
        the default serialized-vocabulary name; otherwise ``vocab_path`` is
        used as the file path itself.

        Returns:
            A one-element tuple holding the path that was written.
        """
        if os.path.isdir(vocab_path):
            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['pretrained_vocab_file'])
        else:
            vocab_file = vocab_path
        torch.save(self.__dict__, vocab_file)
        return (vocab_file,)

    def build_vocab(self):
        """Build the vocabulary from ``self.vocab_file`` if it is set,
        otherwise from ``self.special`` plus the symbols in ``self.counter``."""
        if self.vocab_file:
            logger.info('building vocab from {}'.format(self.vocab_file))
            self._build_from_file(self.vocab_file)
            logger.info('final vocab size {}'.format(len(self)))
        else:
            logger.info('building vocab with min_freq={}, max_size={}'.format(
                self.min_freq, self.max_size))
            self.idx2sym = []
            self.sym2idx = OrderedDict()

            for sym in self.special:
                self.add_special(sym)

            # most_common() is sorted by decreasing count, so the first symbol
            # below min_freq ends the loop.
            for sym, cnt in self.counter.most_common(self.max_size):
                if cnt < self.min_freq: break
                self.add_symbol(sym)

            logger.info('final vocab size {} from {} unique tokens'.format(
                len(self), len(self.counter)))

    def encode_file(self, path, ordered=False, verbose=False, add_eos=True,
            add_double_eos=False):
        """Tokenize ``path`` line by line and convert each line to a LongTensor.

        Returns a list with one tensor per line, or a single concatenated
        tensor when ``ordered`` is true.
        """
        if verbose: logger.info('encoding file {} ...'.format(path))
        assert os.path.exists(path)
        encoded = []
        with open(path, 'r', encoding='utf-8') as f:
            for idx, line in enumerate(f):
                if verbose and idx > 0 and idx % 500000 == 0:
                    logger.info('    line {}'.format(idx))
                symbols = self.tokenize(line, add_eos=add_eos,
                    add_double_eos=add_double_eos)
                encoded.append(self.convert_to_tensor(symbols))

        if ordered:
            encoded = torch.cat(encoded)

        return encoded

    def encode_sents(self, sents, ordered=False, verbose=False):
        """Convert already-tokenized sentences to LongTensors.

        Returns a list with one tensor per sentence, or a single concatenated
        tensor when ``ordered`` is true.
        """
        if verbose: logger.info('encoding {} sents ...'.format(len(sents)))
        encoded = []
        for idx, symbols in enumerate(sents):
            if verbose and idx > 0 and idx % 500000 == 0:
                logger.info('    line {}'.format(idx))
            encoded.append(self.convert_to_tensor(symbols))

        if ordered:
            encoded = torch.cat(encoded)

        return encoded

    def add_special(self, sym):
        """Add ``sym`` to the vocabulary if absent and expose its id as the
        attribute ``<name>_idx`` (``sym`` with surrounding ``<``/``>`` removed)."""
        if sym not in self.sym2idx:
            self.idx2sym.append(sym)
            self.sym2idx[sym] = len(self.idx2sym) - 1
            setattr(self, '{}_idx'.format(sym.strip('<>')), self.sym2idx[sym])

    def add_symbol(self, sym):
        """Append ``sym`` to the vocabulary unless it is already present."""
        if sym not in self.sym2idx:
            self.idx2sym.append(sym)
            self.sym2idx[sym] = len(self.idx2sym) - 1

    def _convert_id_to_token(self, idx):
        """Converts an id in a token (BPE) using the vocab."""
        assert 0 <= idx < len(self), 'Index {} out of vocabulary range'.format(idx)
        return self.idx2sym[idx]

    def _convert_token_to_id(self, sym):
        """ Converts a token (str/unicode) in an id using the vocab.

        Unknown tokens map to the unknown-token id.

        Raises:
            ValueError: if the token is unknown and the vocabulary has no
                unknown token to fall back on.
        """
        if sym in self.sym2idx:
            return self.sym2idx[sym]

        # logger.info('encounter unk {}'.format(sym))
        # assert '<eos>' not in sym
        if hasattr(self, 'unk_idx'):
            return self.unk_idx
        # Backward compatibility with pre-trained models
        elif '<unk>' in self.sym2idx:
            return self.sym2idx['<unk>']
        elif '<UNK>' in self.sym2idx:
            return self.sym2idx['<UNK>']
        else:
            raise ValueError('Token not in vocabulary and no <unk> token in vocabulary for replacement')

    def convert_tokens_to_string(self, tokens):
        """ Converts a sequence of tokens (string) in a single string. """
        out_string = ' '.join(tokens).strip()
        return out_string

    def convert_to_tensor(self, symbols):
        """Return the ids of ``symbols`` as a ``torch.LongTensor``."""
        return torch.LongTensor(self.convert_tokens_to_ids(symbols))

    @property
    def vocab_size(self):
        """Number of symbols currently in the vocabulary."""
        return len(self.idx2sym)

    def _tokenize(self, line, add_eos=False, add_double_eos=False):
        """Split one line of text into a list of symbols.

        Args:
            line: the text to split; surrounding whitespace is stripped.
            add_eos: append ``'<eos>'`` to the result.
            add_double_eos: wrap the result in ``'<S>'`` on both sides (lm1b
                convention); takes precedence over ``add_eos``.
        """
        line = line.strip()
        # convert to lower case
        if self.lower_case:
            line = line.lower()

        # An empty delimiter means character-level splitting. str.split('')
        # raises ValueError, so build the list of characters explicitly; a list
        # (not the raw string) is needed for the concatenations below.
        if self.delimiter == '':
            symbols = list(line)
        else:
            symbols = line.split(self.delimiter)

        if add_double_eos: # lm1b
            return ['<S>'] + symbols + ['<S>']
        elif add_eos:
            return symbols + ['<eos>']
        else:
            return symbols


class LMOrderedIterator(object):
    """Iterate over one ordered token stream in (data, target, seq_len) batches.

    The stream is cut into ``bsz`` equal contiguous parts that are laid out as
    the columns of ``self.data`` (shape ``[n_step, bsz]``); batches are windows
    of consecutive rows, and targets are the same window shifted by one row.
    """

    def __init__(self, data, bsz, bptt, device='cpu', ext_len=None):
        """
            data -- LongTensor -- the LongTensor is strictly ordered
        """
        self.bsz = bsz
        self.bptt = bptt
        self.ext_len = ext_len if ext_len is not None else 0

        self.device = device

        # Work out how cleanly we can divide the dataset into bsz parts.
        self.n_step = data.size(0) // bsz

        # Trim off any extra elements that wouldn't cleanly fit (remainders).
        data = data.narrow(0, 0, self.n_step * bsz)

        # Evenly divide the data across the bsz batches.
        self.data = data.view(bsz, -1).t().contiguous().to(device)

        # Number of mini-batches
        self.n_batch = (self.n_step + self.bptt - 1) // self.bptt

    def get_batch(self, i, bptt=None):
        """Return ``(data, target, seq_len)`` for the window starting at row ``i``.

        ``data`` additionally includes up to ``ext_len`` rows before ``i``.
        Both tensors are returned transposed, i.e. with the batch dimension
        first.
        """
        if bptt is None: bptt = self.bptt
        # The last row has no following row to serve as its target.
        seq_len = min(bptt, self.data.size(0) - 1 - i)

        end_idx = i + seq_len
        beg_idx = max(0, i - self.ext_len)

        data = self.data[beg_idx:end_idx]
        target = self.data[i+1:i+1+seq_len]

        data_out = data.transpose(0, 1).contiguous().to(self.device)
        target_out = target.transpose(0, 1).contiguous().to(self.device)

        return data_out, target_out, seq_len

    def get_fixlen_iter(self, start=0):
        """Yield batches at a fixed stride of ``bptt`` rows, beginning at ``start``."""
        for i in range(start, self.data.size(0) - 1, self.bptt):
            yield self.get_batch(i)

    def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3):
        """Yield batches whose length is drawn at random around ``bptt``.

        The mean is ``bptt`` with probability 0.95 and ``bptt / 2`` otherwise;
        the sampled length is clamped to
        ``[min_len, bptt + max_deviation * std]``.
        """
        max_len = self.bptt + max_deviation * std
        i = start
        while True:
            bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.
            bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std))))
            data, target, seq_len = self.get_batch(i, bptt)
            i += seq_len
            yield data, target, seq_len
            if i >= self.data.size(0) - 2:
                break

    def __iter__(self):
        return self.get_fixlen_iter()


class LMShuffledIterator(object):
    """Iterate over independent sentences, packing them into fixed-size batches.

    Each of the ``bsz`` batch columns consumes sentences from a shared
    sentence stream; iteration stops as soon as the stream runs dry while a
    batch is being filled (that partial batch is discarded).
    """

    def __init__(self, data, bsz, bptt, device='cpu', ext_len=None, shuffle=False):
        """
            data -- list[LongTensor] -- there is no order among the LongTensors
        """
        self.data = data

        self.bsz = bsz
        self.bptt = bptt
        self.ext_len = ext_len if ext_len is not None else 0

        self.device = device
        self.shuffle = shuffle

    def get_sent_stream(self):
        """Yield the sentences of ``self.data``, in random order if ``shuffle``."""
        # index iterator
        epoch_indices = np.random.permutation(len(self.data)) if self.shuffle \
            else np.array(range(len(self.data)))

        # sentence iterator
        for idx in epoch_indices:
            yield self.data[idx]

    def stream_iterator(self, sent_stream):
        """Yield ``(data, target, bptt)`` batches built from ``sent_stream``.

        Both tensors are yielded transposed, i.e. with the batch dimension
        first. Unfilled positions hold ``-1``.
        """
        # streams for each data in the batch
        streams = [None] * self.bsz

        data = torch.LongTensor(self.bptt, self.bsz)
        target = torch.LongTensor(self.bptt, self.bsz)

        n_retain = 0

        while True:
            # data   : [n_retain+bptt x bsz]
            # target : [bptt x bsz]
            data[n_retain:].fill_(-1)
            target.fill_(-1)

            valid_batch = True

            for i in range(self.bsz):
                n_filled = 0
                try:
                    while n_filled < self.bptt:
                        # A stream with a single token left cannot supply an
                        # (input, target) pair, so fetch the next sentence.
                        if streams[i] is None or len(streams[i]) <= 1:
                            streams[i] = next(sent_stream)
                        # number of new tokens to fill in
                        n_new = min(len(streams[i]) - 1, self.bptt - n_filled)
                        # first n_retain tokens are retained from last batch
                        data[n_retain+n_filled:n_retain+n_filled+n_new, i] = \
                            streams[i][:n_new]
                        target[n_filled:n_filled+n_new, i] = \
                            streams[i][1:n_new+1]
                        streams[i] = streams[i][n_new:]
                        n_filled += n_new
                except StopIteration:
                    valid_batch = False
                    break

            if not valid_batch:
                return

            data_out = data.transpose(0, 1).contiguous().to(self.device)
            target_out = target.transpose(0, 1).contiguous().to(self.device)

            yield data_out, target_out, self.bptt

            # Carry the last ext_len rows over as context for the next batch.
            n_retain = min(data.size(0), self.ext_len)
            if n_retain > 0:
                data[:n_retain] = data[-n_retain:]
            data.resize_(n_retain + self.bptt, data.size(1))

    def __iter__(self):
        # sent_stream is an iterator
        sent_stream = self.get_sent_stream()

        for batch in self.stream_iterator(sent_stream):
            yield batch


class LMMultiFileIterator(LMShuffledIterator):
    """Batch iterator that reads its sentences from several files in turn.

    Every file is encoded with ``vocab.encode_file(..., add_double_eos=True)``
    and then batched by the inherited ``stream_iterator``; batches never span
    two files.
    """

    def __init__(self, paths, vocab, bsz, bptt, device='cpu', ext_len=None,
        shuffle=False):
        """Store the file list, the vocabulary and the batching parameters."""
        self.paths = paths
        self.vocab = vocab

        self.bsz = bsz
        self.bptt = bptt
        # A missing extended-context length means "no extra context".
        if ext_len is None:
            ext_len = 0
        self.ext_len = ext_len

        self.device = device
        self.shuffle = shuffle

    def get_sent_stream(self, path):
        """Return an iterator over the encoded sentences of one file,
        shuffled in place first when ``self.shuffle`` is set."""
        encoded_sents = self.vocab.encode_file(path, add_double_eos=True)
        if self.shuffle:
            np.random.shuffle(encoded_sents)
        return iter(encoded_sents)

    def __iter__(self):
        """Yield the batches of every file; the file list itself is shuffled
        in place first when ``self.shuffle`` is set."""
        if self.shuffle:
            np.random.shuffle(self.paths)

        for file_path in self.paths:
            # one sentence iterator per file
            for batch in self.stream_iterator(self.get_sent_stream(file_path)):
                yield batch


438
439
440
441
442
443
444
445
446
447
448
449
450
451
class TransfoXLCorpus(object):
    """A tokenizer (``vocab``) bundled with encoded ``train``/``valid``/``test``
    splits of one dataset."""

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
        """
        Instantiate a pre-processed corpus.

        ``pretrained_model_name_or_path`` is either a key of
        ``PRETRAINED_CORPUS_ARCHIVE_MAP`` or a directory containing
        ``CORPUS_NAME``. Returns ``None`` (after logging an error) when the
        corpus file cannot be resolved.
        """
        vocab = TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        if pretrained_model_name_or_path in PRETRAINED_CORPUS_ARCHIVE_MAP:
            corpus_file = PRETRAINED_CORPUS_ARCHIVE_MAP[pretrained_model_name_or_path]
        else:
            corpus_file = os.path.join(pretrained_model_name_or_path, CORPUS_NAME)
        # redirect to the cache, if necessary
        try:
            resolved_corpus_file = cached_path(corpus_file, cache_dir=cache_dir)
        except EnvironmentError:
            logger.error(
                "Corpus '{}' was not found in corpus list ({}). "
                "We assumed '{}' was a path or url but couldn't find files {} "
                "at this path or url.".format(
                    pretrained_model_name_or_path,
                    ', '.join(PRETRAINED_CORPUS_ARCHIVE_MAP.keys()),
                    pretrained_model_name_or_path,
                    corpus_file))
            return None
        if resolved_corpus_file == corpus_file:
            logger.info("loading corpus file {}".format(corpus_file))
        else:
            logger.info("loading corpus file {} from cache at {}".format(
                corpus_file, resolved_corpus_file))

        # Instantiate tokenizer.
        corpus = cls(*inputs, **kwargs)
        # NOTE(security): torch.load unpickles the file; only load corpora
        # that come from a trusted source.
        corpus_dict = torch.load(resolved_corpus_file)
        for key, value in corpus_dict.items():
            corpus.__dict__[key] = value
        corpus.vocab = vocab
        if corpus.train is not None:
            corpus.train = torch.tensor(corpus.train, dtype=torch.long)
        if corpus.valid is not None:
            corpus.valid = torch.tensor(corpus.valid, dtype=torch.long)
        if corpus.test is not None:
            corpus.test = torch.tensor(corpus.test, dtype=torch.long)
        return corpus

    def __init__(self, *args, **kwargs):
        """Create an empty corpus; all arguments go to ``TransfoXLTokenizer``."""
        self.vocab = TransfoXLTokenizer(*args, **kwargs)
        self.dataset = None
        self.train = None
        self.valid = None
        self.test = None

    def build_corpus(self, path, dataset):
        """Count symbols, build the vocabulary and encode the splits found
        under ``path`` according to the conventions of ``dataset``.

        For ``'lm1b'`` the training split is kept as a list of file paths
        rather than encoded up front.
        """
        self.dataset = dataset

        if self.dataset in ['ptb', 'wt2', 'enwik8', 'text8']:
            self.vocab.count_file(os.path.join(path, 'train.txt'))
            self.vocab.count_file(os.path.join(path, 'valid.txt'))
            self.vocab.count_file(os.path.join(path, 'test.txt'))
        elif self.dataset == 'wt103':
            self.vocab.count_file(os.path.join(path, 'train.txt'))
        elif self.dataset == 'lm1b':
            train_path_pattern = os.path.join(
                path, '1-billion-word-language-modeling-benchmark-r13output',
                'training-monolingual.tokenized.shuffled', 'news.en-*')
            train_paths = glob.glob(train_path_pattern)
            # the vocab will load from file when build_vocab() is called

        self.vocab.build_vocab()

        if self.dataset in ['ptb', 'wt2', 'wt103']:
            self.train = self.vocab.encode_file(
                os.path.join(path, 'train.txt'), ordered=True)
            self.valid = self.vocab.encode_file(
                os.path.join(path, 'valid.txt'), ordered=True)
            self.test = self.vocab.encode_file(
                os.path.join(path, 'test.txt'), ordered=True)
        elif self.dataset in ['enwik8', 'text8']:
            self.train = self.vocab.encode_file(
                os.path.join(path, 'train.txt'), ordered=True, add_eos=False)
            self.valid = self.vocab.encode_file(
                os.path.join(path, 'valid.txt'), ordered=True, add_eos=False)
            self.test = self.vocab.encode_file(
                os.path.join(path, 'test.txt'), ordered=True, add_eos=False)
        elif self.dataset == 'lm1b':
            self.train = train_paths
            self.valid = self.vocab.encode_file(
                os.path.join(path, 'valid.txt'), ordered=False, add_double_eos=True)
            self.test = self.vocab.encode_file(
                os.path.join(path, 'test.txt'), ordered=False, add_double_eos=True)

    def get_iterator(self, split, *args, **kwargs):
        """Return a batch iterator over ``split`` (``'train'``, ``'valid'`` or
        ``'test'``); extra arguments go to the iterator constructor.

        Raises:
            ValueError: if ``split`` or ``self.dataset`` is not one of the
                supported values.
        """
        ordered_datasets = ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']
        if split == 'train':
            if self.dataset in ordered_datasets:
                data_iter = LMOrderedIterator(self.train, *args, **kwargs)
            elif self.dataset == 'lm1b':
                kwargs['shuffle'] = True
                data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs)
            else:
                raise ValueError('Unsupported dataset: {}'.format(self.dataset))
        elif split in ['valid', 'test']:
            data = self.valid if split == 'valid' else self.test
            if self.dataset in ordered_datasets:
                data_iter = LMOrderedIterator(data, *args, **kwargs)
            elif self.dataset == 'lm1b':
                data_iter = LMShuffledIterator(data, *args, **kwargs)
            else:
                raise ValueError('Unsupported dataset: {}'.format(self.dataset))
        else:
            raise ValueError('Unsupported split: {}'.format(split))

        return data_iter


def get_lm_corpus(datadir, dataset):
    """Return the corpus for ``dataset`` stored under ``datadir``, using a cache.

    Lookup order: ``cache.pt`` (written with ``torch.save``), then
    ``cache.pkl`` (a plain pickle); if neither exists the corpus is built from
    the raw files and saved to ``cache.pt``.
    """
    fn = os.path.join(datadir, 'cache.pt')
    fn_pickle = os.path.join(datadir, 'cache.pkl')
    # NOTE(security): both cache formats are pickle-based; only point this at
    # directories whose contents are trusted.
    if os.path.exists(fn):
        logger.info('Loading cached dataset...')
        corpus = torch.load(fn)
    elif os.path.exists(fn_pickle):
        logger.info('Loading cached dataset from pickle...')
        with open(fn_pickle, "rb") as fp:
            corpus = pickle.load(fp)
    else:
        logger.info('Producing dataset {}...'.format(dataset))
        kwargs = {}
        if dataset in ['wt103', 'wt2']:
            kwargs['special'] = ['<eos>']
            kwargs['lower_case'] = False
        elif dataset == 'ptb':
            kwargs['special'] = ['<eos>']
            kwargs['lower_case'] = True
        elif dataset == 'lm1b':
            kwargs['special'] = []
            kwargs['lower_case'] = False
            kwargs['vocab_file'] = os.path.join(datadir, '1b_word_vocab.txt')
        elif dataset in ['enwik8', 'text8']:
            pass

        # The constructor forwards all of its arguments to the tokenizer, so
        # the data directory and dataset name must go to build_corpus instead.
        corpus = TransfoXLCorpus(**kwargs)
        corpus.build_corpus(datadir, dataset)
        torch.save(corpus, fn)

    return corpus