tokenization_transfo_xl.py 21.2 KB
Newer Older
thomwolf's avatar
thomwolf committed
1
# coding=utf-8
thomwolf's avatar
thomwolf committed
2
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
thomwolf's avatar
thomwolf committed
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization classes for Transformer XL model.
17
    Adapted from https://github.com/kimiyoung/transformer-xl.
thomwolf's avatar
thomwolf committed
18
"""
19
from __future__ import absolute_import, division, print_function, unicode_literals
thomwolf's avatar
thomwolf committed
20

21
import glob
thomwolf's avatar
thomwolf committed
22
import logging
thomwolf's avatar
thomwolf committed
23
24
import os
import sys
thomwolf's avatar
thomwolf committed
25
from collections import Counter, OrderedDict
thomwolf's avatar
thomwolf committed
26
27
28
from io import open

import numpy as np
thomwolf's avatar
thomwolf committed
29
30

from .file_utils import cached_path
thomwolf's avatar
thomwolf committed
31
from .tokenization_utils import PreTrainedTokenizer
thomwolf's avatar
thomwolf committed
32

Aymeric Augustin's avatar
Aymeric Augustin committed
33

thomwolf's avatar
thomwolf committed
34
35
36
37
38
try:
    import torch
except ImportError:
    pass

39
40
41
42
if sys.version_info[0] == 2:
    import cPickle as pickle
else:
    import pickle
thomwolf's avatar
thomwolf committed
43
44


thomwolf's avatar
thomwolf committed
45
46
logger = logging.getLogger(__name__)

47
VOCAB_FILES_NAMES = {"pretrained_vocab_file": "vocab.bin", "vocab_file": "vocab.txt"}
48
49

PRETRAINED_VOCAB_FILES_MAP = {
50
51
    "pretrained_vocab_file": {
        "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin",
52
53
54
55
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
56
    "transfo-xl-wt103": None,
thomwolf's avatar
thomwolf committed
57
}
58
59

PRETRAINED_CORPUS_ARCHIVE_MAP = {
60
    "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-corpus.bin",
thomwolf's avatar
thomwolf committed
61
}
62
63
CORPUS_NAME = "corpus.bin"

thomwolf's avatar
thomwolf committed
64

65
class TransfoXLTokenizer(PreTrainedTokenizer):
    """
    Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
        self,
        special=None,
        min_freq=0,
        max_size=None,
        lower_case=False,
        delimiter=None,
        vocab_file=None,
        pretrained_vocab_file=None,
        never_split=None,
        unk_token="<unk>",
        eos_token="<eos>",
        additional_special_tokens=None,
        **kwargs
    ):
        # Bug fix: `additional_special_tokens` previously used a mutable list
        # default (["<formula>"]) shared across all calls; keep the same
        # effective default via a None sentinel instead.
        if additional_special_tokens is None:
            additional_special_tokens = ["<formula>"]
        super(TransfoXLTokenizer, self).__init__(
            unk_token=unk_token, eos_token=eos_token, additional_special_tokens=additional_special_tokens, **kwargs
        )

        self.max_len_single_sentence = (
            self.max_len
        )  # no default special tokens - you can update this value if you add special tokens
        self.max_len_sentences_pair = (
            self.max_len
        )  # no default special tokens - you can update this value if you add special tokens

        if never_split is None:
            never_split = self.all_special_tokens
        if special is None:
            special = []

        self.counter = Counter()
        self.special = special
        self.min_freq = min_freq
        self.max_size = max_size
        self.lower_case = lower_case
        self.delimiter = delimiter
        self.vocab_file = vocab_file
        self.never_split = never_split

        if pretrained_vocab_file is not None:
            # Hack because, honestly this tokenizer was not made to be used
            # in a library like ours, at all.
            # SECURITY NOTE(review): torch.load unpickles arbitrary objects —
            # only load vocabulary files from trusted sources.
            vocab_dict = torch.load(pretrained_vocab_file)
            for key, value in vocab_dict.items():
                if key not in self.__dict__:
                    self.__dict__[key] = value

        if vocab_file is not None:
            self.build_vocab()

    def count_file(self, path, verbose=False, add_eos=False):
        """Tokenize a text file line by line and update the symbol counter.

        Returns the list of tokenized sentences (one list of symbols per line).
        """
        if verbose:
            logger.info("counting file {} ...".format(path))
        assert os.path.exists(path)

        sents = []
        with open(path, "r", encoding="utf-8") as f:
            for idx, line in enumerate(f):
                if verbose and idx > 0 and idx % 500000 == 0:
                    logger.info("    line {}".format(idx))
                symbols = self.tokenize(line, add_eos=add_eos)
                self.counter.update(symbols)
                sents.append(symbols)

        return sents

    def count_sents(self, sents, verbose=False):
        """
            sents : a list of sentences, each a list of tokenized symbols
        """
        if verbose:
            logger.info("counting {} sents ...".format(len(sents)))
        for idx, symbols in enumerate(sents):
            if verbose and idx > 0 and idx % 500000 == 0:
                logger.info("    line {}".format(idx))
            self.counter.update(symbols)

    def _build_from_file(self, vocab_file):
        """Build the vocabulary from a file with one symbol per line."""
        self.idx2sym = []
        self.sym2idx = OrderedDict()

        with open(vocab_file, "r", encoding="utf-8") as f:
            for line in f:
                symb = line.strip().split()[0]
                self.add_symbol(symb)
        if "<UNK>" in self.sym2idx:
            self.unk_idx = self.sym2idx["<UNK>"]
        elif "<unk>" in self.sym2idx:
            self.unk_idx = self.sym2idx["<unk>"]
        else:
            # Bug fix: corrected the "<unkown>" typo in the error message.
            raise ValueError("No <unk> token in vocabulary")

    def save_vocabulary(self, vocab_path):
        """Save the tokenizer vocabulary to a directory or file."""
        if os.path.isdir(vocab_path):
            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["pretrained_vocab_file"])
        else:
            # Bug fix: `vocab_file` was previously left undefined (NameError)
            # when `vocab_path` pointed to a file rather than a directory.
            vocab_file = vocab_path
        torch.save(self.__dict__, vocab_file)
        return (vocab_file,)

    def build_vocab(self):
        """Build the vocabulary, from `self.vocab_file` if set, else from counts."""
        if self.vocab_file:
            logger.info("building vocab from {}".format(self.vocab_file))
            self._build_from_file(self.vocab_file)
            logger.info("final vocab size {}".format(len(self)))
        else:
            logger.info("building vocab with min_freq={}, max_size={}".format(self.min_freq, self.max_size))
            self.idx2sym = []
            self.sym2idx = OrderedDict()

            # Special symbols come first so their indices are stable.
            for sym in self.special:
                self.add_special(sym)

            for sym, cnt in self.counter.most_common(self.max_size):
                if cnt < self.min_freq:
                    break
                self.add_symbol(sym)

            logger.info("final vocab size {} from {} unique tokens".format(len(self), len(self.counter)))

    def encode_file(self, path, ordered=False, verbose=False, add_eos=True, add_double_eos=False):
        """Tokenize and numericalize a file.

        Returns a list of LongTensors (one per line), or a single
        concatenated LongTensor when `ordered` is True.
        """
        if verbose:
            logger.info("encoding file {} ...".format(path))
        assert os.path.exists(path)
        encoded = []
        with open(path, "r", encoding="utf-8") as f:
            for idx, line in enumerate(f):
                if verbose and idx > 0 and idx % 500000 == 0:
                    logger.info("    line {}".format(idx))
                symbols = self.tokenize(line, add_eos=add_eos, add_double_eos=add_double_eos)
                encoded.append(self.convert_to_tensor(symbols))

        if ordered:
            encoded = torch.cat(encoded)

        return encoded

    def encode_sents(self, sents, ordered=False, verbose=False):
        """Numericalize pre-tokenized sentences; see `encode_file`."""
        if verbose:
            logger.info("encoding {} sents ...".format(len(sents)))
        encoded = []
        for idx, symbols in enumerate(sents):
            if verbose and idx > 0 and idx % 500000 == 0:
                logger.info("    line {}".format(idx))
            encoded.append(self.convert_to_tensor(symbols))

        if ordered:
            encoded = torch.cat(encoded)

        return encoded

    def add_special(self, sym):
        """Add a special symbol and expose its index as `self.<name>_idx`."""
        if sym not in self.sym2idx:
            self.idx2sym.append(sym)
            self.sym2idx[sym] = len(self.idx2sym) - 1
            setattr(self, "{}_idx".format(sym.strip("<>")), self.sym2idx[sym])

    def add_symbol(self, sym):
        """Add a regular symbol to the vocabulary if not already present."""
        if sym not in self.sym2idx:
            self.idx2sym.append(sym)
            self.sym2idx[sym] = len(self.idx2sym) - 1

    def _convert_id_to_token(self, idx):
        """Converts an id in a token (BPE) using the vocab."""
        assert 0 <= idx < len(self), "Index {} out of vocabulary range".format(idx)
        return self.idx2sym[idx]

    def _convert_token_to_id(self, sym):
        """ Converts a token (str/unicode) in an id using the vocab. """
        if sym in self.sym2idx:
            return self.sym2idx[sym]
        else:
            # logger.info('encounter unk {}'.format(sym))
            # assert '<eos>' not in sym
            if hasattr(self, "unk_idx"):
                return self.sym2idx.get(sym, self.unk_idx)
            # Backward compatibility with pre-trained models
            elif "<unk>" in self.sym2idx:
                return self.sym2idx["<unk>"]
            elif "<UNK>" in self.sym2idx:
                return self.sym2idx["<UNK>"]
            else:
                raise ValueError("Token not in vocabulary and no <unk> token in vocabulary for replacement")

    def convert_tokens_to_string(self, tokens):
        """ Converts a sequence of tokens (string) in a single string. """
        out_string = " ".join(tokens).strip()
        return out_string

    def convert_to_tensor(self, symbols):
        """Numericalize a list of symbols into a LongTensor of ids."""
        return torch.LongTensor(self.convert_tokens_to_ids(symbols))

    @property
    def vocab_size(self):
        # Number of symbols currently in the vocabulary.
        return len(self.idx2sym)

    def _tokenize(self, line, add_eos=False, add_double_eos=False):
        """Split a raw line into symbols, optionally adding eos markers."""
        line = line.strip()
        # convert to lower case
        if self.lower_case:
            line = line.lower()

        # empty delimiter '' will evaluate False
        if self.delimiter == "":
            symbols = line
        else:
            symbols = line.split(self.delimiter)

        if add_double_eos:  # lm1b
            return ["<S>"] + symbols + ["<S>"]
        elif add_eos:
            return symbols + ["<eos>"]
        else:
            return symbols


class LMOrderedIterator(object):
    def __init__(self, data, bsz, bptt, device="cpu", ext_len=None):
        """
            data -- LongTensor -- the LongTensor is strictly ordered
        """
        self.bsz = bsz
        self.bptt = bptt
        self.ext_len = 0 if ext_len is None else ext_len
        self.device = device

        # Number of full rows of `bsz` tokens the stream can provide.
        self.n_step = data.size(0) // bsz

        # Drop the ragged tail so the stream divides evenly, then lay it out
        # as [n_step x bsz] with each column a contiguous chunk of the stream.
        trimmed = data.narrow(0, 0, self.n_step * bsz)
        self.data = trimmed.view(bsz, -1).t().contiguous().to(device)

        # Number of mini-batches per epoch.
        self.n_batch = (self.n_step + self.bptt - 1) // self.bptt

    def get_batch(self, i, bptt=None):
        """Return (input, target, seq_len) for the batch starting at row i."""
        bptt = self.bptt if bptt is None else bptt
        seq_len = min(bptt, self.data.size(0) - 1 - i)

        # Inputs may include up to `ext_len` rows of extended context.
        lo = max(0, i - self.ext_len)
        hi = i + seq_len

        src = self.data[lo:hi]
        tgt = self.data[i + 1 : i + 1 + seq_len]

        src_out = src.transpose(0, 1).contiguous().to(self.device)
        tgt_out = tgt.transpose(0, 1).contiguous().to(self.device)

        return src_out, tgt_out, seq_len

    def get_fixlen_iter(self, start=0):
        """Yield fixed-length batches covering the whole stream."""
        for row in range(start, self.data.size(0) - 1, self.bptt):
            yield self.get_batch(row)

    def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3):
        """Yield batches whose length varies randomly around `bptt`."""
        max_len = self.bptt + max_deviation * std
        i = start
        while True:
            # Occasionally halve bptt, then perturb it with Gaussian noise.
            bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.0
            bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std))))
            data, target, seq_len = self.get_batch(i, bptt)
            i += seq_len
            yield data, target, seq_len
            if i >= self.data.size(0) - 2:
                break

    def __iter__(self):
        return self.get_fixlen_iter()


class LMShuffledIterator(object):
349
    def __init__(self, data, bsz, bptt, device="cpu", ext_len=None, shuffle=False):
thomwolf's avatar
thomwolf committed
350
351
352
353
354
355
356
357
358
359
360
361
362
363
        """
            data -- list[LongTensor] -- there is no order among the LongTensors
        """
        self.data = data

        self.bsz = bsz
        self.bptt = bptt
        self.ext_len = ext_len if ext_len is not None else 0

        self.device = device
        self.shuffle = shuffle

    def get_sent_stream(self):
        # index iterator
364
        epoch_indices = np.random.permutation(len(self.data)) if self.shuffle else np.array(range(len(self.data)))
thomwolf's avatar
thomwolf committed
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395

        # sentence iterator
        for idx in epoch_indices:
            yield self.data[idx]

    def stream_iterator(self, sent_stream):
        # streams for each data in the batch
        streams = [None] * self.bsz

        data = torch.LongTensor(self.bptt, self.bsz)
        target = torch.LongTensor(self.bptt, self.bsz)

        n_retain = 0

        while True:
            # data   : [n_retain+bptt x bsz]
            # target : [bptt x bsz]
            data[n_retain:].fill_(-1)
            target.fill_(-1)

            valid_batch = True

            for i in range(self.bsz):
                n_filled = 0
                try:
                    while n_filled < self.bptt:
                        if streams[i] is None or len(streams[i]) <= 1:
                            streams[i] = next(sent_stream)
                        # number of new tokens to fill in
                        n_new = min(len(streams[i]) - 1, self.bptt - n_filled)
                        # first n_retain tokens are retained from last batch
396
397
                        data[n_retain + n_filled : n_retain + n_filled + n_new, i] = streams[i][:n_new]
                        target[n_filled : n_filled + n_new, i] = streams[i][1 : n_new + 1]
thomwolf's avatar
thomwolf committed
398
399
400
401
402
403
404
405
406
                        streams[i] = streams[i][n_new:]
                        n_filled += n_new
                except StopIteration:
                    valid_batch = False
                    break

            if not valid_batch:
                return

407
408
            data_out = data.transpose(0, 1).contiguous().to(self.device)
            target_out = target.transpose(0, 1).contiguous().to(self.device)
thomwolf's avatar
thomwolf committed
409

410
            yield data_out, target_out, self.bptt
thomwolf's avatar
thomwolf committed
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425

            n_retain = min(data.size(0), self.ext_len)
            if n_retain > 0:
                data[:n_retain] = data[-n_retain:]
            data.resize_(n_retain + self.bptt, data.size(1))

    def __iter__(self):
        # sent_stream is an iterator
        sent_stream = self.get_sent_stream()

        for batch in self.stream_iterator(sent_stream):
            yield batch


class LMMultiFileIterator(LMShuffledIterator):
    def __init__(self, paths, vocab, bsz, bptt, device="cpu", ext_len=None, shuffle=False):
        """Shuffled LM iterator over several files, encoded lazily per file."""
        self.paths = paths
        self.vocab = vocab
        self.bsz = bsz
        self.bptt = bptt
        self.ext_len = 0 if ext_len is None else ext_len
        self.device = device
        self.shuffle = shuffle

    def get_sent_stream(self, path):
        """Encode one file and return an iterator over its sentences."""
        sents = self.vocab.encode_file(path, add_double_eos=True)
        if self.shuffle:
            np.random.shuffle(sents)
        return iter(sents)

    def __iter__(self):
        if self.shuffle:
            np.random.shuffle(self.paths)

        for path in self.paths:
            # Each file contributes its own sentence stream.
            for batch in self.stream_iterator(self.get_sent_stream(path)):
                yield batch


457
458
459
460
461
462
463
464
465
466
467
468
469
470
class TransfoXLCorpus(object):
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
        """
        Instantiate a pre-processed corpus.
        """
        tokenizer = TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

        # Resolve the corpus location: a known shortcut name or a local dir.
        if pretrained_model_name_or_path in PRETRAINED_CORPUS_ARCHIVE_MAP:
            corpus_file = PRETRAINED_CORPUS_ARCHIVE_MAP[pretrained_model_name_or_path]
        else:
            corpus_file = os.path.join(pretrained_model_name_or_path, CORPUS_NAME)

        # redirect to the cache, if necessary
        try:
            resolved_corpus_file = cached_path(corpus_file, cache_dir=cache_dir)
        except EnvironmentError:
            logger.error(
                "Corpus '{}' was not found in corpus list ({}). "
                "We assumed '{}' was a path or url but couldn't find files {} "
                "at this path or url.".format(
                    pretrained_model_name_or_path,
                    ", ".join(PRETRAINED_CORPUS_ARCHIVE_MAP.keys()),
                    pretrained_model_name_or_path,
                    corpus_file,
                )
            )
            return None

        if resolved_corpus_file == corpus_file:
            logger.info("loading corpus file {}".format(corpus_file))
        else:
            logger.info("loading corpus file {} from cache at {}".format(corpus_file, resolved_corpus_file))

        # Instantiate tokenizer.
        corpus = cls(*inputs, **kwargs)
        state = torch.load(resolved_corpus_file)
        for attr, val in state.items():
            corpus.__dict__[attr] = val
        corpus.vocab = tokenizer

        # Convert any loaded splits to LongTensors.
        for split_name in ("train", "valid", "test"):
            split_data = getattr(corpus, split_name)
            if split_data is not None:
                setattr(corpus, split_name, torch.tensor(split_data, dtype=torch.long))

        return corpus

    def __init__(self, *args, **kwargs):
        self.vocab = TransfoXLTokenizer(*args, **kwargs)
        self.dataset = None
        self.train = None
        self.valid = None
        self.test = None

    def build_corpus(self, path, dataset):
        """Count tokens, build the vocab, and encode each split of `dataset`."""
        self.dataset = dataset

        # First pass: gather token counts (lm1b instead locates its shards).
        if self.dataset in ["ptb", "wt2", "enwik8", "text8"]:
            self.vocab.count_file(os.path.join(path, "train.txt"))
            self.vocab.count_file(os.path.join(path, "valid.txt"))
            self.vocab.count_file(os.path.join(path, "test.txt"))
        elif self.dataset == "wt103":
            self.vocab.count_file(os.path.join(path, "train.txt"))
        elif self.dataset == "lm1b":
            shard_pattern = os.path.join(
                path,
                "1-billion-word-language-modeling-benchmark-r13output",
                "training-monolingual.tokenized.shuffled",
                "news.en-*",
            )
            shard_paths = glob.glob(shard_pattern)
            # the vocab will load from file when build_vocab() is called

        self.vocab.build_vocab()

        # Second pass: numericalize the splits with dataset-specific flags.
        if self.dataset in ["ptb", "wt2", "wt103"]:
            self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True)
            self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True)
            self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True)
        elif self.dataset in ["enwik8", "text8"]:
            self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True, add_eos=False)
            self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True, add_eos=False)
            self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True, add_eos=False)
        elif self.dataset == "lm1b":
            # lm1b trains from many shard files, encoded lazily at iteration time.
            self.train = shard_paths
            self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=False, add_double_eos=True)
            self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=False, add_double_eos=True)

    def get_iterator(self, split, *args, **kwargs):
        """Return the batch iterator appropriate for `split` and this dataset."""
        if split == "train":
            if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]:
                iterator = LMOrderedIterator(self.train, *args, **kwargs)
            elif self.dataset == "lm1b":
                kwargs["shuffle"] = True
                iterator = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs)
        elif split in ["valid", "test"]:
            eval_data = self.valid if split == "valid" else self.test
            if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]:
                iterator = LMOrderedIterator(eval_data, *args, **kwargs)
            elif self.dataset == "lm1b":
                iterator = LMShuffledIterator(eval_data, *args, **kwargs)

        return iterator


def get_lm_corpus(datadir, dataset):
    """Load a cached TransfoXLCorpus from `datadir`, building it if needed.

    Prefers the torch cache (`cache.pt`), then a legacy pickle cache
    (`cache.pkl`); otherwise builds the corpus and saves the torch cache.
    """
    fn = os.path.join(datadir, "cache.pt")
    fn_pickle = os.path.join(datadir, "cache.pkl")
    if os.path.exists(fn):
        logger.info("Loading cached dataset...")
        # Bug fix: the torch cache is saved to `fn` (cache.pt) below, but the
        # old code loaded `fn_pickle` here, failing whenever only cache.pt
        # existed.
        corpus = torch.load(fn)
    elif os.path.exists(fn_pickle):
        # Bug fix: this branch previously re-tested `fn` and was unreachable.
        logger.info("Loading cached dataset from pickle...")
        with open(fn_pickle, "rb") as fp:
            corpus = pickle.load(fp)
    else:
        logger.info("Producing dataset {}...".format(dataset))
        # Dataset-specific tokenizer options.
        kwargs = {}
        if dataset in ["wt103", "wt2"]:
            kwargs["special"] = ["<eos>"]
            kwargs["lower_case"] = False
        elif dataset == "ptb":
            kwargs["special"] = ["<eos>"]
            kwargs["lower_case"] = True
        elif dataset == "lm1b":
            kwargs["special"] = []
            kwargs["lower_case"] = False
            kwargs["vocab_file"] = os.path.join(datadir, "1b_word_vocab.txt")
        elif dataset in ["enwik8", "text8"]:
            pass

        corpus = TransfoXLCorpus(datadir, dataset, **kwargs)
        torch.save(corpus, fn)

    return corpus