# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""

from __future__ import absolute_import, division, print_function

import argparse
import logging
import os
import sys
import random
from tqdm import tqdm, trange

import numpy as np

import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from torch.nn import CrossEntropyLoss, MSELoss

from tensorboardX import SummaryWriter

from pytorch_pretrained_bert.file_utils import WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.modeling_xlnet import XLNetForSequenceClassification
from pytorch_pretrained_bert.tokenization_xlnet import XLNetTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule

from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics

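# pickle (cPickle on Python 2) is used below to cache the tokenized features on disk.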
if sys.version_info[0] == 2:
    import cPickle as pickle
else:
    import pickle


logger = logging.getLogger(__name__)


def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--xlnet_model", default="xlnet-large-cased", type=str,
                        help="XLNet pre-trained model: currently only xlnet-large-cased.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--cache_dir",
                        default="",
                        type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after SentencePiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA even when it is available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    args.device = device

    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt = '%m/%d/%Y %H:%M:%S',
                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))

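    # The per-step batch size is divided by the number of accumulation steps, so the effective
    # batch size per optimizer update stays at the requested --train_batch_size.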
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
    tokenizer = XLNetTokenizer.from_pretrained(args.xlnet_model, do_lower_case=args.do_lower_case)
    model = XLNetForSequenceClassification.from_pretrained(args.xlnet_model, num_labels=num_labels)
    if args.local_rank == 0:
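        # Rank 0 has finished downloading; release the other processes waiting at the barrier above.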
        torch.distributed.barrier()

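    # Wrap the model for fp16, distributed (DistributedDataParallel) or single-node
    # multi-GPU (DataParallel) training, depending on the launch configuration.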
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0

    if args.do_train:
        if args.local_rank in [-1, 0]:
            tb_writer = SummaryWriter()

        # Prepare data loader
        train_examples = processor.get_train_examples(args.data_dir)
        cached_train_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}'.format(
            list(filter(None, args.xlnet_model.split('/'))).pop(),
            str(args.max_seq_length),
            str(task_name)))
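        # Reuse cached features when available; otherwise tokenize from scratch and (on the
        # main process only) write the cache for subsequent runs.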
        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        except Exception:
            train_features = convert_examples_to_features(
                train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving train features into cached file %s", cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

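        # One optimizer update every `gradient_accumulation_steps` batches, for `num_train_epochs` epochs.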
        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        # Prepare optimizer
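        # Weight decay is applied to all parameters except biases and LayerNorm weights.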

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]
        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
                                                 t_total=num_train_optimization_steps)

        else:
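            # BertAdam applies the warmup/linear-decay schedule and gradient clipping internally.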
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # Compute the loss for both output modes (classification and regression)
                logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)

                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    loss = loss_fct(logits.view(-1), label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

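                # With fp16, FP16_Optimizer.backward applies loss scaling before backpropagation.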
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # Manually apply the linear warmup schedule to the learning rate.
                        # If args.fp16 is False, BertAdam handles this automatically.
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                    if args.local_rank in [-1, 0]:
                        tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
                        tb_writer.add_scalar('loss', loss.item(), global_step)

    ### Saving best-practices: if you use default names for the model, you can reload it using from_pretrained()
    ### Example:
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = XLNetForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels)
        tokenizer = XLNetTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)

        # Good practice: save your training arguments together with the trained model
        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
        torch.save(args, output_args_file)
    else:
        model = XLNetForSequenceClassification.from_pretrained(args.xlnet_model, num_labels=num_labels)

    model.to(device)

    ### Evaluation
    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format(
            list(filter(None, args.xlnet_model.split('/'))).pop(),
            str(args.max_seq_length),
            str(task_name)))
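        # Same on-disk caching strategy as for the training features.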
        try:
            with open(cached_eval_features_file, "rb") as reader:
                eval_features = pickle.load(reader)
        except Exception:
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving eval features into cached file %s", cached_eval_features_file)
                with open(cached_eval_features_file, "wb") as writer:
                    pickle.dump(eval_features, writer)


        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)

        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)  # Note that this sampler samples randomly
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        preds = []
        out_label_ids = None
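        # Accumulate logits and gold labels across batches; metrics are computed once at the end.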

        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)

            # Compute the eval loss with the loss function matching the task's output mode
            if output_mode == "classification":
                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
            elif output_mode == "regression":
                loss_fct = MSELoss()
                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
                out_label_ids = label_ids.detach().cpu().numpy()
            else:
                preds[0] = np.append(
                    preds[0], logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(
                    out_label_ids, label_ids.detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        preds = preds[0]
        if output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(task_name, preds, out_label_ids)

        loss = tr_loss/global_step if args.do_train else None

        result['eval_loss'] = eval_loss
        result['global_step'] = global_step
        result['loss'] = loss

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        # Hack for MNLI-MM: MNLI also has a "mismatched" dev set, which is evaluated separately
        # and written to <output_dir>-MM.
        if task_name == "mnli":
            task_name = "mnli-mm"
            processor = processors[task_name]()

            if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train:
                raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir + '-MM'))
            if not os.path.exists(args.output_dir + '-MM'):
                os.makedirs(args.output_dir + '-MM')

            eval_examples = processor.get_dev_examples(args.data_dir)
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            preds = []
            out_label_ids = None

            for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None)

                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))

                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1
                if len(preds) == 0:
                    preds.append(logits.detach().cpu().numpy())
                    out_label_ids = label_ids.detach().cpu().numpy()
                else:
                    preds[0] = np.append(
                        preds[0], logits.detach().cpu().numpy(), axis=0)
                    out_label_ids = np.append(
                        out_label_ids, label_ids.detach().cpu().numpy(), axis=0)

            eval_loss = eval_loss / nb_eval_steps
            preds = preds[0]
            preds = np.argmax(preds, axis=1)
            result = compute_metrics(task_name, preds, out_label_ids)

            loss = tr_loss/global_step if args.do_train else None

            result['eval_loss'] = eval_loss
            result['global_step'] = global_step
            result['loss'] = loss

            output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

if __name__ == "__main__":
    main()