# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""

from __future__ import absolute_import, division, print_function

import argparse
import logging
import os
import sys
import random
from tqdm import tqdm, trange

import numpy as np

import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from torch.nn import CrossEntropyLoss, MSELoss

from tensorboardX import SummaryWriter

from pytorch_pretrained_bert.file_utils import WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.modeling_xlnet import XLNetForSequenceClassification
from pytorch_pretrained_bert.tokenization_xlnet import XLNetTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule

from utils_glue import processors, output_modes, convert_examples_to_features, compute_metrics

if sys.version_info[0] == 2:
    import cPickle as pickle
else:
    import pickle


logger = logging.getLogger(__name__)


def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    # training
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--clip_gradients", default=1.0, type=float,
                        help="Clip gradient norms.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--log_every", default=10, type=int,
                        help="Log metrics every X training steps.")
    # evaluation
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Total batch size for eval.")
    # Model
    parser.add_argument("--xlnet_model", default="xlnet-large-cased", type=str,
                        help="XLNet pre-trained model: currently only xlnet-large-cased.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    # task specific
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    # Misc
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    args.device = device

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))

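    # The per-pass batch size is reduced so that the effective batch size per optimizer step
    # (after gradient accumulation) stays equal to the requested --train_batch_size.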
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
    tokenizer = XLNetTokenizer.from_pretrained(args.xlnet_model, do_lower_case=args.do_lower_case)
    model = XLNetForSequenceClassification.from_pretrained(args.xlnet_model, num_labels=num_labels)
    if args.local_rank == 0:
        torch.distributed.barrier()  # The first process has downloaded the model & vocab; the others can now load them from cache

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0

    if args.do_train:
        if args.local_rank in [-1, 0]:
            tb_writer = SummaryWriter()

        # Prepare data loader
        train_examples = processor.get_train_examples(args.data_dir)
        cached_train_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}'.format(
            list(filter(None, args.xlnet_model.split('/'))).pop(),
            str(args.max_seq_length),
            str(task_name)))
        if os.path.exists(cached_train_features_file):
            logger.info("Loading train features for cache file %s", cached_train_features_file)
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        else:
            logger.info("No cache file at %s, preparing train features", cached_train_features_file)
            train_features = convert_examples_to_features(
                train_examples, label_list, args.max_seq_length, tokenizer, output_mode,
                cls_token_at_end=True, cls_token=tokenizer.CLS_TOKEN,
                sep_token=tokenizer.SEP_TOKEN, cls_token_segment_id=2,
                pad_on_left=True, pad_token_segment_id=4)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving train features into cached file %s", cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        # Prepare optimizer

        optimizer_grouped_parameters = model.parameters()
        # param_optimizer = list(model.named_parameters())
        # no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        # optimizer_grouped_parameters = [
        #     {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        #     {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        #     ]
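        # Weight-decay parameter grouping (commented out above) is left disabled here; all
        # parameters are optimized with the same settings.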
        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
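            # loss_scale == 0 selects apex's dynamic loss scaling; any positive value is used as
            # a fixed static scale (see the --loss_scale help text).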
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
                                                 t_total=num_train_optimization_steps)

        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # Forward pass; the loss below is computed according to the task's output mode
                logits, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)

                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    loss = loss_fct(logits.view(-1), label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
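                # Divide the loss so that the gradients summed over the accumulation window
                # average over the accumulated mini-batches rather than summing them.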
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

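                # Clip the gradient norm after every backward pass; with accumulation > 1 the norm
                # covers the gradients accumulated so far in the current window.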
                if args.clip_gradients > 0.0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_gradients)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # Manually apply the warmup-adjusted learning rate; when fp16 is off,
                        # BertAdam handles the warmup schedule internally.
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
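                    # A non-positive --log_every logs at every update; otherwise metrics are
                    # written to TensorBoard every log_every batches.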
                    if args.local_rank in [-1, 0] and (args.log_every <= 0 or (step + 1) % args.log_every == 0):
                        tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
                        tb_writer.add_scalar('loss', loss.item(), global_step)

    ### Saving best-practices: if you use the default names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Unwrap DataParallel/DDP and save only the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = XLNetForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels)
        tokenizer = XLNetTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)

        # Good practice: save your training arguments together with the trained model
        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
        torch.save(args, output_args_file)
    else:
        model = XLNetForSequenceClassification.from_pretrained(args.xlnet_model, num_labels=num_labels)

    model.to(device)

    ### Evaluation
    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format(
            list(filter(None, args.xlnet_model.split('/'))).pop(),
            str(args.max_seq_length),
            str(task_name)))
        if os.path.exists(cached_eval_features_file):
            logger.info("Loading eval features for cache file %s", cached_eval_features_file)
            with open(cached_eval_features_file, "rb") as reader:
                eval_features = pickle.load(reader)
        else:
            logger.info("No cache file at %s, preparing eval features", cached_eval_features_file)
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode,
                cls_token_at_end=True, cls_token=tokenizer.CLS_TOKEN,
                sep_token=tokenizer.SEP_TOKEN, cls_token_segment_id=2,
                pad_on_left=True, pad_token_segment_id=4)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving eval features into cached file %s", cached_eval_features_file)
                with open(cached_eval_features_file, "wb") as writer:
                    pickle.dump(eval_features, writer)

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)

        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)  # Note that this sampler samples randomly
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        preds = []
        out_label_ids = None

        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)

            # Compute the eval loss with the loss function matching the task's output mode
            if output_mode == "classification":
                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
            elif output_mode == "regression":
                loss_fct = MSELoss()
                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
                out_label_ids = label_ids.detach().cpu().numpy()
            else:
                preds[0] = np.append(
                    preds[0], logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(
                    out_label_ids, label_ids.detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        preds = preds[0]
        if output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(task_name, preds, out_label_ids)

        loss = tr_loss/global_step if args.do_train else None

        result['eval_loss'] = eval_loss
        result['global_step'] = global_step
        result['loss'] = loss

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        # MNLI has a second "mismatched" dev set; evaluate it with the mnli-mm processor and write results to <output_dir>-MM
        if task_name == "mnli":
            task_name = "mnli-mm"
            processor = processors[task_name]()

            if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train:
                raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir + '-MM'))
            if not os.path.exists(args.output_dir + '-MM'):
                os.makedirs(args.output_dir + '-MM')

            eval_examples = processor.get_dev_examples(args.data_dir)
            # Apply the same XLNet input conventions used for the matched dev set above
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode,
                cls_token_at_end=True, cls_token=tokenizer.CLS_TOKEN,
                sep_token=tokenizer.SEP_TOKEN, cls_token_segment_id=2,
                pad_on_left=True, pad_token_segment_id=4)
            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            preds = []
            out_label_ids = None

            for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None)

                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))

                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1
                if len(preds) == 0:
                    preds.append(logits.detach().cpu().numpy())
                    out_label_ids = label_ids.detach().cpu().numpy()
                else:
                    preds[0] = np.append(
                        preds[0], logits.detach().cpu().numpy(), axis=0)
                    out_label_ids = np.append(
                        out_label_ids, label_ids.detach().cpu().numpy(), axis=0)

            eval_loss = eval_loss / nb_eval_steps
            preds = preds[0]
            preds = np.argmax(preds, axis=1)
            result = compute_metrics(task_name, preds, out_label_ids)

            loss = tr_loss/global_step if args.do_train else None

            result['eval_loss'] = eval_loss
            result['global_step'] = global_step
            result['loss'] = loss

            output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

if __name__ == "__main__":
    main()