finetune_on_pregenerated.py 16.5 KB
Newer Older
jamin's avatar
jamin committed
1
2
3
4
from argparse import ArgumentParser
from pathlib import Path
import os
import torch
jamin's avatar
jamin committed
5
import logging
jamin's avatar
jamin committed
6
import json
7
import random
jamin's avatar
jamin committed
8
import numpy as np
9
from collections import namedtuple
10
from tempfile import TemporaryDirectory
11
12
13

from torch.utils.data import DataLoader, Dataset, RandomSampler
from torch.utils.data.distributed import DistributedSampler
14
from tqdm import tqdm
15

jamin's avatar
jamin committed
16
from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
thomwolf's avatar
thomwolf committed
17
from pytorch_transformers.modeling_bert import BertForPreTraining
jamin's avatar
jamin committed
18
from pytorch_transformers.tokenization_bert import BertTokenizer
jamin's avatar
jamin committed
19
from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule
20
21

# Fixed-length, padded representation of one pregenerated training example.
InputFeatures = namedtuple("InputFeatures", "input_ids input_mask segment_ids lm_label_ids is_next")

# Root logger configuration: timestamped messages at INFO level.
log_format = '%(asctime)-10s: %(message)s'
logging.basicConfig(level=logging.INFO, format=log_format)


def convert_example_to_features(example, tokenizer, max_seq_length):
    """Turn one pregenerated example into padded, fixed-length numpy arrays.

    Args:
        example: Mapping with the keys ``tokens``, ``segment_ids``,
            ``is_random_next``, ``masked_lm_positions`` and
            ``masked_lm_labels``.
        tokenizer: Object exposing ``convert_tokens_to_ids(tokens)``.
        max_seq_length: Length of every returned array.

    Returns:
        An ``InputFeatures`` tuple. ``input_ids`` and ``lm_label_ids`` are
        integer arrays, ``input_mask`` and ``segment_ids`` are boolean arrays,
        all of length ``max_seq_length``. ``is_next`` carries the example's
        ``is_random_next`` value unchanged.

    Raises:
        AssertionError: If ``tokens`` and ``segment_ids`` differ in length or
            are longer than ``max_seq_length``.
    """
    tokens = example["tokens"]
    segment_ids = example["segment_ids"]
    is_random_next = example["is_random_next"]
    masked_lm_positions = example["masked_lm_positions"]
    masked_lm_labels = example["masked_lm_labels"]

    # The preprocessed data should already be truncated.
    assert len(tokens) == len(segment_ids) <= max_seq_length, \
        "tokens and segment_ids must have equal length no greater than max_seq_length"
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    masked_label_ids = tokenizer.convert_tokens_to_ids(masked_lm_labels)

    # The builtin ``int``/``bool`` are what the removed ``np.int``/``np.bool``
    # aliases pointed at, so the resulting dtypes are unchanged on old NumPy
    # and the code keeps working on NumPy >= 1.24.
    input_array = np.zeros(max_seq_length, dtype=int)
    input_array[:len(input_ids)] = input_ids

    # True for real tokens, False for the zero padding after them.
    mask_array = np.zeros(max_seq_length, dtype=bool)
    mask_array[:len(input_ids)] = 1

    segment_array = np.zeros(max_seq_length, dtype=bool)
    segment_array[:len(segment_ids)] = segment_ids

    # -1 everywhere except at the masked positions, which hold the label ids.
    lm_label_array = np.full(max_seq_length, dtype=int, fill_value=-1)
    lm_label_array[masked_lm_positions] = masked_label_ids

    return InputFeatures(input_ids=input_array,
                         input_mask=mask_array,
                         segment_ids=segment_array,
                         lm_label_ids=lm_label_array,
                         is_next=is_random_next)


class PregeneratedDataset(Dataset):
    """Map-style dataset holding one epoch of pregenerated training examples.

    On construction the epoch's JSON-lines file is read completely and every
    example is converted into rows of five arrays (input ids, input mask,
    segment ids, LM label ids, is-next flag). With ``reduce_memory=True`` the
    arrays are disk-backed ``np.memmap`` files inside a temporary directory
    instead of in-memory arrays.
    """

    def __init__(self, training_path, epoch, tokenizer, num_data_epochs, reduce_memory=False):
        """Load ``epoch_{k}.json`` where ``k = epoch % num_data_epochs``.

        Args:
            training_path: ``pathlib.Path`` of the directory holding the
                ``epoch_{k}.json`` / ``epoch_{k}_metrics.json`` pairs.
            epoch: Index of the training epoch being run.
            tokenizer: Tokenizer exposing ``vocab``; it is also handed to
                ``convert_example_to_features``.
            num_data_epochs: Number of data epochs available on disk.
            reduce_memory: Store the arrays as memmaps in a temporary
                directory rather than in RAM.

        Raises:
            AssertionError: If the data or metrics file is missing, or the
                number of lines read differs from the metrics file's
                ``num_training_examples``.
        """
        self.vocab = tokenizer.vocab
        self.tokenizer = tokenizer
        self.epoch = epoch
        # Wrap around when fewer data epochs exist than training epochs.
        self.data_epoch = epoch % num_data_epochs
        data_file = training_path / f"epoch_{self.data_epoch}.json"
        metrics_file = training_path / f"epoch_{self.data_epoch}_metrics.json"
        assert data_file.is_file() and metrics_file.is_file()
        metrics = json.loads(metrics_file.read_text())
        num_samples = metrics['num_training_examples']
        seq_len = metrics['max_seq_len']

        self.temp_dir = None
        self.working_dir = None
        if reduce_memory:
            # The TemporaryDirectory object is kept on the instance so the
            # directory backing the memmaps is not cleaned up prematurely.
            self.temp_dir = TemporaryDirectory()
            self.working_dir = Path(self.temp_dir.name)

        # Builtin ``bool`` replaces the ``np.bool`` alias, which does not
        # exist on NumPy 1.24-1.26.
        matrix_shape = (num_samples, seq_len)
        input_ids = self._allocate('input_ids', matrix_shape, np.int32)
        input_masks = self._allocate('input_masks', matrix_shape, bool)
        segment_ids = self._allocate('segment_ids', matrix_shape, bool)
        lm_label_ids = self._allocate('lm_label_ids', matrix_shape, np.int32, fill_value=-1)
        is_nexts = self._allocate('is_nexts', (num_samples,), bool)

        logging.info(f"Loading training examples for epoch {epoch}")
        # Counted explicitly so an empty data file produces a clean assertion
        # failure instead of a NameError on an unbound loop variable.
        num_loaded = 0
        with data_file.open() as f:
            for i, line in enumerate(tqdm(f, total=num_samples, desc="Training examples")):
                example = json.loads(line.strip())
                features = convert_example_to_features(example, tokenizer, seq_len)
                input_ids[i] = features.input_ids
                segment_ids[i] = features.segment_ids
                input_masks[i] = features.input_mask
                lm_label_ids[i] = features.lm_label_ids
                is_nexts[i] = features.is_next
                num_loaded = i + 1
        # Assert that the sample count metric was true.
        assert num_loaded == num_samples, \
            f"Metrics file promised {num_samples} examples but {num_loaded} were read"
        logging.info("Loading complete!")

        self.num_samples = num_samples
        self.seq_len = seq_len
        self.input_ids = input_ids
        self.input_masks = input_masks
        self.segment_ids = segment_ids
        self.lm_label_ids = lm_label_ids
        self.is_nexts = is_nexts

    def _allocate(self, name, shape, dtype, fill_value=0):
        """Return a ``fill_value``-initialised array, memmapped if a working dir is set."""
        if self.working_dir is None:
            return np.full(shape, fill_value, dtype=dtype)
        array = np.memmap(filename=self.working_dir / f'{name}.memmap',
                          mode='w+', dtype=dtype, shape=shape)
        # A freshly created 'w+' memmap is zero-filled already.
        if fill_value != 0:
            array[:] = fill_value
        return array

    def __len__(self):
        """Return the number of examples in this epoch."""
        return self.num_samples

    def __getitem__(self, item):
        """Return the example at ``item`` as five int64 tensors.

        Order: input ids, input mask, segment ids, LM label ids, is-next flag.
        """
        return (torch.tensor(self.input_ids[item].astype(np.int64)),
                torch.tensor(self.input_masks[item].astype(np.int64)),
                torch.tensor(self.segment_ids[item].astype(np.int64)),
                torch.tensor(self.lm_label_ids[item].astype(np.int64)),
                torch.tensor(self.is_nexts[item].astype(np.int64)))


def main():
    """Fine-tune ``BertForPreTraining`` on pregenerated training data.

    Parses the command line, counts the available pregenerated data epochs,
    builds the model, optimizer and LR schedule (optionally with Apex AMP and
    distributed / multi-GPU wrappers), runs the training epochs and finally
    saves the model and tokenizer to ``--output_dir``.

    Raises:
        SystemExit: If no pregenerated data for epoch 0 is found.
        ValueError: If ``--gradient_accumulation_steps`` is smaller than 1.
        ImportError: If Apex is required (``--fp16`` or distributed training)
            but is not installed.
    """
    parser = ArgumentParser()
    parser.add_argument('--pregenerated_data', type=Path, required=True)
    parser.add_argument('--output_dir', type=Path, required=True)
    parser.add_argument("--bert_model", type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument("--reduce_memory", action="store_true",
                        help="Store training data as on-disc memmaps to massively reduce memory usage")

    parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                        "0 (default value): dynamic loss scaling.\n"
                        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    args = parser.parse_args()

    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"

    # Count how many consecutive data epochs exist on disk, starting at 0.
    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                # SystemExit directly: the ``exit`` builtin is injected by the
                # ``site`` module and is not guaranteed to exist.
                raise SystemExit("No training data was found!")
            print(f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).")
            print("This script will loop over the available data, but training diversity may be negatively impacted.")
            num_data_epochs = i
            break
    else:
        num_data_epochs = args.epochs

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        # With --no_cuda no GPU is used, so report zero; otherwise a CPU model
        # could end up wrapped in DataParallel further down.
        n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logging.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))

    # --train_batch_size is the total batch per optimizer update; each
    # forward/backward pass handles one accumulation slice of it.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(f"Output directory ({args.output_dir}) already exists and is not empty!")
    args.output_dir.mkdir(parents=True, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(
        total_train_examples / args.train_batch_size / args.gradient_accumulation_steps)
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model. No manual model.half(): Apex AMP handles casting.
    model = BertForPreTraining.from_pretrained(args.bert_model)
    model.to(device)

    # Prepare optimizer: biases and LayerNorm parameters get no weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
                                     t_total=num_train_optimization_steps)

    if args.fp16:
        try:
            from apex import amp
        except ImportError as err:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") from err
        # opt_level "O1" is the level Apex recommends for typical use.
        # --loss_scale: 0 selects dynamic scaling, anything else is static.
        loss_scale = "dynamic" if args.loss_scale == 0 else args.loss_scale
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", loss_scale=loss_scale)

    # Parallel wrappers are applied only after amp.initialize, as Apex requires.
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError as err:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") from err
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", args.train_batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)
    model.train()
    for epoch in range(args.epochs):
        epoch_dataset = PregeneratedDataset(epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer,
                                            num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory)
        if args.local_rank == -1:
            train_sampler = RandomSampler(epoch_dataset)
        else:
            train_sampler = DistributedSampler(epoch_dataset)
            # A new sampler is built every epoch; without set_epoch each one
            # would shuffle with the same seed and yield the same ordering.
            train_sampler.set_epoch(epoch)
        train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
        tr_loss = 0
        nb_tr_steps = 0
        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                outputs = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next)
                loss = outputs[0]
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    # AMP scales the loss before backward to avoid fp16 underflow.
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_steps += 1
                pbar.update(1)
                # Undo the accumulation division so the displayed loss is per batch.
                mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    optimizer.zero_grad()
                    global_step += 1

    # Save a trained model (only from the main process when distributed).
    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        logging.info("** ** * Saving fine-tuned model ** ** * ")
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)


if __name__ == '__main__':
    main()