finetune_on_pregenerated.py 16.5 KB
Newer Older
1
import json
jamin's avatar
jamin committed
2
import logging
3
import random
jamin's avatar
jamin committed
4
from argparse import ArgumentParser
5
from collections import namedtuple
jamin's avatar
jamin committed
6
from pathlib import Path
7
from tempfile import TemporaryDirectory
8

jamin's avatar
jamin committed
9
10
import numpy as np
import torch
11
12
from torch.utils.data import DataLoader, Dataset, RandomSampler
from torch.utils.data.distributed import DistributedSampler
13
from tqdm import tqdm
14

thomwolf's avatar
thomwolf committed
15
from pytorch_transformers.modeling_bert import BertForPreTraining
thomwolf's avatar
thomwolf committed
16
from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule
jamin's avatar
jamin committed
17
from pytorch_transformers.tokenization_bert import BertTokenizer
18
19

# Per-example feature record produced by convert_example_to_features.
InputFeatures = namedtuple(
    "InputFeatures",
    ["input_ids", "input_mask", "segment_ids", "lm_label_ids", "is_next"],
)
Matthew Carrigan's avatar
Matthew Carrigan committed
20
21
22

# Root logger setup: INFO level, each record prefixed with a timestamp.
log_format = '%(asctime)-10s: %(message)s'
logging.basicConfig(format=log_format, level=logging.INFO)
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56


def convert_example_to_features(example, tokenizer, max_seq_length):
    """Convert one pregenerated example into fixed-length NumPy feature arrays.

    Args:
        example: Mapping read with the keys "tokens", "segment_ids",
            "is_random_next", "masked_lm_positions" and "masked_lm_labels".
        tokenizer: Object exposing ``convert_tokens_to_ids``; it is applied to
            both the tokens and the masked-LM label tokens.
        max_seq_length: Length of every returned array; shorter inputs are
            padded on the right.

    Returns:
        InputFeatures whose array fields all have length ``max_seq_length``:
        ``input_ids`` (token ids, zero padded), ``input_mask`` (True on real
        tokens), ``segment_ids`` (boolean), ``lm_label_ids`` (-1 everywhere
        except at the masked positions). ``is_next`` carries the example's
        "is_random_next" value unchanged.

    Raises:
        AssertionError: If tokens and segment ids differ in length or exceed
            ``max_seq_length``.
    """
    tokens = example["tokens"]
    segment_ids = example["segment_ids"]
    is_random_next = example["is_random_next"]
    masked_lm_positions = example["masked_lm_positions"]
    masked_lm_labels = example["masked_lm_labels"]

    # The preprocessed data should already be truncated.
    assert len(tokens) == len(segment_ids) <= max_seq_length
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    masked_label_ids = tokenizer.convert_tokens_to_ids(masked_lm_labels)

    # Builtin int/bool are used as dtypes: the np.int / np.bool aliases (which
    # simply pointed at these builtins) were removed in NumPy 1.24.
    input_array = np.zeros(max_seq_length, dtype=int)
    input_array[:len(input_ids)] = input_ids

    mask_array = np.zeros(max_seq_length, dtype=bool)
    mask_array[:len(input_ids)] = 1

    segment_array = np.zeros(max_seq_length, dtype=bool)
    segment_array[:len(segment_ids)] = segment_ids

    # -1 marks positions that carry no masked-LM label.
    lm_label_array = np.full(max_seq_length, dtype=int, fill_value=-1)
    lm_label_array[masked_lm_positions] = masked_label_ids

    features = InputFeatures(input_ids=input_array,
                             input_mask=mask_array,
                             segment_ids=segment_array,
                             lm_label_ids=lm_label_array,
                             is_next=is_random_next)
    return features


class PregeneratedDataset(Dataset):
    """Dataset holding one epoch of pregenerated training examples as arrays.

    On construction the epoch's JSON-lines file is read completely and every
    example is converted with ``convert_example_to_features``. The resulting
    arrays live either in memory or, with ``reduce_memory=True``, in
    ``np.memmap`` files inside a temporary directory owned by this object.
    """

    def __init__(self, training_path, epoch, tokenizer, num_data_epochs, reduce_memory=False):
        """Load ``epoch_{k}.json`` where ``k = epoch % num_data_epochs``.

        Args:
            training_path: ``pathlib.Path`` of the directory containing
                ``epoch_{k}.json`` and ``epoch_{k}_metrics.json``.
            epoch: Training epoch index; wrapped with ``num_data_epochs`` to
                pick the data file.
            tokenizer: Tokenizer passed on to ``convert_example_to_features``;
                its ``vocab`` attribute is also stored.
            num_data_epochs: Number of distinct pregenerated epochs available.
            reduce_memory: If True, back the arrays with on-disk memmaps.

        Raises:
            AssertionError: If the data or metrics file is missing, or the
                number of lines read differs from the metrics file's
                ``num_training_examples``.
        """
        self.vocab = tokenizer.vocab
        self.tokenizer = tokenizer
        self.epoch = epoch
        self.data_epoch = epoch % num_data_epochs
        data_file = training_path / f"epoch_{self.data_epoch}.json"
        metrics_file = training_path / f"epoch_{self.data_epoch}_metrics.json"
        assert data_file.is_file() and metrics_file.is_file()
        metrics = json.loads(metrics_file.read_text())
        num_samples = metrics['num_training_examples']
        seq_len = metrics['max_seq_len']
        self.temp_dir = None
        self.working_dir = None
        # Builtin bool replaces the np.bool alias, which was removed in NumPy 1.24.
        if reduce_memory:
            # Keep a reference to the TemporaryDirectory so the memmap files
            # stay on disk for as long as this dataset object exists.
            self.temp_dir = TemporaryDirectory()
            self.working_dir = Path(self.temp_dir.name)
            input_ids = np.memmap(filename=self.working_dir / 'input_ids.memmap',
                                  mode='w+', dtype=np.int32, shape=(num_samples, seq_len))
            input_masks = np.memmap(filename=self.working_dir / 'input_masks.memmap',
                                    shape=(num_samples, seq_len), mode='w+', dtype=bool)
            segment_ids = np.memmap(filename=self.working_dir / 'segment_ids.memmap',
                                    shape=(num_samples, seq_len), mode='w+', dtype=bool)
            lm_label_ids = np.memmap(filename=self.working_dir / 'lm_label_ids.memmap',
                                     shape=(num_samples, seq_len), mode='w+', dtype=np.int32)
            # A fresh memmap is zero-filled; -1 is the "no label" marker.
            lm_label_ids[:] = -1
            is_nexts = np.memmap(filename=self.working_dir / 'is_nexts.memmap',
                                 shape=(num_samples,), mode='w+', dtype=bool)
        else:
            input_ids = np.zeros(shape=(num_samples, seq_len), dtype=np.int32)
            input_masks = np.zeros(shape=(num_samples, seq_len), dtype=bool)
            segment_ids = np.zeros(shape=(num_samples, seq_len), dtype=bool)
            lm_label_ids = np.full(shape=(num_samples, seq_len), dtype=np.int32, fill_value=-1)
            is_nexts = np.zeros(shape=(num_samples,), dtype=bool)
        logging.info(f"Loading training examples for epoch {epoch}")
        # Counted explicitly so that an empty data file fails the check below
        # instead of raising UnboundLocalError on the loop variable.
        num_loaded = 0
        with data_file.open() as f:
            for i, line in enumerate(tqdm(f, total=num_samples, desc="Training examples")):
                line = line.strip()
                example = json.loads(line)
                features = convert_example_to_features(example, tokenizer, seq_len)
                input_ids[i] = features.input_ids
                segment_ids[i] = features.segment_ids
                input_masks[i] = features.input_mask
                lm_label_ids[i] = features.lm_label_ids
                is_nexts[i] = features.is_next
                num_loaded = i + 1
        # Assert that the sample count recorded in the metrics file was true.
        assert num_loaded == num_samples
        logging.info("Loading complete!")
        self.num_samples = num_samples
        self.seq_len = seq_len
        self.input_ids = input_ids
        self.input_masks = input_masks
        self.segment_ids = segment_ids
        self.lm_label_ids = lm_label_ids
        self.is_nexts = is_nexts

    def __len__(self):
        """Return the number of examples recorded in the metrics file."""
        return self.num_samples

    def __getitem__(self, item):
        """Return example ``item`` as five int64 tensors.

        Order: input ids, input mask, segment ids, LM label ids, is_next.
        """
        return (torch.tensor(self.input_ids[item].astype(np.int64)),
                torch.tensor(self.input_masks[item].astype(np.int64)),
                torch.tensor(self.segment_ids[item].astype(np.int64)),
                torch.tensor(self.lm_label_ids[item].astype(np.int64)),
                torch.tensor(self.is_nexts[item].astype(np.int64)))


def main():
    """Fine-tune BertForPreTraining on pregenerated data and save the result.

    Steps: parse the command line, work out how many pregenerated epochs
    exist under ``--pregenerated_data``, select the device (optionally
    initialising the NCCL process group), seed the RNGs, build tokenizer,
    model, AdamW optimizer and warmup schedule, optionally enable Apex AMP,
    train for ``--epochs`` epochs, then save model and tokenizer to
    ``--output_dir`` (rank 0 only when distributed).

    Raises:
        SystemExit: If no pregenerated data for epoch 0 is found.
        ValueError: If ``--gradient_accumulation_steps`` is below 1 or larger
            than ``--train_batch_size``.
        ImportError: If Apex is required (distributed or fp16) but missing.
    """
    parser = ArgumentParser()
    parser.add_argument('--pregenerated_data', type=Path, required=True)
    parser.add_argument('--output_dir', type=Path, required=True)
    parser.add_argument("--bert_model", type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument("--reduce_memory", action="store_true",
                        help="Store training data as on-disc memmaps to massively reduce memory usage")

    parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    # NOTE(review): --loss_scale is parsed but not forwarded to amp.initialize
    # below, so it currently has no effect.
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    args = parser.parse_args()

    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"

    # Count the pregenerated epochs that actually exist on disk; training
    # wraps around them if fewer than --epochs were generated.
    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                # SystemExit is raised directly: the bare exit() helper is
                # injected by the site module and is not always available.
                raise SystemExit("No training data was found!")
            print(f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).")
            print("This script will loop over the available data, but training diversity may be negatively impacted.")
            num_data_epochs = i
            break
    else:
        num_data_epochs = args.epochs

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logging.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    # --train_batch_size is the total batch per optimizer update; each
    # forward/backward pass handles one accumulation slice of it.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
    if args.train_batch_size < 1:
        # Fail here with a clear message instead of a ZeroDivisionError below.
        raise ValueError(
            "Invalid train_batch_size: it must be >= gradient_accumulation_steps ({}) so that the "
            "per-step batch size is at least 1".format(args.gradient_accumulation_steps))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(f"Output directory ({args.output_dir}) already exists and is not empty!")
    args.output_dir.mkdir(parents=True, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(
        total_train_examples / args.train_batch_size / args.gradient_accumulation_steps)
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model. With Apex AMP there is no manual model.half() call.
    model = BertForPreTraining.from_pretrained(args.bert_model)
    model.to(device)

    # Prepare optimizer: biases and LayerNorm parameters get no weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
                                     t_total=num_train_optimization_steps)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        # opt_level "O1" is the mixed-precision mode used here; it could be
        # made configurable.
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    # Parallel wrappers are applied only after amp.initialize: Apex rejects a
    # model that is already wrapped in DataParallel / DistributedDataParallel.
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", args.train_batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)
    model.train()
    for epoch in range(args.epochs):
        epoch_dataset = PregeneratedDataset(epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer,
                                            num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory)
        if args.local_rank == -1:
            train_sampler = RandomSampler(epoch_dataset)
        else:
            train_sampler = DistributedSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                outputs = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next)
                loss = outputs[0]
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    # Apex AMP scales the loss before the backward pass.
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                pbar.update(1)
                # Undo the accumulation scaling so the displayed loss is per batch.
                mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    optimizer.zero_grad()
                    global_step += 1

    # Save a trained model
    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        logging.info("** ** * Saving fine-tuned model ** ** * ")
        # Take care of distributed/parallel training: unwrap the inner module.
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
343
344
345
346


# Run the training entry point only when executed as a script, not on import.
if __name__ == "__main__":
    main()