"Plugson/src/Lib/xz-embedded/linux/lib/decompress_unxz.c" did not exist on "05a1b863a66bf72b26e5d87570c4e0e61b9736cd"
train_ser.py 9.28 KB
Newer Older
littletomatodonkey's avatar
littletomatodonkey committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import sys

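# Make the sibling modules (xfun, vqa_utils, eval_ser, losses) and the
# PaddleOCR project root importable before the local imports below.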
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__)
sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))

import random
import time
import copy
import logging

import argparse
import paddle
import numpy as np
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
from paddlenlp.transformers import LayoutXLMModel, LayoutXLMTokenizer, LayoutXLMForTokenClassification
from paddlenlp.transformers import LayoutLMModel, LayoutLMTokenizer, LayoutLMForTokenClassification

from xfun import XFUNDataset
from vqa_utils import parse_args, get_bio_label_maps, print_arguments, set_seed
from eval_ser import evaluate
from losses import SERLoss
from ppocr.utils.logging import get_logger

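# Supported SER backbones, selected via args.ser_model_type:
# name -> (tokenizer class, base model class, token-classification class)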
MODELS = {
    'LayoutXLM':
    (LayoutXLMTokenizer, LayoutXLMModel, LayoutXLMForTokenClassification),
    'LayoutLM':
    (LayoutLMTokenizer, LayoutLMModel, LayoutLMForTokenClassification)
}


def train(args):
    os.makedirs(args.output_dir, exist_ok=True)
    rank = paddle.distributed.get_rank()
    distributed = paddle.distributed.get_world_size() > 1

    logger = get_logger(log_file=os.path.join(args.output_dir, "train.log"))
    print_arguments(args, logger)

    label2id_map, id2label_map = get_bio_label_maps(args.label_map_path)
    loss_class = SERLoss(len(label2id_map))

    pad_token_label_id = loss_class.ignore_index

    # dist mode
    if distributed:
        paddle.distributed.init_parallel_env()

    tokenizer_class, base_model_class, model_class = MODELS[args.ser_model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    if not args.resume:
        base_model = base_model_class.from_pretrained(args.model_name_or_path)
        model = model_class(
            base_model, num_classes=len(label2id_map), dropout=None)
        logger.info('train from scratch')
    else:
        logger.info('resume from {}'.format(args.model_name_or_path))
        model = model_class.from_pretrained(args.model_name_or_path)

    # dist mode
    if distributed:
        model = paddle.DataParallel(model)

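    # XFUN-format datasets for SER; contains_re=False since relation extraction
    # is not handled in this script.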
    train_dataset = XFUNDataset(
        tokenizer,
        data_dir=args.train_data_dir,
        label_path=args.train_label_path,
        label2id_map=label2id_map,
        img_size=(224, 224),
        pad_token_label_id=pad_token_label_id,
        contains_re=False,
        add_special_ids=False,
        return_attention_mask=True,
        load_mode='all')
    eval_dataset = XFUNDataset(
        tokenizer,
        data_dir=args.eval_data_dir,
        label_path=args.eval_label_path,
        label2id_map=label2id_map,
        img_size=(224, 224),
        pad_token_label_id=pad_token_label_id,
        contains_re=False,
        add_special_ids=False,
        return_attention_mask=True,
        load_mode='all')

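    # DistributedBatchSampler shards the training data across ranks for multi-GPU runs.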
    train_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.per_gpu_train_batch_size, shuffle=True)

    train_dataloader = paddle.io.DataLoader(
        train_dataset,
        batch_sampler=train_sampler,
        num_workers=args.num_workers,
        use_shared_memory=True,
        collate_fn=None, )

    eval_dataloader = paddle.io.DataLoader(
        eval_dataset,
        batch_size=args.per_gpu_eval_batch_size,
        num_workers=args.num_workers,
        use_shared_memory=True,
        collate_fn=None, )

    t_total = len(train_dataloader) * args.num_train_epochs

    # Build a linear-decay LR schedule (PolynomialDecay with power=1),
    # optionally wrapped in a linear warmup phase.
    lr_scheduler = paddle.optimizer.lr.PolynomialDecay(
        learning_rate=args.learning_rate,
        decay_steps=t_total,
        end_lr=0.0,
        power=1.0)
    if args.warmup_steps > 0:
        lr_scheduler = paddle.optimizer.lr.LinearWarmup(
            lr_scheduler,
            args.warmup_steps,
            start_lr=0,
            end_lr=args.learning_rate, )

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        epsilon=args.adam_epsilon,
        weight_decay=args.weight_decay)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed) = %d",
        args.per_gpu_train_batch_size * paddle.distributed.get_world_size(), )
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss = 0.0
    set_seed(args.seed)
    best_metrics = None

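    # Per-logging-interval counters for data-loading cost, batch cost and throughput.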
    train_reader_cost = 0.0
    train_run_cost = 0.0
    total_samples = 0
    reader_start = time.time()

    print_step = 1
    model.train()
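    # Training loop: forward pass, SER loss, backward, optimizer/LR step,
    # periodic logging and evaluation.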
    for epoch_id in range(args.num_train_epochs):
        for step, batch in enumerate(train_dataloader):
            train_reader_cost += time.time() - reader_start

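            # LayoutLM has no visual branch, so drop the image tensor; pop the
            # labels so the remaining batch can be passed to the model as kwargs.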
            if args.ser_model_type == 'LayoutLM':
                if 'image' in batch:
                    batch.pop('image')
            labels = batch.pop('labels')

            train_start = time.time()
            outputs = model(**batch)
            train_run_cost += time.time() - train_start
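            # For LayoutXLM the forward pass returns a tuple; its first element
            # holds the token-classification logits used by the loss.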
            if args.ser_model_type == 'LayoutXLM':
                outputs = outputs[0]
            loss = loss_class(labels, outputs, batch['attention_mask'])

            # model outputs are always tuples in ppnlp (see the docs)
            loss = loss.mean()
            loss.backward()
            tr_loss += loss.item()
            optimizer.step()
            lr_scheduler.step()  # Update learning rate schedule
            optimizer.clear_grad()
            global_step += 1
            total_samples += batch['input_ids'].shape[0]

            if rank == 0 and step % print_step == 0:
                logger.info(
                    "epoch: [{}/{}], iter: [{}/{}], global_step:{}, "
                    "train loss: {:.6f}, lr: {:.6f}, "
                    "avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, "
                    "avg_samples: {:.5f}, ips: {:.5f} images/sec".format(
                        epoch_id, args.num_train_epochs, step,
                        len(train_dataloader), global_step,
                        loss.numpy()[0], lr_scheduler.get_lr(),
                        train_reader_cost / print_step,
                        (train_reader_cost + train_run_cost) / print_step,
                        total_samples / print_step,
                        total_samples / (train_reader_cost + train_run_cost)))

                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0

            if rank == 0 and args.eval_steps > 0 and global_step % args.eval_steps == 0 and args.evaluate_during_training:
                # Log metrics
                # Only evaluate when single GPU otherwise metrics may not average well
                results, _ = evaluate(args, model, tokenizer, loss_class,
                                      eval_dataloader, label2id_map,
                                      id2label_map, pad_token_label_id, logger)

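                # Keep the checkpoint with the best eval F1 seen so far.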
                if best_metrics is None or results["f1"] >= best_metrics["f1"]:
                    best_metrics = copy.deepcopy(results)
                    output_dir = os.path.join(args.output_dir, "best_model")
                    os.makedirs(output_dir, exist_ok=True)
                    if distributed:
                        model._layers.save_pretrained(output_dir)
                    else:
                        model.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    paddle.save(args,
                                os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to {}".format(
                        output_dir))

                logger.info("[epoch {}/{}][iter: {}/{}] results: {}".format(
                    epoch_id, args.num_train_epochs, step,
                    len(train_dataloader), results))
                if best_metrics is not None:
                    logger.info("best metrics: {}".format(best_metrics))
            reader_start = time.time()
        if rank == 0:
            # Save model checkpoint
            output_dir = os.path.join(args.output_dir, "latest_model")
            os.makedirs(output_dir, exist_ok=True)
            if distributed:
                model._layers.save_pretrained(output_dir)
            else:
                model.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)
            paddle.save(args, os.path.join(output_dir, "training_args.bin"))
            logger.info("Saving model checkpoint to {}".format(output_dir))
    return global_step, tr_loss / global_step


if __name__ == "__main__":
    args = parse_args()
    train(args)