"lightx2v/models/vscode:/vscode.git/clone" did not exist on "3488b1878a4dc80e0e4f3c4d1801f39bee3ccc5d"
finetune.py 14.7 KB
Newer Older
1
2
3
4
5
import argparse
import glob
import logging
import os
import time
6
import warnings
7
from collections import defaultdict
8
9
from pathlib import Path
from typing import Dict, List, Tuple
10

11
12
import numpy as np
import pytorch_lightning as pl
13
14
15
import torch
from torch.utils.data import DataLoader

16
from lightning_base import BaseTransformer, add_generic_args, generic_train
17
from transformers import MBartTokenizer, get_linear_schedule_with_warmup
18
19
20


try:
21
    from .utils import (
22
        assert_all_frozen,
23
24
25
26
27
        use_task_specific_params,
        lmap,
        flatten_list,
        pickle_save,
        save_git_info,
28
        save_json,
29
30
31
32
        freeze_params,
        calculate_rouge,
        get_git_info,
        ROUGE_KEYS,
33
        calculate_bleu_score,
34
35
        Seq2SeqDataset,
        MBartDataset,
36
    )
37

38
    from .callbacks import Seq2SeqLoggingCallback, get_checkpoint_callback
39
except ImportError:
40
    from utils import (
41
42
43
        Seq2SeqDataset,
        MBartDataset,
        assert_all_frozen,
44
45
46
47
48
        use_task_specific_params,
        lmap,
        flatten_list,
        pickle_save,
        save_git_info,
49
        save_json,
50
51
52
53
        freeze_params,
        calculate_rouge,
        get_git_info,
        ROUGE_KEYS,
54
        calculate_bleu_score,
55
    )
56
    from callbacks import Seq2SeqLoggingCallback, get_checkpoint_callback
57
58
59
60

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


61
62
63
class SummarizationModule(BaseTransformer):
    """Seq2seq fine-tuning module that trains on a loss and evaluates by generating text.

    Class attributes:
        mode: mode string forwarded to ``BaseTransformer.__init__``.
        loss_names: names given, in order, to the tensors returned by ``_step``.
        metric_names: generation metrics averaged in ``validation_epoch_end``.
        val_metric: the metric reported as ``{prefix}_{val_metric}`` after each eval epoch.
    """

    mode = "summarization"
    loss_names = ["loss"]
    metric_names = ROUGE_KEYS
    val_metric = "rouge2"

    def __init__(self, hparams, **kwargs):
        """Set up the model, bookkeeping paths, dataset options and optional freezing.

        Args:
            hparams: hyper-parameter object; the attributes read here are output_dir, data_dir,
                max_source_length, n_train/n_val/n_test, max_target_length,
                val_max_target_length, test_max_target_length, freeze_embeds, freeze_encoder
                and num_workers.
            **kwargs: forwarded unchanged to ``BaseTransformer.__init__``.

        Raises:
            AssertionError: if the train target length exceeds the val or test target length.
        """
        super().__init__(hparams, num_labels=None, mode=self.mode, **kwargs)
        use_task_specific_params(self.model, "summarization")
        save_git_info(self.hparams.output_dir)
        self.metrics_save_path = Path(self.output_dir) / "metrics.json"
        self.hparams_save_path = Path(self.output_dir) / "hparams.pkl"
        pickle_save(self.hparams, self.hparams_save_path)
        self.step_count = 0
        self.metrics = defaultdict(list)

        # Keyword arguments shared by every dataset built in get_dataset().
        self.dataset_kwargs: dict = dict(
            data_dir=self.hparams.data_dir,
            max_source_length=self.hparams.max_source_length,
            prefix=self.model.config.prefix or "",
        )
        n_observations_per_split = {
            "train": self.hparams.n_train,
            "val": self.hparams.n_val,
            "test": self.hparams.n_test,
        }
        # A negative count means "no limit" and is stored as None.
        self.n_obs = {k: v if v >= 0 else None for k, v in n_observations_per_split.items()}

        self.target_lens = {
            "train": self.hparams.max_target_length,
            "val": self.hparams.val_max_target_length,
            "test": self.hparams.test_max_target_length,
        }
        assert self.target_lens["train"] <= self.target_lens["val"], f"target_lens: {self.target_lens}"
        assert self.target_lens["train"] <= self.target_lens["test"], f"target_lens: {self.target_lens}"

        if self.hparams.freeze_embeds:
            self.freeze_embeds()
        if self.hparams.freeze_encoder:
            freeze_params(self.model.get_encoder())
            assert_all_frozen(self.model.get_encoder())

        self.hparams.git_sha = get_git_info()["repo_sha"]
        self.num_workers = hparams.num_workers
        # None here; subclasses may set a token id that is passed to generate().
        self.decoder_start_token_id = None
        self.dataset_class = Seq2SeqDataset

    def freeze_embeds(self):
        """Freeze token embeddings and positional embeddings for bart, just token embeddings for t5."""
        try:
            # Layout with a nested ``model.model`` (bart, per the docstring above).
            freeze_params(self.model.model.shared)
            for d in [self.model.model.encoder, self.model.model.decoder]:
                freeze_params(d.embed_positions)
                freeze_params(d.embed_tokens)
        except AttributeError:
            # Fallback layout without the nested attribute (t5, per the docstring above).
            freeze_params(self.model.shared)
            for d in [self.model.encoder, self.model.decoder]:
                freeze_params(d.embed_tokens)

    def forward(self, input_ids, **kwargs):
        """Call the wrapped model with ``input_ids`` and any extra keyword arguments."""
        return self.model(input_ids, **kwargs)

    def ids_to_clean_text(self, generated_ids: List[int]):
        """Decode ids with the tokenizer (special tokens skipped) and strip each resulting string."""
        gen_text = self.tokenizer.batch_decode(
            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
        return lmap(str.strip, gen_text)

    def _step(self, batch: dict) -> Tuple:
        """Run one forward pass on ``batch`` and return a 1-tuple holding the loss.

        The batch must provide "input_ids", "attention_mask" and "decoder_input_ids".
        """
        pad_token_id = self.tokenizer.pad_token_id
        source_ids, source_mask, y = batch["input_ids"], batch["attention_mask"], batch["decoder_input_ids"]
        # Decoder inputs drop the last position; labels drop the first (shifted by one).
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone()
        # Padding positions are relabelled -100 (presumably the loss ignore index -- confirm).
        lm_labels[y[:, 1:] == pad_token_id] = -100
        outputs = self(source_ids, attention_mask=source_mask, decoder_input_ids=y_ids, labels=lm_labels)
        loss = outputs[0]
        return (loss,)

    def training_step(self, batch, batch_idx) -> Dict:
        """Return the training loss for ``batch`` along with a ``log`` dict keyed by ``loss_names``."""
        loss_tensors = self._step(batch)
        logs = {name: loss for name, loss in zip(self.loss_names, loss_tensors)}
        return {"loss": loss_tensors[0], "log": logs}

    def validation_step(self, batch, batch_idx) -> Dict:
        """Generate for ``batch`` and return its loss, metrics, predictions and targets."""
        return self._generative_step(batch)

    def validation_epoch_end(self, outputs, prefix="val") -> Dict:
        """Average the per-batch results, persist them, and return the epoch summary.

        Args:
            outputs: list of dicts as returned by ``_generative_step``.
            prefix: split name used to build the metric keys ("val" or "test").

        Returns:
            Dict with "log" (all averaged metrics), "preds" (flattened predictions),
            ``{prefix}_loss`` and ``{prefix}_{val_metric}``.
        """
        self.step_count += 1
        losses = {k: torch.stack([x[k] for x in outputs]).mean() for k in self.loss_names}
        loss = losses["loss"]
        rouges = {k: np.array([x[k] for x in outputs]).mean() for k in self.metric_names + ["gen_time", "summ_len"]}
        rouge_tensor: torch.FloatTensor = torch.tensor(rouges[self.val_metric]).type_as(loss)
        # Merge both dicts; after this, loss entries are plain Python numbers (via .item()).
        rouges.update({k: v.item() for k, v in losses.items()})
        losses.update(rouges)
        metrics = {f"{prefix}_avg_{k}": x for k, x in losses.items()}
        metrics["step_count"] = self.step_count
        self.save_metrics(metrics, prefix)  # writes to self.metrics_save_path
        preds = flatten_list([x["preds"] for x in outputs])
        return {"log": metrics, "preds": preds, f"{prefix}_loss": loss, f"{prefix}_{self.val_metric}": rouge_tensor}

    def save_metrics(self, latest_metrics, type_path) -> None:
        """Append ``latest_metrics`` under ``type_path`` and rewrite the metrics JSON file."""
        self.metrics[type_path].append(latest_metrics)
        save_json(self.metrics, self.metrics_save_path)

    def calc_generative_metrics(self, preds, target) -> Dict:
        """Score predictions against targets with ``calculate_rouge``."""
        return calculate_rouge(preds, target)

    def _generative_step(self, batch: dict) -> dict:
        """Generate text for ``batch`` and collect loss, timing, length and quality metrics.

        Returns:
            Dict with one entry per name in ``loss_names`` plus "gen_time" (seconds per source
            example), "summ_len" (mean length of the generated id sequences), "preds",
            "target" and whatever ``calc_generative_metrics`` returns.
        """
        pad_token_id = self.tokenizer.pad_token_id
        source_ids, source_mask, y = Seq2SeqDataset.trim_seq2seq_batch(batch, pad_token_id)
        t0 = time.time()
        generated_ids = self.model.generate(
            input_ids=source_ids,
            attention_mask=source_mask,
            use_cache=True,
            decoder_start_token_id=self.decoder_start_token_id,
        )
        # Wall-clock generation time divided by the number of source rows.
        gen_time = (time.time() - t0) / source_ids.shape[0]
        preds = self.ids_to_clean_text(generated_ids)
        target = self.ids_to_clean_text(y)
        loss_tensors = self._step(batch)
        base_metrics = {name: loss for name, loss in zip(self.loss_names, loss_tensors)}
        rouge: Dict = self.calc_generative_metrics(preds, target)
        summ_len = np.mean(lmap(len, generated_ids))
        base_metrics.update(gen_time=gen_time, summ_len=summ_len, preds=preds, target=target, **rouge)
        return base_metrics

    def test_step(self, batch, batch_idx):
        """Same as ``validation_step``: generate for ``batch`` and return its metrics."""
        return self._generative_step(batch)

    def test_epoch_end(self, outputs):
        """Aggregate test outputs exactly like validation, using the "test" prefix."""
        return self.validation_epoch_end(outputs, prefix="test")

    def get_dataset(self, type_path) -> Seq2SeqDataset:
        """Build the dataset for split ``type_path`` ("train", "val" or "test")."""
        n_obs = self.n_obs[type_path]
        max_target_length = self.target_lens[type_path]
        dataset = self.dataset_class(
            self.tokenizer,
            type_path=type_path,
            n_obs=n_obs,
            max_target_length=max_target_length,
            **self.dataset_kwargs,
        )
        return dataset

    def get_dataloader(self, type_path: str, batch_size: int, shuffle: bool = False) -> DataLoader:
        """Build a DataLoader for split ``type_path``.

        When ``hparams.sortish_sampler`` is set and the split is "train", the dataset's
        sortish sampler is used and ``shuffle`` is forced to False.

        Raises:
            AssertionError: if the sortish sampler is requested with more than one GPU.
        """
        dataset = self.get_dataset(type_path)
        sampler = None
        if self.hparams.sortish_sampler and type_path == "train":
            assert self.hparams.gpus <= 1  # TODO: assert earlier
            sampler = dataset.make_sortish_sampler(batch_size)
            shuffle = False

        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            collate_fn=dataset.collate_fn,
            shuffle=shuffle,
            num_workers=self.num_workers,
            sampler=sampler,
        )
        return dataloader

    def train_dataloader(self) -> DataLoader:
        """Return the training DataLoader and, as a side effect, create ``self.lr_scheduler``.

        The scheduler is a linear warmup/decay schedule over ``self.opt`` whose total step
        count is derived from dataset size, batch size, GPU count, gradient accumulation
        and ``max_epochs``.
        """
        dataloader = self.get_dataloader("train", batch_size=self.hparams.train_batch_size, shuffle=True)
        t_total = (
            (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.gpus)))
            // self.hparams.accumulate_grad_batches
            * float(self.hparams.max_epochs)
        )
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
        )
        # Fixed: the condition used to be `> 0`, so the warning fired exactly when it was false.
        # NOTE(review): with warmup_steps > 0 the schedule may legitimately start at 0 and
        # trigger this at setup time -- confirm that is acceptable.
        if max(scheduler.get_last_lr()) <= 0:
            warnings.warn("All learning rates are 0")
        self.lr_scheduler = scheduler
        return dataloader

    def val_dataloader(self) -> DataLoader:
        """Return the validation DataLoader (batch size ``hparams.eval_batch_size``)."""
        return self.get_dataloader("val", batch_size=self.hparams.eval_batch_size)

    def test_dataloader(self) -> DataLoader:
        """Return the test DataLoader (batch size ``hparams.eval_batch_size``)."""
        return self.get_dataloader("test", batch_size=self.hparams.eval_batch_size)

    @staticmethod
    def add_model_specific_args(parser, root_dir):
        """Register base, generic and seq2seq-specific command-line arguments on ``parser``.

        Args:
            parser: argument parser to extend in place.
            root_dir: forwarded to ``BaseTransformer.add_model_specific_args`` and
                ``add_generic_args``.

        Returns:
            The same ``parser``.
        """
        BaseTransformer.add_model_specific_args(parser, root_dir)
        add_generic_args(parser, root_dir)
        parser.add_argument(
            "--max_source_length",
            default=1024,
            type=int,
            help="The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded.",
        )
        # Help texts below were copy-pasted from --max_source_length; they describe targets.
        parser.add_argument(
            "--max_target_length",
            default=56,
            type=int,
            help="The maximum total target sequence length after tokenization for training. Sequences longer "
            "than this will be truncated, sequences shorter will be padded.",
        )
        parser.add_argument(
            "--val_max_target_length",
            default=142,  # these defaults are optimized for CNNDM. For xsum, see README.md.
            type=int,
            help="The maximum total target sequence length after tokenization for validation. Sequences longer "
            "than this will be truncated, sequences shorter will be padded.",
        )
        parser.add_argument(
            "--test_max_target_length",
            default=142,
            type=int,
            help="The maximum total target sequence length after tokenization for testing. Sequences longer "
            "than this will be truncated, sequences shorter will be padded.",
        )
        parser.add_argument(
            "--data_dir",
            type=str,
            required=True,
            help="The input data dir. Should contain train.source, train.target, val.source, val.target, test.source, test.target",
        )
        parser.add_argument("--freeze_encoder", action="store_true")
        parser.add_argument("--freeze_embeds", action="store_true")
        parser.add_argument("--sortish_sampler", action="store_true", default=False)
        parser.add_argument("--logger_name", type=str, choices=["default", "wandb", "wandb_shared"], default="default")
        parser.add_argument("--n_train", type=int, default=-1, required=False, help="# examples. -1 means use all.")
        parser.add_argument("--n_val", type=int, default=500, required=False, help="# examples. -1 means use all.")
        parser.add_argument("--n_test", type=int, default=-1, required=False, help="# examples. -1 means use all.")
        # Fixed: the help text used to be a copy of the --n_* help above.
        parser.add_argument(
            "--task",
            type=str,
            default="summarization",
            required=False,
            help="Task to run: 'summarization' (default); any other value selects translation.",
        )
        parser.add_argument("--src_lang", type=str, default="", required=False)
        parser.add_argument("--tgt_lang", type=str, default="", required=False)
        return parser


298
299
300
301
302
303
class TranslationModule(SummarizationModule):
    """Translation variant of SummarizationModule, scored with BLEU instead of ROUGE."""

    mode = "translation"
    loss_names = ["loss"]
    metric_names = ["bleu"]
    val_metric = "bleu"

    def __init__(self, hparams, **kwargs):
        """Extend the parent setup with language codes and MBart-specific handling.

        Reads ``hparams.src_lang`` and ``hparams.tgt_lang``; ``**kwargs`` go to the parent.
        """
        super().__init__(hparams, **kwargs)
        self.dataset_kwargs.update(src_lang=hparams.src_lang, tgt_lang=hparams.tgt_lang)

        # Read the config first, as before, so the attribute lookup happens for every tokenizer.
        config_has_no_start_token = self.model.config.decoder_start_token_id is None
        uses_mbart_tokenizer = isinstance(self.tokenizer, MBartTokenizer)
        if uses_mbart_tokenizer:
            if config_has_no_start_token:
                # Start decoding from the id the tokenizer maps the target language code to.
                self.decoder_start_token_id = self.tokenizer.lang_code_to_id[hparams.tgt_lang]
            self.dataset_class = MBartDataset

    def calc_generative_metrics(self, preds, target) -> dict:
        """Score predictions against targets with ``calculate_bleu_score``."""
        bleu_scores = calculate_bleu_score(preds, target)
        return bleu_scores


317
318
319
320
321
def main(args, model=None) -> SummarizationModule:
    """Train a seq2seq module and, if requested, evaluate it on the test split.

    Args:
        args: parsed options; the attributes read here are output_dir, do_train, task,
            data_dir, logger_name, fast_dev_run and do_predict. ``args`` is also passed on
            to the module constructor and to ``generic_train``.
        model: optional pre-built module. When None, a SummarizationModule is created if
            ``args.task == "summarization"``, otherwise a TranslationModule.

    Returns:
        The module that was trained (and possibly tested).

    Raises:
        ValueError: if training is requested into an output directory that already holds
            more than three entries, or if ``args.logger_name`` is not a supported value.
    """
    # parents=True so that a nested, not-yet-existing output path does not fail.
    Path(args.output_dir).mkdir(parents=True, exist_ok=True)
    if len(os.listdir(args.output_dir)) > 3 and args.do_train:
        raise ValueError(f"Output directory ({args.output_dir}) already exists and is not empty.")
    if model is None:
        if args.task == "summarization":
            model = SummarizationModule(args)
        else:
            model = TranslationModule(args)

    dataset = Path(args.data_dir).name
    if (
        args.logger_name == "default"
        or args.fast_dev_run
        or str(args.output_dir).startswith("/tmp")
        or str(args.output_dir).startswith("/var")
    ):
        logger = True  # don't pollute wandb logs unnecessarily
    elif args.logger_name == "wandb":
        from pytorch_lightning.loggers import WandbLogger

        logger = WandbLogger(name=model.output_dir.name, project=dataset)
    elif args.logger_name == "wandb_shared":
        from pytorch_lightning.loggers import WandbLogger

        logger = WandbLogger(name=model.output_dir.name, project=f"hf_{dataset}")
    else:
        # Previously `logger` stayed unbound here and failed later with UnboundLocalError.
        raise ValueError(f"Unsupported logger_name: {args.logger_name!r}")

    trainer: pl.Trainer = generic_train(
        model,
        args,
        logging_callback=Seq2SeqLoggingCallback(),
        checkpoint_callback=get_checkpoint_callback(args.output_dir, model.val_metric),
        logger=logger,
        # TODO: early stopping callback seems messed up
    )
    pickle_save(model.hparams, model.output_dir / "hparams.pkl")
    if not args.do_predict:
        return model

    model.hparams.test_checkpoint = ""
    # NOTE(review): recursive=True has no effect without "**" in the pattern, so only
    # checkpoints directly inside output_dir are found -- confirm this is intended.
    checkpoints = sorted(glob.glob(os.path.join(args.output_dir, "*.ckpt"), recursive=True))
    if checkpoints:
        # Use the checkpoint whose path sorts last.
        model.hparams.test_checkpoint = checkpoints[-1]
        trainer.resume_from_checkpoint = checkpoints[-1]
    trainer.logger.log_hyperparams(model.hparams)

    # test() without a model tests using the best checkpoint automatically
    trainer.test()
    return model
366
367
368
369


if __name__ == "__main__":
    # Assemble the CLI: pytorch-lightning Trainer flags first, then the module-specific flags.
    parser = pl.Trainer.add_argparse_args(argparse.ArgumentParser())
    parser = SummarizationModule.add_model_specific_args(parser, os.getcwd())
    args = parser.parse_args()

    main(args)