# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from dataclasses import dataclass, field import paddle from paddle.metric import Accuracy from paddle.static import InputSpec from sklearn.metrics import f1_score from utils import UTCLoss, read_local_dataset from paddlenlp.datasets import load_dataset from paddlenlp.prompt import ( PromptModelForSequenceClassification, PromptTrainer, PromptTuningArguments, UTCTemplate, ) from paddlenlp.trainer import PdArgumentParser from paddlenlp.transformers import UTC, AutoTokenizer, export_model @dataclass class DataArguments: dataset_path: str = field( default="./data", metadata={"help": "Local dataset directory including train.txt, dev.txt and label.txt (optional)."}, ) train_file: str = field(default="train.txt", metadata={"help": "Train dataset file name."}) dev_file: str = field(default="dev.txt", metadata={"help": "Dev dataset file name."}) threshold: float = field(default=0.5, metadata={"help": "The threshold to produce predictions."}) single_label: str = field(default=False, metadata={"help": "Predict exactly one label per sample."}) @dataclass class ModelArguments: model_name_or_path: str = field( default="utc-base", metadata={ "help": "The build-in pretrained UTC model name or path to its checkpoints, such as " "`utc-xbase`, `utc-base`, `utc-medium`, `utc-mini`, `utc-micro`, `utc-nano` and `utc-pico`." }, ) export_type: str = field(default="paddle", metadata={"help": "The type to export. Support `paddle` and `onnx`."}) export_model_dir: str = field(default="checkpoints/model_best", metadata={"help": "The export model path."}) def main(): # Parse the arguments. parser = PdArgumentParser((ModelArguments, DataArguments, PromptTuningArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() training_args.print_config(model_args, "Model") training_args.print_config(data_args, "Data") paddle.set_device(training_args.device) # Load the pretrained language model. tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) model = UTC.from_pretrained(model_args.model_name_or_path) # Define template for preprocess and verbalizer for postprocess. template = UTCTemplate(tokenizer, training_args.max_seq_length) # Load and preprocess dataset. train_ds = load_dataset( read_local_dataset, data_path=data_args.dataset_path, data_file=data_args.train_file, lazy=False, ) dev_ds = load_dataset( read_local_dataset, data_path=data_args.dataset_path, data_file=data_args.dev_file, lazy=False, ) # Define the criterion. criterion = UTCLoss() # Initialize the prompt model. prompt_model = PromptModelForSequenceClassification( model, template, None, freeze_plm=training_args.freeze_plm, freeze_dropout=training_args.freeze_dropout ) # Define the metric function. def compute_metrics_single_label(eval_preds): labels = paddle.to_tensor(eval_preds.label_ids, dtype="int64") preds = paddle.to_tensor(eval_preds.predictions) preds = paddle.nn.functional.softmax(preds, axis=-1) labels = paddle.argmax(labels, axis=-1) metric = Accuracy() correct = metric.compute(preds, labels) metric.update(correct) acc = metric.accumulate() return {"accuracy": acc} def compute_metrics(eval_preds): labels = paddle.to_tensor(eval_preds.label_ids, dtype="int64") preds = paddle.to_tensor(eval_preds.predictions) preds = paddle.nn.functional.sigmoid(preds) preds = preds[labels != -100].numpy() labels = labels[labels != -100].numpy() preds = preds > data_args.threshold micro_f1 = f1_score(y_pred=preds, y_true=labels, average="micro") macro_f1 = f1_score(y_pred=preds, y_true=labels, average="macro") return {"micro_f1": micro_f1, "macro_f1": macro_f1} trainer = PromptTrainer( model=prompt_model, tokenizer=tokenizer, args=training_args, criterion=criterion, train_dataset=train_ds, eval_dataset=dev_ds, callbacks=None, compute_metrics=compute_metrics_single_label if data_args.single_label else compute_metrics, ) # Training. if training_args.do_train: train_results = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) metrics = train_results.metrics trainer.save_model() trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Export. if training_args.do_export: input_spec = [ InputSpec(shape=[None, None], dtype="int64", name="input_ids"), InputSpec(shape=[None, None], dtype="int64", name="token_type_ids"), InputSpec(shape=[None, None], dtype="int64", name="position_ids"), InputSpec(shape=[None, None, None, None], dtype="float32", name="attention_mask"), InputSpec(shape=[None, None], dtype="int64", name="omask_positions"), InputSpec(shape=[None], dtype="int64", name="cls_positions"), ] export_model(trainer.pretrained_model, input_spec, model_args.export_model_dir, model_args.export_type) if __name__ == "__main__": main()