Commit 3a094e93 authored by Rayyyyy's avatar Rayyyyy
Browse files

Update

parent b0f4f53a
...@@ -68,21 +68,18 @@ export HF_ENDPOINT=https://hf-mirror.com ...@@ -68,21 +68,18 @@ export HF_ENDPOINT=https://hf-mirror.com
``` ```
## 数据集 ## 数据集
使用来自多个数据集的結合来微调模型,句子对的总数超过10亿个句子。因数据较多,这里仅用[stsbenchmark](https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/stsbenchmark.tsv.gz)[Simple Wikipedia Version 1.0](https://cs.pomona.edu/~dkauchak/simplification/)数据集进行展示,数据集已在`datasets`中提供,详细数据请参考[all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)模型中的Model card。 **训练数据**: [sentence-transformers/stsb](https://huggingface.co/datasets/sentence-transformers/stsb),训练代码自动下载。
**推理数据**: 需要转换成txt格式,参考[gen_simple_wikipedia_v1.py](./gen_simple_wikipedia_v1.py)文件,生成`simple_wiki_pair.txt`
bert-base-uncased
数据集的目录结构如下: 数据集的目录结构如下:
``` ```
├── datasets ├── dataset
│ ├──stsbenchmark.tsv.gz
│ ├──simple_wikipedia_v1 │ ├──simple_wikipedia_v1
│ ├──simple_wiki_pair.txt # 生成的 │ ├──simple_wiki_pair.txt # 生成的
│ ├──wiki.simple │ ├──wiki.simple
│ └──wiki.unsimplified │ └──wiki.unsimplified
``` ```
推理数据需要转换成txt格式,参考[gen_simple_wikipedia_v1.py](./gen_simple_wikipedia_v1.py)文件,生成`simple_wiki_pair.txt`
## 训练 ## 训练
- **训练**默认模型[bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased) - **训练**默认模型[bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased)
- **微调**默认模型[all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) - **微调**默认模型[all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)
......
...@@ -6,3 +6,4 @@ scipy ...@@ -6,3 +6,4 @@ scipy
huggingface-hub>=0.15.1 huggingface-hub>=0.15.1
Pillow Pillow
datasets datasets
accelerate>=0.20.3
\ No newline at end of file
"""
This example trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) for the STSbenchmark from scratch. It generates sentence embeddings
that can be compared using cosine-similarity to measure the similarity.
"""
import math
import sys
import os
import gzip
import csv
import logging import logging
import argparse import argparse
from datetime import datetime from datetime import datetime
from torch.utils.data import DataLoader from datasets import load_dataset
from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models, util from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import STSBenchmarkDataReader, InputExample from sentence_transformers.similarity_functions import SimilarityFunction
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments
#### Just some code to print debug information to stdout # Set the log level to INFO to get more information
logging.basicConfig(format='%(asctime)s - %(message)s', logging.basicConfig(format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO)
datefmt='%Y-%m-%d %H:%M:%S',
level=logging.INFO,
handlers=[LoggingHandler()])
#### params # params
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--data_path', type=str, default='datasets/stsbenchmark.tsv.gz', help='Input txt path')
parser.add_argument('--train_batch_size', type=int, default=16) parser.add_argument('--train_batch_size', type=int, default=16)
parser.add_argument('--num_epochs', type=int, default=10) parser.add_argument('--num_epochs', type=int, default=10)
parser.add_argument('--model_name_or_path', type=str, default="bert-base-uncased") parser.add_argument('--model_name_or_path', type=str, default="bert-base-uncased")
parser.add_argument('--save_root_path', type=str, default="output", help='Model output folder') parser.add_argument('--save_root_path', type=str, default="output", help='Model output folder')
parser.add_argument('--lr', default=2e-05) parser.add_argument('--lr', default=2e-05)
# Checkpointing / evaluation cadence for the trainer, measured in optimizer steps.
parser.add_argument('--eval_steps', type=int, default=100)  # run the dev evaluator every N steps
parser.add_argument('--save_steps', type=int, default=100)  # write a checkpoint every N steps
parser.add_argument('--save_total_limit', type=int, default=2)  # keep at most N checkpoints on disk
parser.add_argument('--logging_steps', type=int, default=10)  # emit a log line every N steps
args = parser.parse_args() args = parser.parse_args()
# Check if dataset exsist. If not, download and extract it # You can specify any Hugging Face pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
sts_dataset_path = args.data_path
if not os.path.exists(sts_dataset_path):
util.http_get('https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/stsbenchmark.tsv.gz', sts_dataset_path)
#You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
model_name = args.model_name_or_path model_name = args.model_name_or_path
# Read the dataset
train_batch_size = args.train_batch_size train_batch_size = args.train_batch_size
num_epochs = args.num_epochs num_epochs = args.num_epochs
model_save_path = args.save_root_path + "/training_stsbenchmark_" + model_name.replace("/", "-") + '-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)
# Apply mean pooling to get one fixed sized sentence vector output_dir = (
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), args.save_root_path + "/training_stsbenchmark_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
pooling_mode_mean_tokens=True, )
pooling_mode_cls_token=False,
pooling_mode_max_tokens=False)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) # 1. Here we define our SentenceTransformer model. If not already a Sentence Transformer model, it will automatically
# create one with "mean" pooling.
model = SentenceTransformer(model_name)
# Convert the dataset to a DataLoader ready for training logging.info("Loading STSbenchmark train dataset")
logging.info("Read STSbenchmark train dataset") # 2. Load the STSB dataset: https://huggingface.co/datasets/sentence-transformers/stsb
# Load the three official STSB splits from the Hugging Face Hub.
# Columns used below: sentence1, sentence2, score (similarity in [0, 1]).
train_dataset = load_dataset("sentence-transformers/stsb", split="train")
eval_dataset = load_dataset("sentence-transformers/stsb", split="validation")
test_dataset = load_dataset("sentence-transformers/stsb", split="test")
logging.info(train_dataset)
train_samples = [] # 3. Define our training loss
dev_samples = [] # CosineSimilarityLoss (https://sbert.net/docs/package_reference/sentence_transformer/losses.html#cosinesimilarityloss) needs two text columns and one
test_samples = [] # similarity score column (between 0 and 1)
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
for row in reader:
score = float(row['score']) / 5.0 # Normalize score to range 0 ... 1
inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score)
if row['split'] == 'dev':
dev_samples.append(inp_example)
elif row['split'] == 'test':
test_samples.append(inp_example)
else:
train_samples.append(inp_example)
train_dataset = SentencesDataset(train_samples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model) train_loss = losses.CosineSimilarityLoss(model=model)
# train_loss = losses.CoSENTLoss(model=model)
# 4. Define an evaluator for use during training. This is useful to keep track of alongside the evaluation loss.
# Compares the cosine similarity of each embedding pair against the gold score on the validation split.
dev_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=eval_dataset["sentence1"],
    sentences2=eval_dataset["sentence2"],
    scores=eval_dataset["score"],
    main_similarity=SimilarityFunction.COSINE,
    name="sts-dev",
)
# 5. Define the training arguments
# NOTE(review): this rebinds `args`, shadowing the argparse namespace created above.
# It works because the right-hand side (args.eval_steps etc.) is evaluated before the
# rebinding, but any later access to the CLI options would silently hit the wrong
# object — consider renaming this variable to `training_args`.
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir=output_dir,
    # Optional training parameters:
    num_train_epochs=num_epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=train_batch_size,
    warmup_ratio=0.1,
    fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=False,  # Set to True if you have a GPU that supports BF16
    # Optional tracking/debugging parameters:
    # NOTE(review): `evaluation_strategy` was renamed to `eval_strategy` in newer
    # transformers releases — confirm against the version pinned in requirements.
    evaluation_strategy="steps",
    eval_steps=args.eval_steps,
    save_strategy="steps",
    save_steps=args.save_steps,
    save_total_limit=args.save_total_limit,
    logging_steps=args.logging_steps,
    run_name="sts",  # Will be used in W&B if `wandb` is installed
)
# 6. Create the trainer & start training
# The trainer combines the model, loss, datasets and the dev evaluator; batching,
# optimization, periodic evaluation and checkpointing follow the arguments above.
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=train_loss,
    evaluator=dev_evaluator,
)
trainer.train()
# 7. Evaluate the model performance on the STS Benchmark test dataset
# Same metric as the dev evaluator, but on the held-out test split; results are
# written into the run's output directory.
test_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=test_dataset["sentence1"],
    sentences2=test_dataset["sentence2"],
    scores=test_dataset["score"],
    main_similarity=SimilarityFunction.COSINE,
    name="sts-test",
)
test_evaluator(model, output_path=output_dir)

# 8. Save the trained & evaluated model locally
final_output_dir = f"{output_dir}/final"
model.save(final_output_dir)
logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')
# Configure the training. We skip evaluation in this example
warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
evaluator=evaluator,
epochs=num_epochs,
evaluation_steps=1000,
warmup_steps=warmup_steps,
output_path=model_save_path)
##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
test_evaluator(model, output_path=model_save_path)
import os """
import math This example loads the pre-trained SentenceTransformer model 'nli-distilroberta-base-v2' from Hugging Face.
import gzip It then fine-tunes this model for some epochs on the STS benchmark dataset.
import csv
Note: In this example, you must specify a SentenceTransformer model.
If you want to fine-tune a huggingface/transformers model like bert-base-uncased, see training_nli.py and training_stsbenchmark.py
"""
import logging import logging
import argparse import argparse
from datetime import datetime from datetime import datetime
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, util, InputExample from datasets import load_dataset
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.similarity_functions import SimilarityFunction
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments
#### Just some code to print debug information to stdout # Set the log level to INFO to get more information
logging.basicConfig( logging.basicConfig(format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO)
format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--data_path', type=str, default='datasets/stsbenchmark.tsv.gz', help='Input txt path')
parser.add_argument('--train_batch_size', type=int, default=16) parser.add_argument('--train_batch_size', type=int, default=16)
parser.add_argument('--num_epochs', type=int, default=10) parser.add_argument('--num_epochs', type=int, default=10)
parser.add_argument('--model_name_or_path', type=str, default="all-MiniLM-L6-v2") parser.add_argument('--model_name_or_path', type=str, default="all-MiniLM-L6-v2")
parser.add_argument('--save_root_path', type=str, default="output", help='Model output folder') parser.add_argument('--save_root_path', type=str, default="output", help='Model output folder')
parser.add_argument('--lr', default=2e-05) parser.add_argument('--lr', default=2e-05)
# Checkpointing / evaluation cadence for the trainer, measured in optimizer steps.
parser.add_argument('--eval_steps', type=int, default=100)  # run the dev evaluator every N steps
parser.add_argument('--save_steps', type=int, default=100)  # write a checkpoint every N steps
parser.add_argument('--save_total_limit', type=int, default=2)  # keep at most N checkpoints on disk
parser.add_argument('--logging_steps', type=int, default=10)  # emit a log line every N steps
args = parser.parse_args() args = parser.parse_args()
# You can specify any Sentence Transformer model here, for example all-mpnet-base-v2, all-MiniLM-L6-v2, mixedbread-ai/mxbai-embed-large-v1
model_name = args.model_name_or_path
train_batch_size = args.train_batch_size
num_epochs = args.num_epochs
# Timestamped output directory for checkpoints and the final model.
# Fix: insert the missing "/" separator — previously the path segment was
# concatenated directly onto save_root_path (e.g. "outputtraining_stsbenchmark_..."),
# inconsistent with the companion training script.
output_dir = (
    args.save_root_path + "/training_stsbenchmark_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
if __name__ == "__main__": # 1. Here we define our SentenceTransformer model.
model = SentenceTransformer(model_name)
sts_dataset_path = args.data_path
# 2. Load the STSB dataset: https://huggingface.co/datasets/sentence-transformers/stsb
# Check if dataset exists. If not, download and extract it train_dataset = load_dataset("sentence-transformers/stsb", split="train")
if not os.path.exists(sts_dataset_path): eval_dataset = load_dataset("sentence-transformers/stsb", split="validation")
util.http_get('https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/stsbenchmark.tsv.gz', sts_dataset_path) test_dataset = load_dataset("sentence-transformers/stsb", split="test")
logging.info(train_dataset)
model_name = args.model_name_or_path
train_batch_size = args.train_batch_size # 3. Define our training loss
num_epochs = args.num_epochs # CosineSimilarityLoss (https://sbert.net/docs/package_reference/sentence_transformer/losses.html#cosinesimilarityloss) needs two text columns and one
model_save_path = args.save_root_path + "/training_stsbenchmark_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") # similarity score column (between 0 and 1)
train_loss = losses.CosineSimilarityLoss(model=model)
# Load a pre-trained sentence transformer model # train_loss = losses.CoSENTLoss(model=model)
model = SentenceTransformer(model_name, device='cuda')
# 4. Define an evaluator for use during training. This is useful to keep track of alongside the evaluation loss.
# Convert the dataset to a DataLoader ready for training dev_evaluator = EmbeddingSimilarityEvaluator(
logging.info("Read STSbenchmark train dataset") sentences1=eval_dataset["sentence1"],
# Read the dataset sentences2=eval_dataset["sentence2"],
train_samples = [] scores=eval_dataset["score"],
dev_samples = [] main_similarity=SimilarityFunction.COSINE,
test_samples = [] name="sts-dev",
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn: )
reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
for row in reader:
score = float(row['score']) / 5.0 # Normalize score to range 0 ... 1
inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score)
if row['split'] == 'dev':
dev_samples.append(inp_example)
elif row['split'] == 'test':
test_samples.append(inp_example)
else:
train_samples.append(inp_example)
logging.info("Dealing data end.")
train_dataset = SentencesDataset(train_samples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)
# Development set: Measure correlation between cosine score and gold labels # 5. Define the training arguments
logging.info("Read STSbenchmark dev dataset") args = SentenceTransformerTrainingArguments(
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name="sts-dev") # Required parameter:
output_dir=output_dir,
# Optional training parameters:
num_train_epochs=num_epochs,
per_device_train_batch_size=train_batch_size,
per_device_eval_batch_size=train_batch_size,
warmup_ratio=0.1,
fp16=True, # Set to False if you get an error that your GPU can't run on FP16
bf16=False, # Set to True if you have a GPU that supports BF16
# Optional tracking/debugging parameters:
evaluation_strategy="steps",
eval_steps=args.eval_steps,
save_strategy="steps",
save_steps=args.save_steps,
save_total_limit=args.save_total_limit,
logging_steps=args.logging_steps,
run_name="sts", # Will be used in W&B if `wandb` is installed
)
# Configure the training. We skip evaluation in this example # 6. Create the trainer & start training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up trainer = SentenceTransformerTrainer(
logging.info("Warmup-steps: {}".format(warmup_steps)) model=model,
args=args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
loss=train_loss,
evaluator=dev_evaluator,
)
trainer.train()
print("Start training ...") # 7. Evaluate the model performance on the STS Benchmark test dataset
# Train the model
model.fit(
train_objectives=[(train_dataloader, train_loss)],
evaluator=evaluator,
epochs=num_epochs,
evaluation_steps=1000,
warmup_steps=warmup_steps,
optimizer_params={'lr': args.lr},
output_path=model_save_path,
)
logging.info("Finetune end")
############################################################################## test_evaluator = EmbeddingSimilarityEvaluator(
# sentences1=test_dataset["sentence1"],
# Load the stored model and evaluate its performance on STS benchmark dataset sentences2=test_dataset["sentence2"],
# scores=test_dataset["score"],
############################################################################## main_similarity=SimilarityFunction.COSINE,
name="sts-test",
)
test_evaluator(model, output_path=output_dir)
model = SentenceTransformer(model_save_path) # 8. Save the trained & evaluated model locally
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test') final_output_dir = f"{output_dir}/final"
test_evaluator(model, output_path=model_save_path) model.save(final_output_dir)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment