"src/targets/vscode:/vscode.git/clone" did not exist on "c3e62f5f3acbdc71be8b1a05d52ac26c04cb5cfd"
Commit 10f294ff authored by yuguo-Jack's avatar yuguo-Jack
Browse files

llama_paddle

parent 7c64e6ec
Pipeline #678 failed with stages
in 0 seconds
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial
import os
import paddle
from paddlenlp.data import Tuple, Pad
from paddlenlp.datasets import MapDataset
from paddlenlp.transformers import AutoModel, AutoTokenizer
from base_model import SemanticIndexBaseStatic
from data import convert_example, create_dataloader
if __name__ == "__main__":
device = "gpu"
max_seq_length = 64
output_emb_size = 256
batch_size = 1
params_path = "checkpoints/inbatch/model_40/model_state.pdparams"
id2corpus = {0: "国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据"}
model_name_or_path = "rocketqa-zh-base-query-encoder"
paddle.set_device(device)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
trans_func = partial(convert_example, tokenizer=tokenizer, max_seq_length=max_seq_length)
batchify_fn = lambda samples, fn=Tuple( # noqa: E731
Pad(axis=0, pad_val=tokenizer.pad_token_id), # text_input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # text_segment
): [data for data in fn(samples)]
pretrained_model = AutoModel.from_pretrained(model_name_or_path)
model = SemanticIndexBaseStatic(pretrained_model, output_emb_size=output_emb_size)
# Load pretrained semantic model
if params_path and os.path.isfile(params_path):
state_dict = paddle.load(params_path)
model.set_dict(state_dict)
print("Loaded parameters from %s" % params_path)
else:
        raise ValueError("Please set params_path to a valid model parameter file")
# convert_example function's input must be dict
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
corpus_ds = MapDataset(corpus_list)
corpus_data_loader = create_dataloader(
corpus_ds, mode="predict", batch_size=batch_size, batchify_fn=batchify_fn, trans_fn=trans_func
)
all_embeddings = []
model.eval()
with paddle.no_grad():
for batch_data in corpus_data_loader:
input_ids, token_type_ids = batch_data
text_embeddings = model.get_pooled_embedding(input_ids, token_type_ids)
all_embeddings.append(text_embeddings)
text_embedding = all_embeddings[0]
print(text_embedding.shape)
print(text_embedding.numpy())
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from functools import partial
import numpy as np
import paddle
from base_model import SemanticIndexBase
from data import convert_example, create_dataloader, read_text_pair
from paddlenlp.data import Pad, Tuple
from paddlenlp.datasets import load_dataset
from paddlenlp.transformers import AutoModel, AutoTokenizer
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--text_pair_file", type=str,
required=True, help="The full path of input file")
parser.add_argument("--params_path", type=str, required=True,
help="The path to model parameters to be loaded.")
parser.add_argument("--max_seq_length", default=64, type=int, help="The maximum total input sequence length after tokenization. "
"Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument('--model_name_or_path', default="rocketqa-zh-base-query-encoder", help="The pretrained model used for training")
parser.add_argument("--batch_size", default=32, type=int,
help="Batch size per GPU/CPU for training.")
parser.add_argument("--output_emb_size", default=None,
type=int, help="output_embedding_size")
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu",
help="Select which device to train model, defaults to gpu.")
parser.add_argument("--pad_to_max_seq_len", action="store_true",
help="Whether to pad to max seq length.")
args = parser.parse_args()
# yapf: enable
def predict(model, data_loader):
"""
Predicts the data labels.
Args:
model (obj:`SemanticIndexBase`): A model to extract text embedding or calculate similarity of text pair.
data_loader (obj:`List(Example)`): The processed data ids of text pair: [query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids]
Returns:
results(obj:`List`): cosine similarity of text pairs.
"""
cosine_sims = []
model.eval()
with paddle.no_grad():
for batch_data in data_loader:
query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids = batch_data
batch_cosine_sim = model.cosine_sim(
query_input_ids=query_input_ids,
title_input_ids=title_input_ids,
query_token_type_ids=query_token_type_ids,
title_token_type_ids=title_token_type_ids,
).numpy()
cosine_sims.append(batch_cosine_sim)
cosine_sims = np.concatenate(cosine_sims, axis=0)
return cosine_sims
if __name__ == "__main__":
paddle.set_device(args.device)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
trans_func = partial(
convert_example,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
pad_to_max_seq_len=args.pad_to_max_seq_len,
)
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # query_input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # query_segment
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # title_input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # title_segment
): [data for data in fn(samples)]
valid_ds = load_dataset(read_text_pair, data_path=args.text_pair_file, lazy=False)
valid_data_loader = create_dataloader(
valid_ds, mode="predict", batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func
)
pretrained_model = AutoModel.from_pretrained(args.model_name_or_path)
model = SemanticIndexBase(pretrained_model, output_emb_size=args.output_emb_size)
if args.params_path and os.path.isfile(args.params_path):
state_dict = paddle.load(args.params_path)
model.set_dict(state_dict)
print("Loaded parameters from %s" % args.params_path)
else:
        raise ValueError("Please set --params_path to a valid model parameter file")
    cosine_sims = predict(model, valid_data_loader)
    for idx, cosine in enumerate(cosine_sims):
        print("{}".format(cosine))
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# coding=UTF-8
import argparse
import os
from functools import partial
import paddle
from ann_util import build_index
from base_model import SemanticIndexBase
from data import convert_example, create_dataloader, gen_id2corpus, gen_text_file
from paddlenlp.data import Pad, Tuple
from paddlenlp.datasets import MapDataset
from paddlenlp.transformers import AutoModel, AutoTokenizer
from paddlenlp.utils.log import logger
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--corpus_file", type=str, required=True,
help="The full path of input file")
parser.add_argument("--similar_text_pair_file", type=str,
required=True, help="The full path of similar text pair file")
parser.add_argument("--recall_result_dir", type=str, default='recall_result',
help="The full path of recall result file to save")
parser.add_argument("--recall_result_file", type=str,
default='recall_result_file', help="The file name of recall result")
parser.add_argument("--params_path", type=str, required=True,
help="The path to model parameters to be loaded.")
parser.add_argument("--max_seq_length", default=64, type=int, help="The maximum total input sequence length after tokenization. "
"Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--batch_size", default=32, type=int,
help="Batch size per GPU/CPU for training.")
parser.add_argument("--output_emb_size", default=None,
type=int, help="output_embedding_size")
parser.add_argument("--recall_num", default=10, type=int,
help="Recall number for each query from Ann index.")
parser.add_argument('--model_name_or_path', default="rocketqa-zh-base-query-encoder", help="The pretrained model used for training")
parser.add_argument("--hnsw_m", default=100, type=int,
help="Recall number for each query from Ann index.")
parser.add_argument("--hnsw_ef", default=100, type=int,
help="Recall number for each query from Ann index.")
parser.add_argument("--hnsw_max_elements", default=1000000,
type=int, help="Recall number for each query from Ann index.")
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu",
help="Select which device to train model, defaults to gpu.")
args = parser.parse_args()
# yapf: enable
if __name__ == "__main__":
paddle.set_device(args.device)
rank = paddle.distributed.get_rank()
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
trans_func = partial(convert_example, tokenizer=tokenizer, max_seq_length=args.max_seq_length)
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # text_input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # text_segment
): [data for data in fn(samples)]
pretrained_model = AutoModel.from_pretrained(args.model_name_or_path)
model = SemanticIndexBase(pretrained_model, output_emb_size=args.output_emb_size)
model = paddle.DataParallel(model)
# Load pretrained semantic model
if args.params_path and os.path.isfile(args.params_path):
state_dict = paddle.load(args.params_path)
model.set_dict(state_dict)
logger.info("Loaded parameters from %s" % args.params_path)
else:
        raise ValueError("Please set --params_path to a valid model parameter file")
id2corpus = gen_id2corpus(args.corpus_file)
    # convert_example function's input must be dict
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
corpus_ds = MapDataset(corpus_list)
corpus_data_loader = create_dataloader(
corpus_ds, mode="predict", batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func
)
# Need better way to get inner model of DataParallel
inner_model = model._layers
final_index = build_index(args, corpus_data_loader, inner_model)
text_list, text2similar_text = gen_text_file(args.similar_text_pair_file)
query_ds = MapDataset(text_list)
query_data_loader = create_dataloader(
query_ds, mode="predict", batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func
)
query_embedding = inner_model.get_semantic_embedding(query_data_loader)
if not os.path.exists(args.recall_result_dir):
os.mkdir(args.recall_result_dir)
recall_result_file = os.path.join(args.recall_result_dir, args.recall_result_file)
with open(recall_result_file, "w", encoding="utf-8") as f:
for batch_index, batch_query_embedding in enumerate(query_embedding):
recalled_idx, cosine_sims = final_index.knn_query(batch_query_embedding.numpy(), args.recall_num)
batch_size = len(cosine_sims)
for row_index in range(batch_size):
text_index = args.batch_size * batch_index + row_index
for idx, doc_idx in enumerate(recalled_idx[row_index]):
f.write(
"{}\t{}\t{}\n".format(
text_list[text_index]["text"], id2corpus[doc_idx], 1.0 - cosine_sims[row_index][idx]
)
)
python -u evaluate.py \
--similar_text_pair "recall/dev.csv" \
--recall_result_file "./recall_result_dir/recall_result.txt" \
--recall_num 50
python export_model.py --params_path checkpoints/inbatch/model_40/model_state.pdparams \
--model_name_or_path rocketqa-zh-base-query-encoder \
--output_path=./output
python export_to_serving.py \
--dirname "output" \
--model_filename "inference.get_pooled_embedding.pdmodel" \
--params_filename "inference.get_pooled_embedding.pdiparams" \
--server_path "serving_server" \
--client_path "serving_client" \
--fetch_alias_names "output_embedding"
# gpu version
root_dir="checkpoints/inbatch"
python -u -m paddle.distributed.launch --gpus "0" \
predict.py \
--device gpu \
--params_path "${root_dir}/model_40/model_state.pdparams" \
--model_name_or_path rocketqa-zh-base-query-encoder \
--output_emb_size 256 \
--batch_size 128 \
--max_seq_length 64 \
--text_pair_file "recall/test.csv"
# cpu
# root_dir="checkpoints/inbatch"
# python predict.py \
# --device cpu \
# --params_path "${root_dir}/model_40/model_state.pdparams" \
# --output_emb_size 256 \
# --batch_size 128 \
# --max_seq_length 64 \
# --text_pair_file "recall/test.csv"
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# GPU version
root_dir="checkpoints/inbatch"
python -u -m paddle.distributed.launch --gpus "0" --log_dir "recall_log/" \
recall.py \
--device gpu \
--recall_result_dir "recall_result_dir" \
--recall_result_file "recall_result.txt" \
--params_path "${root_dir}/model_40/model_state.pdparams" \
--model_name_or_path rocketqa-zh-base-query-encoder \
--hnsw_m 100 \
--hnsw_ef 100 \
--batch_size 64 \
    --output_emb_size 256 \
--max_seq_length 64 \
--recall_num 50 \
--similar_text_pair "recall/dev.csv" \
--corpus_file "recall/corpus.csv"
# CPU version
# python recall.py \
# --device cpu \
# --recall_result_dir "recall_result_dir" \
# --recall_result_file "recall_result.txt" \
# --params_path "${root_dir}/model_40/model_state.pdparams" \
# --hnsw_m 100 \
# --hnsw_ef 100 \
# --batch_size 64 \
# --output_emb_size 256 \
# --max_seq_length 64 \
# --recall_num 50 \
# --similar_text_pair "recall/dev.csv" \
# --corpus_file "recall/corpus.csv"
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import random
import time
from functools import partial
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ann_util import build_index
from batch_negative.model import SemanticIndexBatchNeg, SemanticIndexCacheNeg
from data import (
convert_example,
create_dataloader,
gen_id2corpus,
gen_text_file,
read_text_pair,
)
from paddlenlp.data import Pad, Tuple
from paddlenlp.datasets import MapDataset, load_dataset
from paddlenlp.transformers import AutoModel, AutoTokenizer, LinearDecayWithWarmup
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--save_dir", default='./checkpoint', type=str, help="The output directory where the model checkpoints will be written.")
parser.add_argument("--max_seq_length", default=512, type=int, help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument('--model_name_or_path', default="rocketqa-zh-base-query-encoder", help="The pretrained model used for training")
parser.add_argument("--output_emb_size", default=256, type=int, help="output_embedding_size")
parser.add_argument("--learning_rate", default=5E-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
parser.add_argument("--epochs", default=10, type=int, help="Total number of training epochs to perform.")
parser.add_argument("--warmup_proportion", default=0.0, type=float, help="Linear warmup proportion over the training process.")
parser.add_argument("--init_from_ckpt", type=str, default=None, help="The path of checkpoint to be loaded.")
parser.add_argument("--seed", type=int, default=1000, help="random seed for initialization")
parser.add_argument('--device', choices=['cpu', 'gpu'], default="cpu", help="Select which device to train model, defaults to cpu.")
parser.add_argument('--save_steps', type=int, default=10000, help="Interval steps to save checkpoint")
parser.add_argument('--log_steps', type=int, default=10, help="Interval steps to print log")
parser.add_argument("--train_set_file", type=str, default='./recall/train.csv', help="The full path of train_set_file.")
parser.add_argument("--dev_set_file", type=str, default='./recall/dev.csv', help="The full path of dev_set_file.")
parser.add_argument("--margin", default=0.2, type=float, help="Margin between pos_sample and neg_samples")
parser.add_argument("--scale", default=30, type=int, help="Scale for pair-wise margin_rank_loss")
parser.add_argument("--corpus_file", type=str, default='./recall/corpus.csv', help="The full path of input file")
parser.add_argument("--similar_text_pair_file", type=str, default='./recall/dev.csv', help="The full path of similar text pair file")
parser.add_argument("--recall_result_dir", type=str, default='./recall_result_dir', help="The full path of recall result file to save")
parser.add_argument("--recall_result_file", type=str, default='recall_result_init.txt', help="The file name of recall result")
parser.add_argument("--recall_num", default=50, type=int, help="Recall number for each query from Ann index.")
parser.add_argument("--hnsw_m", default=100, type=int, help="Recall number for each query from Ann index.")
parser.add_argument("--hnsw_ef", default=100, type=int, help="Recall number for each query from Ann index.")
parser.add_argument("--hnsw_max_elements", default=1000000, type=int, help="Recall number for each query from Ann index.")
parser.add_argument("--evaluate_result", type=str, default='evaluate_result.txt', help="evaluate_result")
parser.add_argument('--evaluate', action='store_true', help='whether evaluate while training')
parser.add_argument("--max_grad_norm", type=float, default=5.0, help="max grad norm for global norm clip")
parser.add_argument("--use_amp", action="store_true", help="Whether to use AMP.")
parser.add_argument("--amp_loss_scale", default=32768, type=float, help="The value of scale_loss for fp16. This is only used for AMP training.")
parser.add_argument("--use_recompute", action='store_true', help="Using the recompute to scale up the batch size and save the memory.")
parser.add_argument("--use_gradient_cache", action='store_true', help="Using the gradient cache to scale up the batch size and save the memory.")
parser.add_argument("--chunk_numbers", type=int, default=50, help="The number of the chunks for model")
args = parser.parse_args()
# yapf: enable
def set_seed(seed):
"""sets random seed"""
random.seed(seed)
np.random.seed(seed)
paddle.seed(seed)
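# Each element of rs is a 0/1 relevance list for one query (with at most one
# positive per query), so the mean of the top-N sums below is Recall@N.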
def recall(rs, N=10):
recall_flags = [np.sum(r[0:N]) for r in rs]
return np.mean(recall_flags)
@paddle.no_grad()
def evaluate(model, corpus_data_loader, query_data_loader, recall_result_file, text_list, id2corpus):
# Load pretrained semantic model
inner_model = model._layers
final_index = build_index(args, corpus_data_loader, inner_model)
query_embedding = inner_model.get_semantic_embedding(query_data_loader)
with open(recall_result_file, "w", encoding="utf-8") as f:
for batch_index, batch_query_embedding in enumerate(query_embedding):
recalled_idx, cosine_sims = final_index.knn_query(batch_query_embedding.numpy(), args.recall_num)
batch_size = len(cosine_sims)
for row_index in range(batch_size):
text_index = args.batch_size * batch_index + row_index
for idx, doc_idx in enumerate(recalled_idx[row_index]):
f.write(
"{}\t{}\t{}\n".format(
text_list[text_index]["text"], id2corpus[doc_idx], 1.0 - cosine_sims[row_index][idx]
)
)
text2similar = {}
with open(args.similar_text_pair_file, "r", encoding="utf-8") as f:
for line in f:
text, similar_text = line.rstrip().split("\t")
text2similar[text] = similar_text
    rs = []
    with open(recall_result_file, "r", encoding="utf-8") as f:
        relevance_labels = []
        for index, line in enumerate(f):
            if index % args.recall_num == 0 and index != 0:
                rs.append(relevance_labels)
                relevance_labels = []
            text, recalled_text, cosine_sim = line.rstrip().split("\t")
            if text == recalled_text:
                continue
            if text2similar[text] == recalled_text:
                relevance_labels.append(1)
            else:
                relevance_labels.append(0)
        if relevance_labels:
            # Flush the last query's labels, which the modulo check above skips
            rs.append(relevance_labels)
recall_N = []
recall_num = [1, 5, 10, 20, 50]
for topN in recall_num:
R = round(100 * recall(rs, N=topN), 3)
recall_N.append(str(R))
evaluate_result_file = os.path.join(args.recall_result_dir, args.evaluate_result)
    res = []
    timestamp = time.strftime("%Y%m%d-%H%M%S", time.localtime())
    res.append(timestamp)
    for key, val in zip(recall_num, recall_N):
        print("recall@{}={}".format(key, val))
        res.append(str(val))
    with open(evaluate_result_file, "a") as result:
        result.write("\t".join(res) + "\n")
return float(recall_N[1])
def train(
train_data_loader,
model,
optimizer,
lr_scheduler,
rank,
corpus_data_loader,
query_data_loader,
recall_result_file,
text_list,
id2corpus,
tokenizer,
):
global_step = 0
best_recall = 0.0
tic_train = time.time()
for epoch in range(1, args.epochs + 1):
for step, batch in enumerate(train_data_loader, start=1):
query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids = batch
loss = model(
query_input_ids=query_input_ids,
title_input_ids=title_input_ids,
query_token_type_ids=query_token_type_ids,
title_token_type_ids=title_token_type_ids,
)
global_step += 1
if global_step % args.log_steps == 0 and rank == 0:
print(
"global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s"
% (global_step, epoch, step, loss, args.log_steps / (time.time() - tic_train))
)
tic_train = time.time()
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.clear_grad()
if not args.evaluate:
if global_step % args.save_steps == 0 and rank == 0:
save_dir = os.path.join(args.save_dir, "model_%d" % global_step)
if not os.path.exists(save_dir):
os.makedirs(save_dir)
save_param_path = os.path.join(save_dir, "model_state.pdparams")
paddle.save(model.state_dict(), save_param_path)
tokenizer.save_pretrained(save_dir)
if args.evaluate and rank == 0:
print("evaluating")
recall_5 = evaluate(model, corpus_data_loader, query_data_loader, recall_result_file, text_list, id2corpus)
if recall_5 > best_recall:
best_recall = recall_5
save_dir = os.path.join(args.save_dir, "model_best")
if not os.path.exists(save_dir):
os.makedirs(save_dir)
save_param_path = os.path.join(save_dir, "model_state.pdparams")
paddle.save(model.state_dict(), save_param_path)
tokenizer.save_pretrained(save_dir)
with open(os.path.join(save_dir, "train_result.txt"), "a", encoding="utf-8") as fp:
fp.write("epoch=%d, global_step: %d, recall: %s\n" % (epoch, global_step, recall_5))
def gradient_cache_train(train_data_loader, model, optimizer, lr_scheduler, rank, tokenizer):
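    # Gradient-cache training (in the spirit of "Scaling Deep Contrastive
    # Learning Batch Size under Memory Limited Setup"): the large in-batch
    # negative batch is split into chunks; a first forward pass without
    # gradients collects the chunk representations, the loss over the
    # concatenated similarities yields the gradient w.r.t. those
    # representations, and each chunk is then recomputed with gradients
    # enabled so the cached gradient can be chained through a surrogate dot
    # product. Extra compute is traded for a smaller memory peak.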
if args.use_amp:
scaler = paddle.amp.GradScaler(init_loss_scaling=args.amp_loss_scale)
if args.batch_size % args.chunk_numbers == 0:
chunk_numbers = args.chunk_numbers
else:
raise Exception(
f" Batch_size {args.batch_size} must divides chunk_numbers {args.chunk_numbers} without producing a remainder "
)
def split(inputs, chunk_numbers, axis=0):
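        # Fall back to single-example chunks when the (typically last, smaller)
        # batch cannot be split evenly into chunk_numbers pieces.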
if inputs.shape[0] % chunk_numbers == 0:
return paddle.split(inputs, chunk_numbers, axis=0)
else:
return paddle.split(inputs, inputs.shape[0], axis=0)
global_step = 0
tic_train = time.time()
for epoch in range(1, args.epochs + 1):
for step, batch in enumerate(train_data_loader, start=1):
# Separate large batches into several sub batches
chunked_x = [split(t, chunk_numbers, axis=0) for t in batch]
            sub_batches = [list(s) for s in zip(*chunked_x)]
all_grads = []
all_CUDA_rnd_state = []
all_query = []
all_title = []
            for sub_batch in sub_batches:
all_reps = []
all_labels = []
(
sub_query_input_ids,
sub_query_token_type_ids,
sub_title_input_ids,
sub_title_token_type_ids,
) = sub_batch
with paddle.amp.auto_cast(args.use_amp, custom_white_list=["layer_norm", "softmax", "gelu"]):
with paddle.no_grad():
sub_CUDA_rnd_state = paddle.framework.random.get_cuda_rng_state()
all_CUDA_rnd_state.append(sub_CUDA_rnd_state)
sub_cosine_sim, sub_label, query_embedding, title_embedding = model(
query_input_ids=sub_query_input_ids,
title_input_ids=sub_title_input_ids,
query_token_type_ids=sub_query_token_type_ids,
title_token_type_ids=sub_title_token_type_ids,
)
all_reps.append(sub_cosine_sim)
all_labels.append(sub_label)
all_title.append(title_embedding)
all_query.append(query_embedding)
model_reps = paddle.concat(all_reps, axis=0)
model_title = paddle.concat(all_title)
model_query = paddle.concat(all_query)
model_title = model_title.detach()
model_query = model_query.detach()
model_query.stop_gradient = False
model_title.stop_gradient = False
model_reps.stop_gradient = False
model_label = paddle.concat(all_labels, axis=0)
loss = F.cross_entropy(input=model_reps, label=model_label)
loss.backward()
# Store gradients
all_grads.append(model_reps.grad)
            for sub_batch, CUDA_state, grad in zip(sub_batches, all_CUDA_rnd_state, all_grads):
(
sub_query_input_ids,
sub_query_token_type_ids,
sub_title_input_ids,
sub_title_token_type_ids,
) = sub_batch
paddle.framework.random.set_cuda_rng_state(CUDA_state)
# Recompute the forward propagation
sub_cosine_sim, sub_label, query_embedding, title_embedding = model(
query_input_ids=sub_query_input_ids,
title_input_ids=sub_title_input_ids,
query_token_type_ids=sub_query_token_type_ids,
title_token_type_ids=sub_title_token_type_ids,
)
# Chain rule
surrogate = paddle.dot(sub_cosine_sim, grad)
# Backward propagation
if args.use_amp:
scaled = scaler.scale(surrogate)
scaled.backward()
else:
surrogate.backward()
# Update model parameters
if args.use_amp:
scaler.minimize(optimizer, scaled)
else:
optimizer.step()
global_step += 1
if global_step % args.log_steps == 0 and rank == 0:
print(
"global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s"
% (global_step, epoch, step, loss, args.log_steps / (time.time() - tic_train))
)
tic_train = time.time()
lr_scheduler.step()
optimizer.clear_grad()
if global_step % args.save_steps == 0 and rank == 0:
save_dir = os.path.join(args.save_dir, "model_%d" % global_step)
if not os.path.exists(save_dir):
os.makedirs(save_dir)
save_param_path = os.path.join(save_dir, "model_state.pdparams")
paddle.save(model.state_dict(), save_param_path)
tokenizer.save_pretrained(save_dir)
def do_train():
paddle.set_device(args.device)
rank = paddle.distributed.get_rank()
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
set_seed(args.seed)
train_ds = load_dataset(read_text_pair, data_path=args.train_set_file, lazy=False)
pretrained_model = AutoModel.from_pretrained(args.model_name_or_path, enable_recompute=args.use_recompute)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
trans_func = partial(convert_example, tokenizer=tokenizer, max_seq_length=args.max_seq_length)
batchify_fn = lambda samples, fn=Tuple( # noqa: E731
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # query_input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # query_segment
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # title_input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # title_segment
): [data for data in fn(samples)]
train_data_loader = create_dataloader(
train_ds, mode="train", batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func
)
if args.use_gradient_cache:
model = SemanticIndexCacheNeg(
pretrained_model, margin=args.margin, scale=args.scale, output_emb_size=args.output_emb_size
)
else:
model = SemanticIndexBatchNeg(
pretrained_model, margin=args.margin, scale=args.scale, output_emb_size=args.output_emb_size
)
if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
state_dict = paddle.load(args.init_from_ckpt)
model.set_dict(state_dict)
print("warmup from:{}".format(args.init_from_ckpt))
model = paddle.DataParallel(model)
batchify_fn_dev = lambda samples, fn=Tuple( # noqa: E731
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # text_input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # text_segment
): [data for data in fn(samples)]
id2corpus = gen_id2corpus(args.corpus_file)
# convert_example function's input must be dict
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
corpus_ds = MapDataset(corpus_list)
corpus_data_loader = create_dataloader(
corpus_ds, mode="predict", batch_size=args.batch_size, batchify_fn=batchify_fn_dev, trans_fn=trans_func
)
text_list, text2similar_text = gen_text_file(args.similar_text_pair_file)
query_ds = MapDataset(text_list)
query_data_loader = create_dataloader(
query_ds, mode="predict", batch_size=args.batch_size, batchify_fn=batchify_fn_dev, trans_fn=trans_func
)
if not os.path.exists(args.recall_result_dir):
os.mkdir(args.recall_result_dir)
recall_result_file = os.path.join(args.recall_result_dir, args.recall_result_file)
num_training_steps = len(train_data_loader) * args.epochs
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_proportion)
# Generate parameter names needed to perform weight decay.
# All bias and LayerNorm parameters are excluded.
decay_params = [p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"])]
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
parameters=model.parameters(),
weight_decay=args.weight_decay,
apply_decay_param_fun=lambda x: x in decay_params,
grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm),
)
if args.use_gradient_cache:
gradient_cache_train(train_data_loader, model, optimizer, lr_scheduler, rank, tokenizer)
else:
train(
train_data_loader,
model,
optimizer,
lr_scheduler,
rank,
corpus_data_loader,
query_data_loader,
recall_result_file,
text_list,
id2corpus,
tokenizer,
)
if __name__ == "__main__":
do_train()
**Table of Contents**
* [Background](#背景介绍)
* [Milvus Recall](#Milvus召回)
* [1. Technical Approach and Evaluation Metrics](#技术方案)
* [2. Environment Dependencies](#环境依赖)
* [3. Code Structure](#代码结构)
* [4. Data Preparation](#数据准备)
* [5. Vector Search](#向量检索)
<a name="背景介绍"></a>
# Background
The recall corpus for semantic indexing was constructed from an open-source dataset released by a search platform.
<a name="Milvus召回"></a>
# Milvus Recall
<a name="技术方案"></a>
## 1. Technical Approach and Evaluation Metrics
### Technical Approach
Build the recall system with Milvus: use the trained semantic indexing model to extract vectors, insert them into Milvus, and then run vector search against the collection.
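The end-to-end flow can be sketched directly with pymilvus. The snippet below is only a minimal illustration, assuming a local Milvus 2.1 deployment on the default port and a toy `demo_search` collection; the project's real connection settings and schema live in `config.py` and `milvus_util.py`:
```
import numpy as np
from pymilvus import (Collection, CollectionSchema, DataType, FieldSchema,
                      connections)

connections.connect(host="localhost", port="19530")  # assumed local deployment

# Toy schema mirroring the pk/text/embeddings fields used in this project
fields = [
    FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True),
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=1000),
    FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=256),
]
collection = Collection("demo_search", CollectionSchema(fields))

# Insert a few vectors, build the same IVF_FLAT/L2 index as config.py, search
emb = np.random.rand(3, 256).astype("float32")
collection.insert([[0, 1, 2], ["doc a", "doc b", "doc c"], emb])
collection.create_index("embeddings", {"index_type": "IVF_FLAT",
                                       "metric_type": "L2",
                                       "params": {"nlist": 1000}})
collection.load()
results = collection.search(emb[:1], "embeddings",
                            {"metric_type": "L2", "params": {"nprobe": 10}},
                            limit=3, output_fields=["text"])
for hit in results[0]:
    print(hit.distance, hit.entity.get("text"))
```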
<a name="环境依赖"></a>
## 2. Environment Dependencies and Installation
**Dependencies**
* python >= 3.6.2
* paddlepaddle >= 2.2
* paddlenlp >= 2.2
* milvus >= 2.1.0
* pymilvus >= 2.1.0
<a name="代码结构"></a>
## 3. Code Structure
```
|—— scripts
    |—— feature_extract.sh # bash script for extracting feature vectors
    |—— search.sh # bash script for vector insertion and vector search
├── base_model.py # base class of the semantic indexing model
├── config.py # Milvus configuration
├── data.py # data processing functions
├── milvus_ann_search.py # script for vector insertion and search
├── inference.py # dygraph-model vector extraction script
├── feature_extract.py # batch vector extraction script
├── milvus_util.py # Milvus utility classes
└── README.md
```
<a name="数据准备"></a>
## 4. Data Preparation
The dataset contains two kinds of samples, as shown below: the first concatenates title + keywords; the second is a single sentence.
```
煤矸石-污泥基活性炭介导强化污水厌氧消化煤矸石,污泥,复合基活性炭,厌氧消化,直接种间电子传递
睡眠障碍与常见神经系统疾病的关系睡眠觉醒障碍,神经系统疾病,睡眠,快速眼运动,细胞增殖,阿尔茨海默病
城市道路交通流中观仿真研究智能运输系统;城市交通管理;计算机仿真;城市道路;交通流;路径选择
....
```
### Dataset Download
- [literature_search_data](https://bj.bcebos.com/v1/paddlenlp/data/literature_search_data.zip)
```
├── milvus # data for building the Milvus collection
    ├── milvus_data.csv # data for building the recall corpus
├── recall # recall (semantic indexing) dataset
    ├── corpus.csv # recall corpus used for testing
    ├── dev.csv # recall dev set
    ├── test.csv # recall test set
    ├── train.csv # recall training set
    ├── train_unsupervised.csv # unsupervised training set
├── sort # ranking dataset
    ├── test_pairwise.csv # ranking test set
    ├── dev_pairwise.csv # ranking dev set
    └── train_pairwise.csv # ranking training set
```
<a name="向量检索"></a>
## 5. Vector Search
### 5.1 Building a Milvus-Based Vector Search System
With the data prepared, we set up Milvus as the engine for fast semantic vector retrieval, using the open-source [Milvus](https://milvus.io/) for recall. For setup, see the official guide: [Milvus installation](https://milvus.io/docs/v2.1.x/install_standalone-docker.md). This example uses Milvus 2.1; the official Docker installation is recommended as the simplest route.
Once Milvus is up, vectors can be inserted and searched. First generate the embeddings: each sample is encoded into a 256-dimensional vector, extracted here on a 32 GB V100 GPU:
```
CUDA_VISIBLE_DEVICES=0 python feature_extract.py \
--model_dir=./output \
--model_name_or_path rocketqa-zh-base-query-encoder \
--corpus_file "data/milvus_data.csv"
```
The output directory holds the recall model exported as a Paddle Inference static graph.
| Data size | Time |
| ------------ | ------------ |
| 10 million entries | 3h40min39s |
When the run finishes, it produces corpus_embedding.npy.
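Before inserting, the array can be sanity-checked, for example with a quick sketch like the following (the row count should equal the number of lines in the corpus file):
```
import numpy as np

emb = np.load("corpus_embedding.npy")
print(emb.shape)  # expected: (number of corpus lines, 256)
```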
With the vectors generated, the data needs to be inserted into Milvus. First edit the configuration:
Set the ip and port in config.py. This project uses port 8530, while Milvus defaults to 19530; adjust as needed:
```
MILVUS_HOST='your milvus ip'
MILVUS_PORT = 8530
```
Then run the following command to insert the vectors into Milvus:
```
python milvus_ann_search.py --data_path milvus/milvus_data.csv \
--embedding_path corpus_embedding.npy \
--batch_size 100000 \
--insert
```
Parameter descriptions:
* `data_path`: path to the data
* `embedding_path`: path to the vectors extracted for the data
* `index`: index of the vector to query, used for vector search
* `insert`: whether to insert vectors
* `search`: whether to search vectors
* `batch_size`: number of vectors inserted per batch
| Data size | Time |
| ------------ | ------------ |
| 10 million entries | 21min12s |
Milvus also ships a visual management UI, [Attu](https://github.com/zilliztech/attu), which makes it easy to inspect the data.
![](../../img/attu.png)
Run the recall script:
```
python milvus_ann_search.py --data_path milvus/milvus_data.csv \
--embedding_path corpus_embedding.npy \
--batch_size 100000 \
--index 18 \
--search
```
The output looks like this:
```
hit: (distance: 0.0, id: 18), text field: 吉林铁合金集团资产管理现状分析及对策资产管理;资金控制;应收帐款风险;造价控制;集中化财务控制
hit: (distance: 0.45325806736946106, id: 7611689), text field: 哈药集团应收账款分析应收账款,流动资产,财务报告
hit: (distance: 0.5440893769264221, id: 4297885), text field: 宝钢集团负债经营风险控制策略研究钢铁行业;负债经营;风险控制
hit: (distance: 0.5455711483955383, id: 5661135), text field: 浅谈电网企业固定资产风险管理大数据,固定资产,风险管理
...
```
Each line reports the vector distance, the vector id, and the corresponding text.
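Since the embeddings are L2-normalized (see `base_model.py`), the L2 distance d and the cosine similarity cos satisfy d² = 2(1 − cos), so a smaller distance means a more similar text.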
The whole pipeline above can also be run in one step:
```
sh scripts/search.sh
```
### 5.2 Text Search
First set the model path and the sample in the code:
```
params_path='checkpoints/model_40/model_state.pdparams'
id2corpus={0:'国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据'}
```
Then run:
```
python3 inference.py
```
The output shows the extracted vector followed by the recalled results:
```
[1, 256]
Tensor(shape=[1, 256], dtype=float32, place=Place(gpu:0), stop_gradient=True,
[[ 0.07830613, -0.14036864, 0.03433795, -0.14967985, -0.03386058,
0.06630671, 0.01357946, 0.03531205, 0.02411086, 0.02000865,
0.05724005, -0.08119474, 0.06286906, 0.06509133, 0.07193415,
....
hit: (distance: 0.40141725540161133, id: 2742485), text field: 完善国有企业技术创新投入机制的探讨--基于经济责任审计实践国有企业,技术创新,投入机制
hit: (distance: 0.40258315205574036, id: 1472893), text field: 企业技术创新与组织冗余--基于国有企业与非国有企业的情境研究
hit: (distance: 0.4121206998825073, id: 51831), text field: 企业创新影响对外直接投资决策—基于中国制造业上市公司的研究企业创新;对外直接投资;制造业;上市公司
hit: (distance: 0.42234909534454346, id: 8682312), text field: 政治关联对企业创新绩效的影响——国有企业与民营企业的对比政治关联,创新绩效,国有企业,民营企业,双重差分
hit: (distance: 0.46187296509742737, id: 9324797), text field: 财务杠杆、股权激励与企业创新——基于中国A股制造业经验数据制造业;上市公司;股权激励;财务杠杆;企业创新
....
```
## FAQ
#### After extracting text embeddings, an ANN search in Milvus returned exactly the same text; why is the computed distance not 0?
An approximate index is used; for details see the Milvus documentation on the [index building mechanism](https://milvus.io/cn/docs/v2.0.x/index.md).
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import abc
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
class SemanticIndexBase(nn.Layer):
def __init__(self, pretrained_model, dropout=None, output_emb_size=None):
super().__init__()
self.ptm = pretrained_model
self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)
# if output_emb_size is not None, then add Linear layer to reduce embedding_size,
# we recommend set output_emb_size = 256 considering the trade-off between
# recall performance and efficiency
        # Treat None as 0 so the `> 0` checks below don't fail on None
        self.output_emb_size = output_emb_size if output_emb_size is not None else 0
if output_emb_size > 0:
weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=0.02))
self.emb_reduce_linear = paddle.nn.Linear(
self.ptm.config.hidden_size, output_emb_size, weight_attr=weight_attr
)
@paddle.jit.to_static(
input_spec=[
paddle.static.InputSpec(shape=[None, None], dtype="int64"),
paddle.static.InputSpec(shape=[None, None], dtype="int64"),
]
)
def get_pooled_embedding(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
_, cls_embedding = self.ptm(input_ids, token_type_ids, position_ids, attention_mask)
if self.output_emb_size > 0:
cls_embedding = self.emb_reduce_linear(cls_embedding)
cls_embedding = self.dropout(cls_embedding)
cls_embedding = F.normalize(cls_embedding, p=2, axis=-1)
return cls_embedding
def get_semantic_embedding(self, data_loader):
self.eval()
with paddle.no_grad():
for batch_data in data_loader:
input_ids, token_type_ids = batch_data
input_ids = paddle.to_tensor(input_ids)
token_type_ids = paddle.to_tensor(token_type_ids)
text_embeddings = self.get_pooled_embedding(input_ids, token_type_ids=token_type_ids)
yield text_embeddings
def cosine_sim(
self,
query_input_ids,
title_input_ids,
query_token_type_ids=None,
query_position_ids=None,
query_attention_mask=None,
title_token_type_ids=None,
title_position_ids=None,
title_attention_mask=None,
):
query_cls_embedding = self.get_pooled_embedding(
query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask
)
title_cls_embedding = self.get_pooled_embedding(
title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask
)
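        # Both embeddings are L2-normalized in get_pooled_embedding, so this
        # inner product is exactly the cosine similarity.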
cosine_sim = paddle.sum(query_cls_embedding * title_cls_embedding, axis=-1)
return cosine_sim
@abc.abstractmethod
def forward(self):
pass
class SemanticIndexBaseStatic(nn.Layer):
def __init__(self, pretrained_model, dropout=None, output_emb_size=None):
super().__init__()
self.ptm = pretrained_model
self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)
# if output_emb_size is not None, then add Linear layer to reduce embedding_size,
# we recommend set output_emb_size = 256 considering the trade-off between
# recall performance and efficiency
        # Treat None as 0 so the `> 0` checks below don't fail on None
        self.output_emb_size = output_emb_size if output_emb_size is not None else 0
if output_emb_size > 0:
weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=0.02))
self.emb_reduce_linear = paddle.nn.Linear(
self.ptm.config.hidden_size, output_emb_size, weight_attr=weight_attr
)
@paddle.jit.to_static(
input_spec=[
paddle.static.InputSpec(shape=[None, None], dtype="int64"),
paddle.static.InputSpec(shape=[None, None], dtype="int64"),
]
)
def get_pooled_embedding(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
_, cls_embedding = self.ptm(input_ids, token_type_ids, position_ids, attention_mask)
if self.output_emb_size > 0:
cls_embedding = self.emb_reduce_linear(cls_embedding)
cls_embedding = self.dropout(cls_embedding)
cls_embedding = F.normalize(cls_embedding, p=2, axis=-1)
return cls_embedding
def get_semantic_embedding(self, data_loader):
self.eval()
with paddle.no_grad():
for batch_data in data_loader:
input_ids, token_type_ids = batch_data
input_ids = paddle.to_tensor(input_ids)
token_type_ids = paddle.to_tensor(token_type_ids)
text_embeddings = self.get_pooled_embedding(input_ids, token_type_ids=token_type_ids)
yield text_embeddings
def cosine_sim(
self,
query_input_ids,
title_input_ids,
query_token_type_ids=None,
query_position_ids=None,
query_attention_mask=None,
title_token_type_ids=None,
title_position_ids=None,
title_attention_mask=None,
):
query_cls_embedding = self.get_pooled_embedding(
query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask
)
title_cls_embedding = self.get_pooled_embedding(
title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask
)
cosine_sim = paddle.sum(query_cls_embedding * title_cls_embedding, axis=-1)
return cosine_sim
def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
_, cls_embedding = self.ptm(input_ids, token_type_ids, position_ids, attention_mask)
if self.output_emb_size > 0:
cls_embedding = self.emb_reduce_linear(cls_embedding)
cls_embedding = self.dropout(cls_embedding)
cls_embedding = F.normalize(cls_embedding, p=2, axis=-1)
return cls_embedding
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MILVUS_HOST = "10.21.226.175"
MILVUS_PORT = 8530
data_dim = 256
top_k = 100
collection_name = "literature_search"
partition_tag = "partition_2"
embedding_name = "embeddings"
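# IVF_FLAT clusters vectors into `nlist` buckets and scans `nprobe` of them
# per query; note that this config reuses top_k as the nprobe value.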
index_config = {
"index_type": "IVF_FLAT",
"metric_type": "L2",
"params": {"nlist": 1000},
}
search_params = {
"metric_type": "L2",
"params": {"nprobe": top_k},
}
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import paddle
from paddlenlp.utils.log import logger
def create_dataloader(dataset, mode="train", batch_size=1, batchify_fn=None, trans_fn=None):
if trans_fn:
dataset = dataset.map(trans_fn)
shuffle = True if mode == "train" else False
if mode == "train":
batch_sampler = paddle.io.DistributedBatchSampler(dataset, batch_size=batch_size, shuffle=shuffle)
else:
batch_sampler = paddle.io.BatchSampler(dataset, batch_size=batch_size, shuffle=shuffle)
return paddle.io.DataLoader(dataset=dataset, batch_sampler=batch_sampler, collate_fn=batchify_fn, return_list=True)
def convert_example(example, tokenizer, max_seq_length=512, pad_to_max_seq_len=False):
"""
Builds model inputs from a sequence.
A BERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]``
Args:
example(obj:`list(str)`): The list of text to be converted to ids.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
max_seq_len(obj:`int`): The maximum total input sequence length after tokenization.
Sequences longer than this will be truncated, sequences shorter will be padded.
is_test(obj:`False`, defaults to `False`): Whether the example contains label or not.
Returns:
input_ids(obj:`list[int]`): The list of query token ids.
token_type_ids(obj: `list[int]`): List of query sequence pair mask.
"""
result = []
for key, text in example.items():
encoded_inputs = tokenizer(text=text, max_seq_len=max_seq_length, pad_to_max_seq_len=pad_to_max_seq_len)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
result += [input_ids, token_type_ids]
return result
def read_text_pair(data_path):
"""Reads data."""
with open(data_path, "r", encoding="utf-8") as f:
for line in f:
data = line.rstrip().split("\t")
if len(data) != 2:
continue
yield {"text_a": data[0], "text_b": data[1]}
def read_text_triplet(data_path):
"""Reads data."""
with open(data_path, "r", encoding="utf-8") as f:
for line in f:
data = line.rstrip().split("\t")
if len(data) != 3:
continue
yield {"text": data[0], "pos_sample": data[1], "neg_sample": data[2]}
# ANN - active learning ------------------------------------------------------
def get_latest_checkpoint(args):
"""
Return: (latest_checkpoint_path, global_step)
"""
if not os.path.exists(args.save_dir):
return args.init_from_ckpt, 0
subdirectories = list(next(os.walk(args.save_dir))[1])
def valid_checkpoint(checkpoint):
chk_path = os.path.join(args.save_dir, checkpoint)
scheduler_path = os.path.join(chk_path, "model_state.pdparams")
succeed_flag_file = os.path.join(chk_path, "succeed_flag_file")
return os.path.exists(scheduler_path) and os.path.exists(succeed_flag_file)
trained_steps = [int(s) for s in subdirectories if valid_checkpoint(s)]
if len(trained_steps) > 0:
return os.path.join(args.save_dir, str(max(trained_steps)), "model_state.pdparams"), max(trained_steps)
return args.init_from_ckpt, 0
# ANN - active learning ------------------------------------------------------
def get_latest_ann_data(ann_data_dir):
if not os.path.exists(ann_data_dir):
return None, -1
subdirectories = list(next(os.walk(ann_data_dir))[1])
def valid_checkpoint(step):
ann_data_file = os.path.join(ann_data_dir, step, "new_ann_data")
# succeed_flag_file is an empty file that indicates ann data has been generated
succeed_flag_file = os.path.join(ann_data_dir, step, "succeed_flag_file")
return os.path.exists(succeed_flag_file) and os.path.exists(ann_data_file)
ann_data_steps = [int(s) for s in subdirectories if valid_checkpoint(s)]
if len(ann_data_steps) > 0:
latest_ann_data_file = os.path.join(ann_data_dir, str(max(ann_data_steps)), "new_ann_data")
logger.info("Using latest ann_data_file:{}".format(latest_ann_data_file))
return latest_ann_data_file, max(ann_data_steps)
logger.info("no new ann_data, return (None, -1)")
return None, -1
def gen_id2corpus(corpus_file):
id2corpus = {}
with open(corpus_file, "r", encoding="utf-8") as f:
for idx, line in enumerate(f):
id2corpus[idx] = line.rstrip()
return id2corpus
def gen_text_file(similar_text_pair_file):
text2similar_text = {}
texts = []
with open(similar_text_pair_file, "r", encoding="utf-8") as f:
for line in f:
splited_line = line.rstrip().split("\t")
if len(splited_line) != 2:
continue
text, similar_text = line.rstrip().split("\t")
if not text or not similar_text:
continue
text2similar_text[text] = similar_text
texts.append({"text": text})
return texts, text2similar_text
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import sys
import numpy as np
import paddle
from paddle import inference
from tqdm import tqdm
from paddlenlp.data import Pad, Tuple
from paddlenlp.transformers import AutoTokenizer
sys.path.append(".")
from data import convert_example # noqa E402
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--model_dir", type=str, required=True, help="The directory to static model.")
parser.add_argument("--corpus_file", type=str, required=True, help="The corpus_file path.")
parser.add_argument("--max_seq_length", default=64, type=int, help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument('--use_tensorrt', default=False, type=eval, choices=[True, False], help='Enable to use tensorrt to speed up.')
parser.add_argument("--precision", default="fp32", type=str, choices=["fp32", "fp16", "int8"], help='The tensorrt precision.')
parser.add_argument('--cpu_threads', default=10, type=int, help='Number of threads to predict when using cpu.')
parser.add_argument('--enable_mkldnn', default=False, type=eval, choices=[True, False], help='Enable to use mkldnn to speed up when using cpu.')
parser.add_argument("--model_name_or_path", default='rocketqa-zh-base-query-encoder', type=str, help='The pretrained model used for training')
args = parser.parse_args()
# yapf: enable
class Predictor(object):
def __init__(
self,
model_dir,
device="gpu",
max_seq_length=128,
batch_size=32,
use_tensorrt=False,
precision="fp32",
cpu_threads=10,
enable_mkldnn=False,
):
self.max_seq_length = max_seq_length
self.batch_size = batch_size
model_file = model_dir + "/inference.get_pooled_embedding.pdmodel"
params_file = model_dir + "/inference.get_pooled_embedding.pdiparams"
        if not os.path.exists(model_file):
            raise ValueError("Model file not found at {}".format(model_file))
        if not os.path.exists(params_file):
            raise ValueError("Params file not found at {}".format(params_file))
config = paddle.inference.Config(model_file, params_file)
if device == "gpu":
# set GPU configs accordingly
# such as initialize the gpu memory, enable tensorrt
config.enable_use_gpu(100, 0)
precision_map = {
"fp16": inference.PrecisionType.Half,
"fp32": inference.PrecisionType.Float32,
"int8": inference.PrecisionType.Int8,
}
precision_mode = precision_map[precision]
if args.use_tensorrt:
config.enable_tensorrt_engine(
max_batch_size=batch_size, min_subgraph_size=30, precision_mode=precision_mode
)
elif device == "cpu":
# set CPU configs accordingly,
# such as enable_mkldnn, set_cpu_math_library_num_threads
config.disable_gpu()
if args.enable_mkldnn:
# cache 10 different shapes for mkldnn to avoid memory leak
config.set_mkldnn_cache_capacity(10)
config.enable_mkldnn()
config.set_cpu_math_library_num_threads(args.cpu_threads)
elif device == "xpu":
# set XPU configs accordingly
config.enable_xpu(100)
config.switch_use_feed_fetch_ops(False)
self.predictor = paddle.inference.create_predictor(config)
self.input_handles = [self.predictor.get_input_handle(name) for name in self.predictor.get_input_names()]
self.output_handle = self.predictor.get_output_handle(self.predictor.get_output_names()[0])
def predict(self, data, tokenizer):
"""
Predicts the data labels.
Args:
data (obj:`List(str)`): The batch data whose each element is a raw text.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
Returns:
results(obj:`dict`): All the predictions labels.
"""
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # segment
): fn(samples)
all_embeddings = []
examples = []
for idx, text in enumerate(tqdm(data)):
input_ids, segment_ids = convert_example(
text, tokenizer, max_seq_length=self.max_seq_length, pad_to_max_seq_len=True
)
examples.append((input_ids, segment_ids))
            if len(examples) >= self.batch_size:
input_ids, segment_ids = batchify_fn(examples)
self.input_handles[0].copy_from_cpu(input_ids)
self.input_handles[1].copy_from_cpu(segment_ids)
self.predictor.run()
logits = self.output_handle.copy_to_cpu()
all_embeddings.append(logits)
examples = []
if len(examples) > 0:
input_ids, segment_ids = batchify_fn(examples)
self.input_handles[0].copy_from_cpu(input_ids)
self.input_handles[1].copy_from_cpu(segment_ids)
self.predictor.run()
logits = self.output_handle.copy_to_cpu()
all_embeddings.append(logits)
all_embeddings = np.concatenate(all_embeddings, axis=0)
np.save("corpus_embedding", all_embeddings)
def read_text(file_path):
    id2corpus = {}
    with open(file_path, encoding="utf-8") as f:
        for idx, data in enumerate(f):
            id2corpus[idx] = data.strip()
    return id2corpus
if __name__ == "__main__":
predictor = Predictor(
args.model_dir,
args.device,
args.max_seq_length,
args.batch_size,
args.use_tensorrt,
args.precision,
args.cpu_threads,
args.enable_mkldnn,
)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
id2corpus = read_text(args.corpus_file)
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
predictor.predict(corpus_list, tokenizer)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from functools import partial
import paddle
from base_model import SemanticIndexBaseStatic
from config import collection_name, embedding_name, partition_tag
from data import convert_example, create_dataloader
from milvus_util import RecallByMilvus
from paddlenlp.data import Pad, Tuple
from paddlenlp.datasets import MapDataset
from paddlenlp.transformers import AutoModel, AutoTokenizer
def search_in_milvus(text_embedding):
recall_client = RecallByMilvus()
result = recall_client.search(
text_embedding.numpy(),
embedding_name,
collection_name,
partition_names=[partition_tag],
output_fields=["pk", "text"],
)
for hits in result:
for hit in hits:
print(f"hit: {hit}, text field: {hit.entity.get('text')}")
if __name__ == "__main__":
device = "gpu"
max_seq_length = 64
output_emb_size = 256
batch_size = 1
params_path = "checkpoints/model_40/model_state.pdparams"
id2corpus = {0: "国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据"}
model_name_or_path = "rocketqa-zh-base-query-encoder"
paddle.set_device(device)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
trans_func = partial(convert_example, tokenizer=tokenizer, max_seq_length=max_seq_length)
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # text_input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # text_segment
): [data for data in fn(samples)]
pretrained_model = AutoModel.from_pretrained(model_name_or_path)
model = SemanticIndexBaseStatic(pretrained_model, output_emb_size=output_emb_size)
# Load pretrained semantic model
if params_path and os.path.isfile(params_path):
state_dict = paddle.load(params_path)
model.set_dict(state_dict)
print("Loaded parameters from %s" % params_path)
else:
        raise ValueError("Please set params_path to a valid model parameter file")
# convert_example function's input must be dict
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
corpus_ds = MapDataset(corpus_list)
corpus_data_loader = create_dataloader(
corpus_ds, mode="predict", batch_size=batch_size, batchify_fn=batchify_fn, trans_fn=trans_func
)
    all_embeddings = []
model.eval()
with paddle.no_grad():
for batch_data in corpus_data_loader:
input_ids, token_type_ids = batch_data
text_embeddings = model.get_pooled_embedding(input_ids, token_type_ids)
all_embeddings.append(text_embeddings)
text_embedding = all_embeddings[0]
print(text_embedding.shape)
print(text_embedding)
search_in_milvus(text_embedding)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import time
import numpy as np
from config import collection_name, embedding_name, partition_tag
from milvus_util import RecallByMilvus, VecToMilvus, text_max_len
from tqdm import tqdm
parser = argparse.ArgumentParser()
parser.add_argument(
"--data_path", default="milvus/milvus_data.csv", type=str, required=True, help="The data for vector extraction."
)
parser.add_argument(
"--embedding_path", default="corpus_embedding.npy", type=str, required=True, help="The vector path for data."
)
parser.add_argument("--index", default=0, type=int, help="index of the vector for search")
parser.add_argument("--insert", action="store_true", help="whether to insert data")
parser.add_argument("--search", action="store_true", help="whether to search data")
parser.add_argument("--batch_size", default=100000, type=int, help="number of examples to insert each time")
args = parser.parse_args()
def read_text(file_path):
    # Read the corpus, one text per line.
    corpus = []
    with open(file_path, encoding="utf-8") as f:
        for line in f:
            corpus.append(line.strip())
    return corpus
def milvus_data_insert(data_path, embedding_path, batch_size):
corpus_list = read_text(data_path)
embeddings = np.load(embedding_path)
    embedding_ids = list(range(embeddings.shape[0]))
client = VecToMilvus()
client.drop_collection(collection_name)
data_size = len(embedding_ids)
    for i in tqdm(range(0, data_size, batch_size)):
        cur_end = min(i + batch_size, data_size)
batch_emb = embeddings[np.arange(i, cur_end)]
        entities = [
            list(range(i, cur_end)),  # field pk
            [corpus_list[j][: text_max_len - 1] for j in range(i, cur_end)],  # field text, truncated to the schema limit
            batch_emb,  # field embeddings, supports numpy.ndarray and list
        ]
client.insert(
collection_name=collection_name, entities=entities, index_name=embedding_name, partition_tag=partition_tag
)
def milvus_data_recall(embedding_path, index):
    embeddings = np.load(embedding_path)
    recall_client = RecallByMilvus()
    if index >= embeddings.shape[0]:
        print("Index must be smaller than the number of embeddings")
        return
    embeddings = embeddings[np.arange(index, index + 1)]
time_start = time.time()
result = recall_client.search(
embeddings, embedding_name, collection_name, partition_names=[partition_tag], output_fields=["pk", "text"]
)
time_end = time.time()
sum_t = time_end - time_start
print("time cost", sum_t, "s")
for hits in result:
for hit in hits:
print(f"hit: {hit}, text field: {hit.entity.get('text')}")
if __name__ == "__main__":
if args.insert:
milvus_data_insert(args.data_path, args.embedding_path, args.batch_size)
if args.search:
milvus_data_recall(args.embedding_path, args.index)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from config import (
MILVUS_HOST,
MILVUS_PORT,
data_dim,
index_config,
search_params,
top_k,
)
from pymilvus import (
Collection,
CollectionSchema,
DataType,
FieldSchema,
connections,
utility,
)
fmt = "\n=== {:30} ===\n"
text_max_len = 1000
fields = [
FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=False, max_length=100),
FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=text_max_len),
FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=data_dim),
]
schema = CollectionSchema(fields, "Neural Search Index")
class VecToMilvus:
def __init__(self):
print(fmt.format("start connecting to Milvus"))
connections.connect("default", host=MILVUS_HOST, port=MILVUS_PORT)
self.collection = None
def has_collection(self, collection_name):
try:
has = utility.has_collection(collection_name)
print(f"Does collection {collection_name} exist in Milvus: {has}")
return has
except Exception as e:
print("Milvus has_table error:", e)
    def create_collection(self, collection_name):
try:
print(fmt.format("Create collection {}".format(collection_name)))
self.collection = Collection(collection_name, schema, consistency_level="Strong")
except Exception as e:
print("Milvus create collection error:", e)
def drop_collection(self, collection_name):
try:
utility.drop_collection(collection_name)
except Exception as e:
print("Milvus delete collection error:", e)
def create_index(self, index_name):
try:
print(fmt.format("Start Creating index"))
self.collection.create_index(index_name, index_config)
print(fmt.format("Start loading"))
self.collection.load()
except Exception as e:
print("Milvus create index error:", e)
def has_partition(self, partition_tag):
try:
result = self.collection.has_partition(partition_tag)
return result
except Exception as e:
print("Milvus has partition error: ", e)
def create_partition(self, partition_tag):
try:
self.collection.create_partition(partition_tag)
print("create partition {} successfully".format(partition_tag))
except Exception as e:
print("Milvus create partition error: ", e)
def insert(self, entities, collection_name, index_name, partition_tag=None):
try:
if not self.has_collection(collection_name):
                self.create_collection(collection_name)
self.create_index(index_name)
else:
self.collection = Collection(collection_name)
if (partition_tag is not None) and (not self.has_partition(partition_tag)):
self.create_partition(partition_tag)
self.collection.insert(entities, partition_name=partition_tag)
print(f"Number of entities in Milvus: {self.collection.num_entities}") # check the num_entites
except Exception as e:
print("Milvus insert error:", e)
class RecallByMilvus:
def __init__(self):
print(fmt.format("start connecting to Milvus"))
connections.connect("default", host=MILVUS_HOST, port=MILVUS_PORT)
self.collection = None
def get_collection(self, collection_name):
try:
print(fmt.format("Connect collection {}".format(collection_name)))
self.collection = Collection(collection_name)
except Exception as e:
print("Milvus create collection error:", e)
    def search(self, vectors, embedding_name, collection_name, partition_names=None, output_fields=None):
try:
self.get_collection(collection_name)
result = self.collection.search(
vectors,
embedding_name,
search_params,
limit=top_k,
partition_names=partition_names,
output_fields=output_fields,
)
return result
except Exception as e:
print("Milvus recall error: ", e)
if __name__ == "__main__":
print(fmt.format("Start inserting entities"))
rng = np.random.default_rng(seed=19530)
num_entities = 3000
entities = [
# provide the pk field because `auto_id` is set to False
[i for i in range(num_entities)],
["第{}个样本".format(i) for i in range(num_entities)], # field text, only supports list
rng.random((num_entities, data_dim)), # field embeddings, supports numpy.ndarray and list
]
print(entities[-1].shape)
collection_name = "test1"
partition_tag = "partition_1"
embedding_name = "embeddings"
client = VecToMilvus()
client.insert(
collection_name=collection_name, entities=entities, index_name=embedding_name, partition_tag=partition_tag
)
print(fmt.format("Start searching entities"))
vectors_to_search = entities[-1][-2:]
recall_client = RecallByMilvus()
result = recall_client.search(
vectors_to_search,
embedding_name,
collection_name,
partition_names=[partition_tag],
output_fields=["pk", "text"],
)
for hits in result:
for hit in hits:
print(f"hit: {hit}, random field: {hit.entity.get('text')}")
CUDA_VISIBLE_DEVICES=2 python feature_extract.py \
--model_dir ./output \
--model_name_or_path rocketqa-zh-base-query-encoder \
--batch_size 512 \
--corpus_file "milvus/milvus_data.csv"
python milvus_ann_search.py --data_path milvus/milvus_data.csv \
--embedding_path corpus_embedding.npy \
--batch_size 100000 \
--index 18 \
--insert \
--search
**Table of Contents**
* [Background](#背景介绍)
* [SimCSE](#SimCSE)
* [1. Technical Approach and Evaluation Metrics](#技术方案)
* [2. Requirements](#环境依赖)
* [3. Code Structure](#代码结构)
* [4. Data Preparation](#数据准备)
* [5. Model Training](#模型训练)
* [6. Evaluation](#评估)
* [7. Prediction](#预测)
* [8. Deployment](#部署)
<a name="背景介绍"></a>
# Background
Semantic indexing (informally, vector indexing) is one of the core recall-stage technologies of search engines, recommender systems, and advertising systems. Given an input text, a semantic indexing model should recall, **quickly and accurately**, a set of semantically related texts from a massive candidate corpus. Its quality directly determines whether semantically relevant material can be recalled into the system to take part in upstream ranking, so it affects the whole system at the most fundamental level.
At the recall stage, the most common approach is a dual-tower model: learn vector representations of Documents (Doc for short), build an index over the Doc side, and recall with ANN search. On top of this approach we introduce an unsupervised pre-training strategy. Example training data:
```
我手机丢了,我想换个手机 我想买个新手机,求推荐
求秋色之空漫画全集 求秋色之空全集漫画
学日语软件手机上的 手机学日语的软件
侠盗飞车罪恶都市怎样改车 侠盗飞车罪恶都市怎么改车
```
The SimCSE model fits matching and retrieval scenarios that lack supervised data but have abundant unsupervised data.
<a name="SimCSE"></a>
# SimCSE
<a name="技术方案"></a>
## 1. Technical Approach and Evaluation Metrics
### Technical Approach
A dual-tower model, warm-started from ERNIE 1.0, with the SimCSE strategy introduced at the recall stage.
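The core of SimCSE is that two forward passes of the same sentence, perturbed only by dropout, form a positive pair, while all other sentences in the batch serve as negatives. Below is a minimal sketch of this in-batch objective (variable names are illustrative; train.py in this project is the authoritative implementation):
```python
import paddle
import paddle.nn.functional as F

def simcse_loss(query_embeds, key_embeds, scale=20.0):
    """In-batch contrastive loss. query_embeds and key_embeds are the
    [batch_size, emb_dim] outputs of encoding the same sentences twice,
    so dropout yields two different views of each sentence."""
    query_embeds = F.normalize(query_embeds, axis=-1)
    key_embeds = F.normalize(key_embeds, axis=-1)
    # Cosine similarity matrix, scaled before the softmax (the `scale` parameter).
    sims = paddle.matmul(query_embeds, key_embeds, transpose_y=True) * scale
    # Diagonal entries are the positive pairs; everything else is a negative.
    labels = paddle.arange(0, query_embeds.shape[0], dtype="int64")
    return F.cross_entropy(sims, labels)
```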
### Evaluation Metrics
(1) Recall@1, Recall@5, Recall@10, Recall@20, and Recall@50 are used to evaluate the recall quality of the semantic indexing model.
**Evaluation Results**
| Strategy | Model | Recall@1 | Recall@5 | Recall@10 | Recall@20 | Recall@50 |
| ------------ | ------------ | ------------ |--------- |--------- |--------- |--------- |
| SimCSE | ernie 1.0 | 42.374 | 57.505 | 62.641 | 67.09 | 72.331 |
| SimCSE | rocketqa-zh-base-query-encoder | **50.108** | **64.005** | **68.288** | **72.306** | **77.306** |
<a name="环境依赖"></a>
## 2. Requirements and Installation
**Requirements**
* python >= 3.6
* paddlepaddle >= 2.1.3
* paddlenlp >= 2.2
* [hnswlib](https://github.com/nmslib/hnswlib) >= 0.5.2
* visualdl >= 2.2.2
<a name="代码结构"></a>
## 3. Code Structure
The main code structure of this project:
```
simcse/
├── model.py  # SimCSE model definition
├── deploy
│   └── python
│       ├── predict.py  # Paddle Inference prediction
│       └── deploy.sh  # bash script for Paddle Inference
├── scripts
│   ├── export_model.sh  # bash script for exporting the dynamic graph to a static graph
│   ├── predict.sh  # bash script for prediction
│   ├── evaluate.sh  # bash script for recall evaluation
│   ├── run_build_index.sh  # script for building the index
│   └── train.sh  # bash script for training
├── ann_util.py  # helpers for building the ANN index
├── data.py  # loading logic for the unsupervised training and test data
├── export_model.py  # dynamic-to-static graph conversion
├── predict.py  # computes text-pair similarity with the trained unsupervised model
├── evaluate.py  # computes evaluation metrics from the recall results and the evaluation set
├── inference.py  # dynamic-graph vector extraction
├── recall.py  # recalls similar texts from the corpus with the trained semantic indexing model
└── train.py  # SimCSE training and evaluation logic
```
<a name="数据准备"></a>
## 4. Data Preparation
### Dataset Description
We built a training set, an evaluation set, and a recall corpus for semantic indexing from open-source semantic matching datasets.
Sample data:
```
睡眠障碍与常见神经系统疾病的关系睡眠觉醒障碍,神经系统疾病,睡眠,快速眼运动,细胞增殖,阿尔茨海默病
城市道路交通流中观仿真研究
城市道路交通流中观仿真研究智能运输系统;城市交通管理;计算机仿真;城市道路;交通流;路径选择
网络健康可信性研究
网络健康可信性研究网络健康信息;可信性;评估模式
脑瘫患儿家庭复原力的影响因素及干预模式雏形 研究
脑瘫患儿家庭复原力的影响因素及干预模式雏形研究脑瘫患儿;家庭功能;干预模式
地西他滨与HA方案治疗骨髓增生异常综合征转化的急性髓系白血病患者近期疗效比较
地西他滨与HA方案治疗骨髓增生异常综合征转化的急性髓系白血病患者近期疗效比较
个案工作 社会化
个案社会工作介入社区矫正再社会化研究——以东莞市清溪镇为例社会工作者;社区矫正人员;再社会化;角色定位
圆周运动加速度角速度
圆周运动向心加速度物理意义的理论分析匀速圆周运动,向心加速度,物理意义,角速度,物理量,线速度,周期
```
The recall corpus, validation set, and test set are the same as those used in the in-batch negatives experiment.
### Dataset Download
- [literature_search_data](https://bj.bcebos.com/v1/paddlenlp/data/literature_search_data.zip)
```
├── milvus  # dataset for building the Milvus index
    ├── milvus_data.csv  # data for building the recall corpus
├── recall  # recall (semantic indexing) dataset
    ├── corpus.csv  # recall corpus for testing
    ├── dev.csv  # recall validation set
    ├── test.csv  # recall test set
    ├── train.csv  # recall training set
    ├── train_unsupervised.csv  # unsupervised training set
├── sort  # ranking dataset
    ├── test_pairwise.csv  # ranking test set
    ├── dev_pairwise.csv  # ranking validation set
    └── train_pairwise.csv  # ranking training set
```
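For reference, here is a minimal sketch of how `train_unsupervised.csv` can be consumed, assuming (mirroring the role of data.py) that each line holds one raw sentence which serves as both views of a positive pair, since dropout provides the augmentation:
```python
def read_simcse_text(data_path):
    # Assumed reader: one raw sentence per line; the same sentence is used
    # for both sides of the pair and dropout differentiates the two views.
    with open(data_path, encoding="utf-8") as f:
        for line in f:
            text = line.rstrip("\n")
            yield {"text_a": text, "text_b": text}
```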
<a name="模型训练"></a>
## 5. Model Training
**Pretrained semantic indexing model download:**
The model structure parameters are: `TransformerLayer:12, Hidden:768, Heads:12, OutputEmbSize: 256`
|Model|Training Configuration|Hardware|MD5|
| ------------ | ------------ | ------------ |-----------|
|[SimCSE](https://bj.bcebos.com/v1/paddlenlp/models/simcse_model.zip)|<div style="width: 150pt">ernie 1.0 epoch:3 lr:5E-5 bs:64 max_len:64 </div>|<div style="width: 100pt">4x V100-16G</div>|7c46d9b15a214292e3897c0eb70d0c9f|
### Training Environment
+ NVIDIA Driver Version: 440.64.00
+ Ubuntu 16.04.6 LTS (Docker)
+ Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
### Single-GPU / Multi-GPU Training
We train with multiple GPUs on a single machine. The command below runs SimCSE training on GPUs 0,1,2,3; because the unsupervised dataset is large, training takes about 16 hours on 4 cards. For single-GPU training, simply set the `--gpus` parameter to a single card id.
The training command is:
```shell
unset CUDA_VISIBLE_DEVICES
python -u -m paddle.distributed.launch --gpus '0,1,2,3' \
train.py \
--device gpu \
--save_dir ./checkpoints/ \
--batch_size 64 \
--learning_rate 5E-5 \
--epochs 3 \
--save_steps 2000 \
--eval_steps 100 \
--max_seq_length 64 \
--infer_with_fc_pooler \
--dropout 0.2 \
--output_emb_size 256 \
--train_set_file "./recall/train_unsupervised.csv" \
--test_set_file "./recall/dev.csv" \
--model_name_or_path "rocketqa-zh-base-query-encoder"
```
Alternatively, use the bash script:
```
sh scripts/train.sh
```
Configurable parameters:
* `infer_with_fc_pooler`: optional; whether the inference-time forward pass goes through the final fc layer used during training when computing text embeddings. Enabling it gives the best results.
* `scale`: optional; factor by which the cosine similarity is scaled before the cross_entropy loss; defaults to 20.
* `dropout`: optional; dropout rate used in the SimCSE forward pass; defaults to 0.1.
* `save_dir`: optional; directory for saving trained models; defaults to the checkpoints folder under the current directory.
* `max_seq_length`: optional; maximum sequence length used by the model, at most 512. Lower this value if GPU memory runs out; defaults to 128.
* `batch_size`: optional; batch size. Adjust it to your GPU memory and lower it if memory runs out; defaults to 32.
* `learning_rate`: optional; maximum learning rate for fine-tuning; defaults to 5e-5.
* `weight_decay`: optional; regularization strength used to prevent overfitting; defaults to 0.0.
* `epochs`: number of training epochs; defaults to 1.
* `warmup_proption`: optional; proportion of learning-rate warmup. With 0.1, the learning rate grows from 0 to learning_rate over the first 10% of training steps and then slowly decays; defaults to 0.0.
* `init_from_ckpt`: optional; path to model parameters for warm-starting training; defaults to None.
* `seed`: optional; random seed; defaults to 1000.
* `device`: device to train on, cpu or gpu. When training on gpu, the gpus parameter specifies the GPU card ids.
* `model_name_or_path`: pretrained model used to initialize the model and `Tokenizer` parameters.
The program trains and evaluates automatically, and saves checkpoints under the specified `save_dir` during training, e.g.:
```text
checkpoints/
├── model_100
│   ├── model_state.pdparams
│   ├── tokenizer_config.json
│   └── vocab.txt
└── ...
```
<a name="评估"></a>
## 6. Evaluation
Evaluation proceeds in four steps:
a. Get the Doc-side embeddings
Extract the text vectors of the Doc corpus with the semantic indexing model.
b. Build an index over the Doc embeddings with hnswlib
Build the index library with an ANN engine (here we use [hnswlib](https://github.com/nmslib/hnswlib) for ANN indexing); a sketch follows these steps.
c. Get the query embeddings and retrieve similar results
Extract the text vectors of the evaluation set's *Source Text* with the semantic indexing model, run ANN queries against the index built in step b, recall the Top 50 most similar *Target Text*, and write the recall results for the evaluation set's *Source Text* to the `recall_result` file.
d. Evaluate
Compute the Recall@k metrics (k = 1, 5, 10, 20, 50) from the evaluation set `dev.csv` and the recall results `recall_result`.
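A minimal sketch of steps a-c (assumptions: the Doc vectors were saved to `corpus_embedding.npy`, as feature_extract.py does, and the embeddings are L2-normalized so that inner product equals cosine similarity; recall.py is the authoritative implementation):
```python
import hnswlib
import numpy as np

# Step a (assumed): load the Doc embeddings extracted by the model.
doc_embeddings = np.load("corpus_embedding.npy").astype("float32")
num_docs, dim = doc_embeddings.shape

# Step b: build the HNSW index. "ip" (inner product) matches cosine
# similarity when the embeddings are L2-normalized.
index = hnswlib.Index(space="ip", dim=dim)
index.init_index(max_elements=num_docs, ef_construction=100, M=100)
index.add_items(doc_embeddings, np.arange(num_docs))
index.set_ef(100)  # ef must be at least recall_num at query time

# Step c: query with the evaluation-set vectors (a stand-in slice is used
# here) and recall the top 50 candidates per query.
query_embeddings = doc_embeddings[:2]
labels, distances = index.knn_query(query_embeddings, k=50)
```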
Run the following command to build the ANN index, perform recall, and produce the recall result file `recall_result`:
```
python -u -m paddle.distributed.launch --gpus "6" --log_dir "recall_log/" \
recall.py \
--device gpu \
--recall_result_dir "recall_result_dir" \
--recall_result_file "recall_result.txt" \
--params_path "checkpoints/model_12000/model_state.pdparams" \
--model_name_or_path rocketqa-zh-base-query-encoder \
--hnsw_m 100 \
--hnsw_ef 100 \
--batch_size 64 \
    --output_emb_size 256 \
--max_seq_length 60 \
--recall_num 50 \
--similar_text_pair "recall/dev.csv" \
--corpus_file "recall/corpus.csv"
```
Alternatively, use the bash script:
```
sh scripts/run_build_index.sh
```
run_build_index.sh contains both cpu and gpu variants; the gpu variant is the default.
Next, run the following command to compute the Recall@1, Recall@5, Recall@10, Recall@20, and Recall@50 metrics:
```
python -u evaluate.py \
--similar_text_pair "recall/dev.csv" \
--recall_result_file "./recall_result_dir/recall_result.txt" \
--recall_num 50
```
Alternatively, use the bash script:
```
bash scripts/evaluate.sh
```
Parameter descriptions:
* `similar_text_pair`: the evaluation set of similar text pairs
* `recall_result_file`: the recall results for the *Source Text* in the first column of the evaluation set
* `recall_num`: the number of similar texts recalled for each input text
On success, metrics like the following are printed:
```
recall@1=45.183
recall@5=60.444
recall@10=65.224
recall@20=69.562
recall@50=74.848
```
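Here Recall@k is the percentage of evaluation queries whose labelled similar text appears among the top-k recalled candidates. A minimal sketch of the computation (the file formats are assumptions inferred from the command above; evaluate.py is the authoritative implementation):
```python
from collections import defaultdict

def recall_at_k(recall_result_file, similar_text_pair_file, k=10):
    # Assumed formats: "query\ttarget" per line in the evaluation set;
    # "query\tcandidate\tscore" per line in the recall result, with the
    # candidates of each query ordered by descending score.
    target = {}
    with open(similar_text_pair_file, encoding="utf-8") as f:
        for line in f:
            query, doc = line.rstrip("\n").split("\t")[:2]
            target[query] = doc
    candidates = defaultdict(list)
    with open(recall_result_file, encoding="utf-8") as f:
        for line in f:
            query, doc = line.rstrip("\n").split("\t")[:2]
            candidates[query].append(doc)
    hits = sum(1 for query, doc in target.items() if doc in candidates[query][:k])
    return 100.0 * hits / len(target)
```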
<a name="预测"></a>
## 7. Prediction
The semantic indexing model can predict the semantic vector of a text or compute the semantic similarity of a text pair.
### 7.1 Extracting Semantic Vectors
Edit the input text `id2corpus` and the model path `params_path` in inference.py:
```
params_path='checkpoints/model_12000/model_state.pdparams'
id2corpus={0:'国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据'}
```
Then run:
```
python inference.py
```
The prediction result is a 256-dimensional vector:
```
[1, 256]
[[-6.70653954e-02 -6.46878220e-03 -6.78317016e-03 1.66617986e-02
7.20006675e-02 -9.79134627e-03 -1.38441555e-03 4.37440760e-02
4.78116237e-02 1.33881181e-01 1.82927232e-02 3.23656350e-02
...
```
### 7.2 Computing the Semantic Similarity of Text Pairs
### Preparing Prediction Data
The input is a tab-separated tsv file with one text pair per line. Some examples:
```
热处理对尼龙6 及其与聚酰胺嵌段共聚物共混体系晶体熔融行为和结晶结构的影响 热处理对尼龙6及其与聚酰胺嵌段共聚物共混体系晶体熔融行为和结晶结构的影响尼龙6,聚酰胺嵌段共聚物,芳香聚酰胺,热处理
面向生态系统服务的生态系统分类方案研发与应用. 面向生态系统服务的生态系统分类方案研发与应用
huntington舞蹈病的动物模型 Huntington舞蹈病的动物模型
试论我国海岸带经济开发的问题与前景 试论我国海岸带经济开发的问题与前景海岸带,经济开发,问题,前景
```
### Running Prediction
Using the demo data above, run the following command to compute the semantic similarity of text pairs with our open-source unsupervised SimCSE semantic indexing model:
```
root_dir="checkpoints"
python -u -m paddle.distributed.launch --gpus "3" \
predict.py \
--device gpu \
--params_path "${root_dir}/model_12000/model_state.pdparams" \
--model_name_or_path rocketqa-zh-base-query-encoder \
--output_emb_size 256 \
--batch_size 128 \
--max_seq_length 64 \
--text_pair_file "recall/test.csv"
```
Parameter descriptions:
* `device`: run on cpu or gpu
* `params_path`: the parameter file of the pretrained model
* `model_name_or_path`: pretrained model used to initialize the model and `Tokenizer` parameters
* `output_emb_size`: dimension of the text vector output on top of the Transformer
* `text_pair_file`: the dataset of text pairs to predict
Alternatively, run the bash script:
```
sh scripts/predict.sh
```
This produces results such as:
```
0.6477588415145874
0.9698382019996643
1.0
0.1787596344947815
```
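Each printed number is, in effect, the cosine similarity of the pair's pooled embeddings. A minimal sketch of the scoring step (variable names are illustrative; the batches are assumed to be tokenized already, and `get_pooled_embedding` is assumed to return L2-normalized vectors as in this project):
```python
import paddle

# Hypothetical, pre-tokenized inputs for the two sides of each pair; `model`
# is the loaded semantic indexing model from this project.
query_embeds = model.get_pooled_embedding(query_input_ids, query_token_type_ids)
title_embeds = model.get_pooled_embedding(title_input_ids, title_token_type_ids)
# With L2-normalized embeddings, the row-wise dot product is the cosine similarity.
cosine_sim = paddle.sum(query_embeds * title_embeds, axis=-1)
print(cosine_sim.numpy())
```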
<a name="部署"></a>
## 8. Deployment
### Dynamic-to-Static Export
First convert the dynamic-graph model to a static graph:
```
python export_model.py --params_path checkpoints/model_12000/model_state.pdparams \
--model_name_or_path rocketqa-zh-base-query-encoder \
--output_path=./output
```
Alternatively, run the bash script:
```
sh scripts/export_model.sh
```
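For reference, a hypothetical sketch of what the export step does: load the trained weights, trace the model with `InputSpec` placeholders, and save the static graph for Paddle Inference (export_model.py is the authoritative implementation):
```python
import paddle
from paddlenlp.transformers import AutoModel

from base_model import SemanticIndexBaseStatic  # project module

# Load the trained dynamic-graph model.
pretrained_model = AutoModel.from_pretrained("rocketqa-zh-base-query-encoder")
model = SemanticIndexBaseStatic(pretrained_model, output_emb_size=256)
model.set_dict(paddle.load("checkpoints/model_12000/model_state.pdparams"))
model.eval()

# Trace to a static graph with variable batch size and sequence length.
static_model = paddle.jit.to_static(
    model,
    input_spec=[
        paddle.static.InputSpec(shape=[None, None], dtype="int64"),  # input_ids
        paddle.static.InputSpec(shape=[None, None], dtype="int64"),  # token_type_ids
    ],
)
paddle.jit.save(static_model, "./output/inference")
```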
### Paddle Inference Prediction
Prediction can either extract vectors or compute the similarity of two texts.
Modify the id2corpus samples:
```
# Extract vectors
id2corpus={0:'国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据'}
# Compute similarity
corpus_list=[['中西方语言与文化的差异','中西方文化差异以及语言体现中西方文化,差异,语言体现'],
['中西方语言与文化的差异','飞桨致力于让深度学习技术的创新与应用更简单']]
```
Then run Paddle Inference:
```
python deploy/python/predict.py --model_dir=./output
```
Alternatively, run the bash script:
```
sh deploy.sh
```
The final output is the 256-dimensional feature vector and the predicted probabilities of the sentence pairs:
```
(1, 256)
[[-6.70653731e-02 -6.46873191e-03 -6.78317575e-03 1.66618153e-02
7.20006898e-02 -9.79136024e-03 -1.38439541e-03 4.37440872e-02
4.78115827e-02 1.33881137e-01 1.82927139e-02 3.23656537e-02
.......
[0.5649663209915161, 0.03284594044089317]
```
## FAQ
#### How do I deploy the SimCSE model?
+ The model trained with SimCSE has the same network structure as the one trained with In-batch Negatives, so the In-batch Negatives deployment flow applies directly; see [In-batch Negatives](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/neural_search/recall/in_batch_negative/deploy/python).
## Reference
[1] Gao, Tianyu, Xingcheng Yao, and Danqi Chen. "SimCSE: Simple Contrastive Learning of Sentence Embeddings." arXiv:2104.08821 [cs], April 18, 2021. http://arxiv.org/abs/2104.08821.