# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
importargparse
importos
importsys
importpaddle
frompaddleimportinference
fromscipyimportspatial
frompaddlenlp.dataimportPad,Tuple
frompaddlenlp.transformersimportAutoTokenizer
frompaddlenlp.utils.logimportlogger
# Make the current working directory importable.
sys.path.append(".")

# Command-line interface: every option below ends up as an attribute of
# the module-level ``args`` namespace.
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument(
    "--model_dir",
    type=str,
    required=True,
    help="The directory to static model.",
)
parser.add_argument(
    "--max_seq_length",
    type=int,
    default=128,
    help=(
        "The maximum total input sequence length after tokenization. "
        "Sequences longer than this will be truncated, sequences shorter will be padded."
    ),
)
parser.add_argument(
    "--batch_size",
    type=int,
    default=15,
    help="Batch size per GPU/CPU for training.",
)
parser.add_argument(
    "--device",
    choices=["cpu", "gpu", "xpu"],
    default="gpu",
    help="Select which device to train model, defaults to gpu.",
)
# NOTE: because the option is required, argparse never falls back to the
# ``default`` given here; the value is kept only to mirror the original call.
parser.add_argument(
    "--params_path",
    type=str,
    required=True,
    default="./checkpoint/model_900/model_state.pdparams",
    help="The path to model parameters to be loaded.",
)
parser.add_argument(
    "--output_path",
    type=str,
    default="./output",
    help="The path of model parameter in static graph to be saved.",
)
parser.add_argument(
    "--model_name_or_path",
    type=str,
    default="rocketqa-zh-base-query-encoder",
    help="The pretrained model used for training",
)
# Parsing happens at import time and reads ``sys.argv``.
args = parser.parse_args()
# yapf: enable
if__name__=="__main__":
# If you want to use the ernie1.0 model, please uncomment the following code
# Registers further command-line options on ``parser`` and parses the
# command line into ``args``.
#
# NOTE(review): ``--device``, ``--params_path``, ``--max_seq_length``,
# ``--batch_size`` and ``--model_name_or_path`` are also registered on
# ``parser`` earlier in this file. With argparse's default conflict handling,
# registering an option string twice on the same parser raises
# ``argparse.ArgumentError``. This section looks like residue from merging
# several scripts -- TODO confirm which set of definitions is intended.
parser.add_argument('--device',choices=['cpu','gpu'],default="gpu",help="Select which device to train model, defaults to gpu.")
parser.add_argument("--text_pair_file",type=str,required=True,help="The full path of input file")
parser.add_argument("--params_path",type=str,required=True,help="The path to model parameters to be loaded.")
parser.add_argument("--max_seq_length",default=64,type=int,help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--batch_size",default=32,type=int,help="Batch size per GPU/CPU for training.")
parser.add_argument("--margin",default=0.0,type=float,help="Margin between pos_sample and neg_samples.")
parser.add_argument("--scale",default=20,type=int,help="Scale for pair-wise margin_rank_loss.")
parser.add_argument("--output_emb_size",default=0,type=int,help="Output_embedding_size, 0 means use hidden_size as output embedding size.")
parser.add_argument("--model_name_or_path",default='rocketqa-zh-base-query-encoder',type=str,help='The pretrained model used for training')
# NOTE(review): the command line is also parsed earlier in this file; this
# call rebinds ``args`` -- confirm only one parse is intended.
args=parser.parse_args()
# yapf: enable
defpredict(model,data_loader):
"""
Predicts the data labels.
Args:
model (obj:`SimCSE`): A model to extract text embedding or calculate similarity of text pair.
data_loader (obj:`List(Example)`): The processed data ids of text pair: [query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids]
Returns:
results(obj:`List`): cosine similarity of text pairs.
parser.add_argument("--corpus_file",type=str,required=True,help="The full path of input file")
parser.add_argument("--similar_text_pair_file",type=str,required=True,help="The full path of similar text pair file")
parser.add_argument("--recall_result_dir",type=str,default='recall_result',help="The full path of recall result file to save")
parser.add_argument("--recall_result_file",type=str,default='recall_result_file',help="The file name of recall result")
parser.add_argument("--params_path",type=str,required=True,help="The path to model parameters to be loaded.")
parser.add_argument("--max_seq_length",default=64,type=int,help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--batch_size",default=32,type=int,help="Batch size per GPU/CPU for training.")
parser.add_argument("--save_dir",default='./checkpoint',type=str,help="The output directory where the model checkpoints will be written.")
parser.add_argument("--max_seq_length",default=128,type=int,help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--batch_size",default=32,type=int,help="Batch size per GPU/CPU for training.")
parser.add_argument("--output_emb_size",default=0,type=int,help="Output_embedding_size, 0 means use hidden_size as output embedding size.")
parser.add_argument("--learning_rate",default=1e-5,type=float,help="The initial learning rate for Adam.")
parser.add_argument("--weight_decay",default=0.0,type=float,help="Weight decay if we apply some.")
parser.add_argument("--epochs",default=1,type=int,help="Total number of training epochs to perform.")
parser.add_argument("--warmup_proportion",default=0.0,type=float,help="Linear warmup proportion over the training process.")
parser.add_argument("--init_from_ckpt",type=str,default=None,help="The path of checkpoint to be loaded.")
parser.add_argument("--seed",type=int,default=1000,help="Random seed for initialization.")
parser.add_argument('--device',choices=['cpu','gpu'],default="gpu",help="Select which device to train model, defaults to gpu.")
parser.add_argument('--save_steps',type=int,default=10000,help="Step interval for saving checkpoint.")
parser.add_argument('--eval_steps',type=int,default=10000,help="Step interval for evaluation.")
parser.add_argument("--train_set_file",type=str,required=True,help="The full path of train_set_file.")
parser.add_argument("--test_set_file",type=str,required=True,help="The full path of test_set_file.")
parser.add_argument("--margin",default=0.0,type=float,help="Margin between pos_sample and neg_samples.")
parser.add_argument("--scale",default=20,type=int,help="Scale for pair-wise margin_rank_loss.")
parser.add_argument("--dropout",default=0.1,type=float,help="Dropout for pretrained model encoder.")
parser.add_argument("--infer_with_fc_pooler",action='store_true',help="Whether use fc layer after cls embedding or not for when infer.")
parser.add_argument("--model_name_or_path",default='rocketqa-zh-base-query-encoder',type=str,help='The pretrained model used for training')