python export_to_serving.py \
--dirname "output" \
--model_filename "inference.predict.pdmodel" \
--params_filename "inference.predict.pdiparams" \
--server_path "serving_server" \
--client_path "serving_client" \
--fetch_alias_names "predict"
# gpu
python -u -m paddle.distributed.launch --gpus "0" \
predict_pairwise.py \
--device gpu \
--params_path "./checkpoints/model_30000/model_state.pdparams" \
--batch_size 128 \
--max_seq_length 64 \
--input_file 'sort/test_pairwise.csv'
# cpu
# python predict_pairwise.py \
# --device cpu \
# --params_path "./checkpoints/model_30000/model_state.pdparams" \
# --batch_size 128 \
# --max_seq_length 64 \
# --input_file 'sort/test_pairwise.csv'
# gpu
python -u -m paddle.distributed.launch --gpus="0,1,2,3" train_pairwise.py \
--device gpu \
--save_dir ./checkpoints \
--batch_size 32 \
--learning_rate 2E-5 \
--margin 0.1 \
--eval_step 100 \
--train_file sort/train_pairwise.csv \
--test_file sort/dev_pairwise.csv
# cpu
# python train_pairwise.py \
# --device cpu \
# --save_dir ./checkpoints \
# --batch_size 32 \
# --learning_rate 2E-5 \
# --margin 0.1 \
# --eval_step 100 \
# --train_file sort/train_pairwise.csv \
# --test_file sort/dev_pairwise.csv
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import random
import time
from functools import partial
import numpy as np
import paddle
import pandas as pd
from data import convert_pairwise_example as convert_example
from data import create_dataloader
from model import PairwiseMatching
from tqdm import tqdm
from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.datasets import load_dataset
from paddlenlp.transformers import AutoModel, AutoTokenizer, LinearDecayWithWarmup
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--margin", default=0.2, type=float, help="Margin for pos_score and neg_score.")
parser.add_argument("--train_file", type=str, required=True, help="The full path of train file")
parser.add_argument("--test_file", type=str, required=True, help="The full path of test file")
parser.add_argument("--save_dir", default='./checkpoint', type=str, help="The output directory where the model checkpoints will be written.")
parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
parser.add_argument("--epochs", default=3, type=int, help="Total number of training epochs to perform.")
parser.add_argument("--eval_step", default=200, type=int, help="Step interval for evaluation.")
parser.add_argument('--save_step', default=10000, type=int, help="Step interval for saving checkpoint.")
parser.add_argument("--warmup_proportion", default=0.0, type=float, help="Linear warmup proportion over the training process.")
parser.add_argument("--init_from_ckpt", type=str, default=None, help="The path of checkpoint to be loaded.")
parser.add_argument('--model_name_or_path', default="ernie-3.0-medium-zh", help="The pretrained model used for training")
parser.add_argument("--seed", type=int, default=1000, help="Random seed for initialization.")
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
args = parser.parse_args()
# yapf: enable
def set_seed(seed):
"""sets random seed"""
random.seed(seed)
np.random.seed(seed)
paddle.seed(seed)
@paddle.no_grad()
def evaluate(model, metric, data_loader, phase="dev"):
"""
Given a dataset, it evals model and computes the metric.
Args:
model(obj:`paddle.nn.Layer`): A model to classify texts.
data_loader(obj:`paddle.io.DataLoader`): The dataset loader which generates batches.
metric(obj:`paddle.metric.Metric`): The evaluation metric.
"""
model.eval()
metric.reset()
for idx, batch in enumerate(data_loader):
input_ids, token_type_ids, labels = batch
pos_probs = model.predict(input_ids=input_ids, token_type_ids=token_type_ids)
neg_probs = 1.0 - pos_probs
preds = np.concatenate((neg_probs, pos_probs), axis=1)
metric.update(preds=preds, labels=labels)
print("eval_{} auc:{:.3}".format(phase, metric.accumulate()))
metric.reset()
model.train()
# Reader functions that load the raw training/evaluation data
def read(src_path, is_predict=False):
data = pd.read_csv(src_path, sep="\t")
for index, row in tqdm(data.iterrows()):
query = row["query"]
title = row["title"]
neg_title = row["neg_title"]
yield {"query": query, "title": title, "neg_title": neg_title}
def read_test(src_path, is_predict=False):
data = pd.read_csv(src_path, sep="\t")
for index, row in tqdm(data.iterrows()):
query = row["query"]
title = row["title"]
label = row["label"]
yield {"query": query, "title": title, "label": label}
def do_train():
paddle.set_device(args.device)
rank = paddle.distributed.get_rank()
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
set_seed(args.seed)
train_ds = load_dataset(read, src_path=args.train_file, lazy=False)
dev_ds = load_dataset(read_test, src_path=args.test_file, lazy=False)
pretrained_model = AutoModel.from_pretrained(args.model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
trans_func_train = partial(convert_example, tokenizer=tokenizer, max_seq_length=args.max_seq_length)
trans_func_eval = partial(convert_example, tokenizer=tokenizer, max_seq_length=args.max_seq_length, phase="eval")
batchify_fn_train = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # pos_pair_input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # pos_pair_segment
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # neg_pair_input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # neg_pair_segment
): [data for data in fn(samples)]
batchify_fn_eval = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # pair_input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # pair_segment
Stack(dtype="int64"), # label
): [data for data in fn(samples)]
train_data_loader = create_dataloader(
train_ds, mode="train", batch_size=args.batch_size, batchify_fn=batchify_fn_train, trans_fn=trans_func_train
)
dev_data_loader = create_dataloader(
dev_ds, mode="dev", batch_size=args.batch_size, batchify_fn=batchify_fn_eval, trans_fn=trans_func_eval
)
model = PairwiseMatching(pretrained_model, margin=args.margin)
if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
state_dict = paddle.load(args.init_from_ckpt)
model.set_dict(state_dict)
num_training_steps = len(train_data_loader) * args.epochs
lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_proportion)
# Generate parameter names needed to perform weight decay.
# All bias and LayerNorm parameters are excluded.
decay_params = [p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"])]
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
parameters=model.parameters(),
weight_decay=args.weight_decay,
apply_decay_param_fun=lambda x: x in decay_params,
)
metric = paddle.metric.Auc()
global_step = 0
tic_train = time.time()
for epoch in range(1, args.epochs + 1):
for step, batch in enumerate(train_data_loader, start=1):
pos_input_ids, pos_token_type_ids, neg_input_ids, neg_token_type_ids = batch
loss = model(
pos_input_ids=pos_input_ids,
neg_input_ids=neg_input_ids,
pos_token_type_ids=pos_token_type_ids,
neg_token_type_ids=neg_token_type_ids,
)
global_step += 1
if global_step % 10 == 0 and rank == 0:
print(
"global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s"
% (global_step, epoch, step, loss, 10 / (time.time() - tic_train))
)
tic_train = time.time()
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.clear_grad()
if global_step % args.eval_step == 0 and rank == 0:
evaluate(model, metric, dev_data_loader, "dev")
if global_step % args.save_step == 0 and rank == 0:
save_dir = os.path.join(args.save_dir, "model_%d" % global_step)
if not os.path.exists(save_dir):
os.makedirs(save_dir)
save_param_path = os.path.join(save_dir, "model_state.pdparams")
paddle.save(model.state_dict(), save_param_path)
tokenizer.save_pretrained(save_dir)
if __name__ == "__main__":
do_train()
# In-batch Negatives
**Contents**
* [Background](#背景介绍)
* [In-batch Negatives](#In-batchNegatives)
    * [1. Approach and Evaluation Metrics](#技术方案)
    * [2. Environment Requirements](#环境依赖)
    * [3. Code Structure](#代码结构)
    * [4. Data Preparation](#数据准备)
    * [5. Model Training](#模型训练)
    * [6. Evaluation](#评估)
    * [7. Prediction](#预测)
    * [8. Deployment](#部署)
<a name="背景介绍"></a>
# Background
Semantic indexing (loosely, vector-based retrieval) is one of the core technologies of the recall stage in search engines, recommendation systems, and advertising systems. Given an input text, a semantic index model should retrieve a batch of semantically related texts from a massive candidate corpus **quickly and accurately**. The quality of the semantic index model directly determines whether semantically relevant material is recalled into the system for downstream ranking, so it affects the whole system from the bottom up.
In the recall stage, the most common approach is a dual-tower model: learn a vector representation for each Document (Doc), build an index on the Doc side, and recall with ANN search. On top of this approach we introduce the [In-batch Negatives](https://arxiv.org/abs/2004.04906) strategy. Take the following training data with batch size 4 as an example:
```
我手机丢了,我想换个手机 我想买个新手机,求推荐
求秋色之空漫画全集 求秋色之空全集漫画
学日语软件手机上的 手机学日语的软件
侠盗飞车罪恶都市怎样改车 侠盗飞车罪恶都市怎么改车
```
The training data for the In-batch Negatives strategy consists of semantically similar text pairs. The core idea is to perform each gradient update against N negatives inside a single batch: for every Source Text, the Target Texts paired with all the other Source Texts in the batch serve as its negatives. In the example above, "我手机丢了,我想换个手机" has 1 positive ("我想买个新手机,求推荐") and 3 negatives (1. 求秋色之空全集漫画, 2. 手机学日语的软件, 3. 侠盗飞车罪恶都市怎么改车).
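The loss behind this can be written in a few lines. The sketch below assumes `query_emb` and `title_emb` are the L2-normalized [CLS] embeddings of the queries and their paired titles in one batch; it mirrors the computation implemented in `batch_negative/model.py`:
```
import paddle
import paddle.nn.functional as F

def in_batch_negatives_loss(query_emb, title_emb, margin=0.2, scale=30):
    # Similarity of every query against every title in the batch: [B, B].
    sim = paddle.matmul(query_emb, title_emb, transpose_y=True)
    # Subtract the margin from the diagonal entries only (the positive pairs).
    batch_size = query_emb.shape[0]
    margin_diag = paddle.full([batch_size], margin, dtype=sim.dtype)
    sim = sim - paddle.diag(margin_diag)
    # Scale to sharpen the softmax, then treat row i's positive as class i.
    sim *= scale
    labels = paddle.arange(0, batch_size, dtype="int64").reshape([-1, 1])
    return F.cross_entropy(input=sim, label=labels)
```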
<a name="In-batchNegatives"></a>
# In-batch Negatives
<a name="技术方案"></a>
## 1. Approach and Evaluation Metrics
### Approach
A dual-tower model trained with the In-batch Negatives strategy in the recall stage; hnswlib is used to build the index library and run the recall tests.
### Evaluation Metrics
We use Recall@1, Recall@5, Recall@10, Recall@20 and Recall@50 to evaluate the recall quality of the semantic index model.
Recall@K is the ratio of relevant results retrieved within the top K (the first K entries of the score-sorted recall list) to all relevant results in the corpus; it measures the completeness of the retrieval system. A small worked sketch follows.
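The sketch below computes Recall@K the same way `evaluate.py` does: each query contributes a 0/1 relevance list over its recalled docs, with 1 marking the ground-truth similar text.
```
import numpy as np

def recall_at_k(relevance_lists, k=10):
    # With one relevant doc per query, Recall@K is the fraction of queries
    # whose ground-truth text appears among the top K recalled results.
    return np.mean([np.sum(r[:k]) for r in relevance_lists])

rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]  # hits ranked 3rd, 2nd and 1st
print(recall_at_k(rs, k=1))  # 0.333...
print(recall_at_k(rs, k=3))  # 1.0
```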
**Evaluation results**
| Strategy | Model | Recall@1 | Recall@5 |Recall@10 |Recall@20 |Recall@50 |
| ------------ | ------------ | ------------ |--------- |--------- |--------- |--------- |
| In-batch Negatives | ernie 1.0 | 51.301 | 65.309| 69.878| 73.996|78.881|
| In-batch Negatives | rocketqa-zh-base-query-encoder | **59.622** | **75.089**| **79.668**| **83.404**|**87.773**|
<a name="环境依赖"></a>
## 2. Environment Requirements
GPU training is recommended; prediction can run on either CPU or GPU.
**Dependencies**
* python >= 3.6.2
* paddlepaddle >= 2.2.3
* paddlenlp >= 2.2
* [hnswlib](https://github.com/nmslib/hnswlib) >= 0.5.2
* visualdl >= 2.2.2
<a name="代码结构"></a>
## 3. Code Structure
```
|—— data.py # data loading and preprocessing logic
|—— base_model.py # base class of the semantic index models
|—— train_batch_neg.py # main training script for the In-batch Negatives strategy
|—— batch_negative
    |—— model.py # core network of the In-batch Negatives strategy
|—— ann_util.py # helper functions for building the ANN index
|—— recall.py # recall similar texts from the corpus with a trained semantic index model
|—— evaluate.py # compute evaluation metrics from the recall results and the evaluation set
|—— predict.py # compute the similarity of the text pairs in an input file
|—— export_model.py # convert the dynamic graph into a static graph
|—— scripts
    |—— export_model.sh # shell script for dynamic-to-static conversion
    |—— predict.sh # shell script for prediction
    |—— evaluate.sh # shell script for evaluation
    |—— run_build_index.sh # shell script for building the index
    |—— train_batch_neg.sh # shell script for training
    |—— export_to_serving.sh # shell script for converting Paddle Inference models to Serving format
|—— deploy
    |—— python
        |—— predict.py # Paddle Inference prediction
        |—— deploy.sh # Paddle Inference deployment script
        |—— rpc_client.py # Paddle Serving client
        |—— web_service.py # Paddle Serving server
        |—— config_nlp.yml # Paddle Serving configuration file
|—— inference.py # extract embeddings with the dynamic graph
|—— export_to_serving.py # convert the static graph to Serving format
```
<a name="数据准备"></a>
## 4. Data Preparation
### Dataset Description
We construct the training set, test set, and recall corpus for semantic indexing from the data of a literature retrieval platform.
The **training set** and **validation set** share the same format: the training set has 4k examples and the test set 20k. Each line is a pair of semantically similar texts separated by a tab; the first column is the query and the second column is the related literature title (plus keywords). Sample data:
```
宁夏社区图书馆服务体系布局现状分析 宁夏社区图书馆服务体系布局现状分析社区图书馆,社区图书馆服务,社区图书馆服务体系
人口老龄化对京津冀经济 京津冀人口老龄化对区域经济增长的影响京津冀,人口老龄化,区域经济增长,固定效应模型
英语广告中的模糊语 模糊语在英语广告中的应用及其功能模糊语,英语广告,表现形式,语用功能
甘氨酸二肽的合成 甘氨酸二肽合成中缩合剂的选择甘氨酸,缩合剂,二肽
```
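For reference, `data.py` consumes these tab-separated pairs with a simple generator (shown here in condensed form):
```
def read_text_pair(data_path):
    """Yield one {"text_a": query, "text_b": title} dict per well-formed line."""
    with open(data_path, "r", encoding="utf-8") as f:
        for line in f:
            data = line.rstrip().split("\t")
            if len(data) != 2:
                continue
            yield {"text_a": data[0], "text_b": data[1]}
```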
The **recall corpus** simulates the full production corpus and is used to evaluate the model's recall quality and compute the Recall metrics. It contains 300k entries in total, one column per line, consisting of a literature title (plus keywords). Sample data:
```
陕西省贫困地区城乡青春期少女生长发育调查青春期,生长发育,贫困地区
五丈岩水库溢洪道加固工程中的新材料应用碳纤维布,粘钢加固技术,超细水泥,灌浆技术
木塑复合材料在儿童卫浴家具中的应用探索木塑复合材料,儿童,卫浴家具
泡沫铝准静态轴向压缩有限元仿真泡沫铝,准静态,轴向压缩,力学特性
```
### Dataset Download
- [literature_search_data](https://bj.bcebos.com/v1/paddlenlp/data/literature_search_data.zip)
```
├── milvus # dataset for building the Milvus index
    ├── milvus_data.csv # data for building the recall corpus
├── recall # recall (semantic index) dataset
    ├── corpus.csv # recall corpus used for testing
    ├── dev.csv # recall validation set
    ├── test.csv # recall test set
    ├── train.csv # recall training set
    ├── train_unsupervised.csv # unsupervised training set
├── sort # ranking dataset
    ├── test_pairwise.csv # ranking test set
    ├── dev_pairwise.csv # ranking validation set
    └── train_pairwise.csv # ranking training set
```
<a name="模型训练"></a>
## 5. Model Training
**Download link for the trained semantic index model:**
The model below uses the following architecture: `TransformerLayer:12, Hidden:768, Heads:12, OutputEmbSize: 256`
|Model|Training configuration|Hardware|MD5|
| ------------ | ------------ | ------------ |-----------|
|[batch_neg](https://bj.bcebos.com/v1/paddlenlp/models/inbatch_model.zip)|<div style="width: 150pt">ernie 1.0 margin:0.2 scale:30 epoch:3 lr:5E-5 bs:64 max_len:64 </div>|<div style="width: 100pt">4 x V100-16G</div>|f3e5c7d7b0b718c2530c5e1b136b2d74|
### Training Environment
- NVIDIA Driver Version: 440.64.00
- Ubuntu 16.04.6 LTS (Docker)
- Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
### Single-GPU / Multi-GPU Training
Training here uses a single machine with multiple GPUs. The command below trains the model with the In-batch Negatives strategy on GPU cards 0,1,2,3; the dataset is small, so training finishes within a few minutes. For single-GPU training, simply set `--gpus` to a single card id.
To train on CPU, remove the `--gpus` argument and set `device` to cpu; see train_batch_neg.sh for the detailed settings.
Then run the following command to train the semantic index model on GPU:
```
root_path=inbatch
python -u -m paddle.distributed.launch --gpus "0,1,2,3" \
train_batch_neg.py \
--device gpu \
--save_dir ./checkpoints/${root_path} \
--batch_size 64 \
--learning_rate 5E-5 \
--epochs 3 \
--output_emb_size 256 \
--model_name_or_path rocketqa-zh-base-query-encoder \
--save_steps 10 \
--max_seq_length 64 \
--margin 0.2 \
--train_set_file recall/train.csv \
--recall_result_dir "recall_result_dir" \
--recall_result_file "recall_result.txt" \
--hnsw_m 100 \
--hnsw_ef 100 \
--recall_num 50 \
--similar_text_pair_file "recall/dev.csv" \
--corpus_file "recall/corpus.csv"
```
Parameter descriptions:
* `device`: train on cpu or gpu
* `save_dir`: directory where model checkpoints are written
* `batch_size`: training batch size
* `learning_rate`: training learning rate
* `epochs`: number of training epochs
* `output_emb_size`: dimension of the text embedding output on top of the Transformer
* `model_name_or_path`: pretrained model used to initialize the model and the `Tokenizer`
* `save_steps`: number of steps between checkpoint saves
* `max_seq_length`: maximum input sequence length
* `margin`: target gap between the similarity of positive and negative samples
* `train_set_file`: training set file
* `evaluate`: evaluate the model during training; enabled by default
* `recall_result_dir`: directory where recall results are stored
* `recall_result_file`: file name of the recall results
* `hnsw_m`: hnsw parameter; keep the default
* `hnsw_ef`: hnsw parameter; keep the default
* `recall_num`: number of similar texts recalled per query
* `similar_text_pair_file`: evaluation set made of similar text pairs
* `corpus_file`: recall corpus file
* `use_recompute`: use the Recompute strategy to save GPU memory (trades time for memory)
* `use_gradient_cache`: use the Gradient Cache strategy to save GPU memory (trades time for memory)
* `chunk_numbers`: Gradient Cache parameter; the number of chunks a batch is split into
Alternatively, use the shell script:
```
sh scripts/train_batch_neg.sh
```
<a name="评估"></a>
## 6. Evaluation
Evaluation consists of 4 steps (a condensed sketch of steps b and c is given after this list):
a. Extract the Doc embeddings
Use the semantic index model to extract the embeddings of the Doc corpus.
b. Build an index over the Doc embeddings with hnswlib
Use an ANN engine to build the index library (here the ANN index is built with [hnswlib](https://github.com/nmslib/hnswlib)).
c. Extract the Query embeddings and query for similar results
Use the semantic index model to extract the embeddings of the evaluation set *Source Text*, run ANN queries against the index built in step b, recall the Top-50 most similar *Target Text* entries, and write the recall results of the evaluation set to the `recall_result` file.
d. Evaluate
Compute Recall@K (K = 1, 5, 10, 20, 50) from the evaluation set `dev.csv` and the recall results `recall_result`.
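The following sketch condenses steps b and c, assuming `doc_embeddings` and `query_embeddings` are the embedding arrays produced in steps a and c (the full logic lives in `ann_util.py` and `recall.py`):
```
import hnswlib

def build_and_query(doc_embeddings, query_embeddings, hnsw_ef=100, hnsw_m=100, recall_num=50):
    # Step b: build an inner-product ANN index over the Doc embeddings.
    index = hnswlib.Index(space="ip", dim=doc_embeddings.shape[1])
    index.init_index(max_elements=doc_embeddings.shape[0], ef_construction=hnsw_ef, M=hnsw_m)
    index.set_ef(hnsw_ef)
    index.add_items(doc_embeddings)
    # Step c: recall the Top-K most similar docs for every query.
    recalled_ids, distances = index.knn_query(query_embeddings, k=recall_num)
    return recalled_ids, distances
```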
Run the following command to build the ANN index, perform recall, and produce the recall result file `recall_result`:
```
root_dir="checkpoints/inbatch"
python -u -m paddle.distributed.launch --gpus "3" --log_dir "recall_log/" \
recall.py \
--device gpu \
--recall_result_dir "recall_result_dir" \
--recall_result_file "recall_result.txt" \
--params_path "${root_dir}/model_40/model_state.pdparams" \
--model_name_or_path rocketqa-zh-base-query-encoder \
--hnsw_m 100 \
--hnsw_ef 100 \
--batch_size 64 \
--output_emb_size 256 \
--max_seq_length 60 \
--recall_num 50 \
--similar_text_pair "recall/dev.csv" \
--corpus_file "recall/corpus.csv"
```
Parameter descriptions:
* `device`: run on cpu or gpu
* `recall_result_dir`: directory where recall results are stored
* `recall_result_file`: file name of the recall results
* `params_path`: parameter file of the model to evaluate
* `model_name_or_path`: pretrained model used to initialize the model and the `Tokenizer`
* `hnsw_m`: hnsw parameter; keep the default
* `hnsw_ef`: hnsw parameter; keep the default
* `output_emb_size`: dimension of the text embedding output on top of the Transformer
* `recall_num`: number of similar texts recalled per query
* `similar_text_pair`: evaluation set made of similar text pairs
* `corpus_file`: recall corpus file
Alternatively, use the shell script:
```
sh scripts/run_build_index.sh
```
run_build_index.sh also contains both CPU and GPU variants; the GPU variant is the default.
After a successful run, the `recall_result.txt` file is written under `./recall_result_dir/`:
```
热处理对尼龙6 及其与聚酰胺嵌段共聚物共混体系晶体熔融行为和结晶结构的影响 热处理对尼龙6及其与聚酰胺嵌段共聚物共混体系晶体熔融行为和结晶结构的影响尼龙6,聚酰胺嵌段共聚物,芳香聚酰胺,热处理 0.9831992387771606
热处理对尼龙6 及其与聚酰胺嵌段共聚物共混体系晶体熔融行为和结晶结构的影响 热处理方法对高强高模聚乙烯醇纤维性能的影响聚乙烯醇纤维,热处理,性能,热拉伸,热定型 0.8438636660575867
热处理对尼龙6 及其与聚酰胺嵌段共聚物共混体系晶体熔融行为和结晶结构的影响 制备工艺对PVC/ABS合金力学性能和维卡软化温度的影响PVC,ABS,正交试验,力学性能,维卡软化温度 0.8130228519439697
.....
```
Next, run the following command to evaluate the results and produce the Recall@1, Recall@5, Recall@10, Recall@20 and Recall@50 metrics:
```
python -u evaluate.py \
--similar_text_pair "recall/dev.csv" \
--recall_result_file "./recall_result_dir/recall_result.txt" \
--recall_num 50
```
Alternatively, use the shell script:
```
sh scripts/evaluate.sh
```
Parameter descriptions:
* `similar_text_pair`: evaluation set made of similar text pairs, e.g. semantic_similar_pair.tsv
* `recall_result_file`: recall results for the *Source Text* in the first column of the evaluation set
* `recall_num`: number of similar texts recalled per query
After a successful run, the following metrics are printed:
```
recall@1=51.261
recall@5=65.279
recall@10=69.848
recall@20=73.971
recall@50=78.84
```
<a name="预测"></a>
## 7. Prediction
The semantic index model can be used to predict the semantic vector of a text or to compute the semantic similarity of a text pair.
### 7.1 Extracting text embeddings
Edit the input texts `id2corpus` and the model path `params_path` in inference.py:
```
params_path='checkpoints/inbatch/model_40/model_state.pdparams'
id2corpus={0:'国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据'}
```
Then run:
```
python inference.py
```
The prediction result is a 256-dimensional vector:
```
[1, 256]
[[ 0.07766181 -0.13780491 0.03388524 -0.14910668 -0.0334941 0.06780092
0.0104043 0.03168401 0.02605671 0.02088691 0.05520441 -0.0852212
.....
```
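Under the hood this is just the model's `get_pooled_embedding` applied to the tokenized text. A rough sketch of the idea, assuming the checkpoint path used above (`inference.py` wraps the same steps with batching):
```
import paddle
from paddlenlp.transformers import AutoModel, AutoTokenizer
from base_model import SemanticIndexBase

tokenizer = AutoTokenizer.from_pretrained("rocketqa-zh-base-query-encoder")
pretrained = AutoModel.from_pretrained("rocketqa-zh-base-query-encoder")
model = SemanticIndexBase(pretrained, output_emb_size=256)
model.set_dict(paddle.load("checkpoints/inbatch/model_40/model_state.pdparams"))
model.eval()

# Tokenize one text and extract its normalized embedding.
encoded = tokenizer(text="国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据", max_seq_len=64)
input_ids = paddle.to_tensor([encoded["input_ids"]])
token_type_ids = paddle.to_tensor([encoded["token_type_ids"]])
with paddle.no_grad():
    emb = model.get_pooled_embedding(input_ids, token_type_ids=token_type_ids)
print(emb.shape)  # [1, 256]
```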
### 7.2 Computing the semantic similarity of text pairs
### Preparing the prediction data
The input is a tab-separated csv file with one text pair per line. Some examples:
```
试论我国海岸带经济开发的问题与前景 试论我国海岸带经济开发的问题与前景海岸带,经济开发,问题,前景
外语阅读焦虑与英语成绩及性别的关系 外语阅读焦虑与英语成绩及性别的关系外语阅读焦虑,外语课堂焦虑,英语成绩,性别
数字图书馆 智能化图书馆
网络健康可信性研究 网络成瘾少年
```
### Running prediction
Using the demo data above, run the following command to compute the semantic similarity of the text pairs with our open-source [In-batch Negatives](https://arxiv.org/abs/2004.04906) semantic index model:
```
root_dir="checkpoints/inbatch"
python -u -m paddle.distributed.launch --gpus "0" \
predict.py \
--device gpu \
--params_path "${root_dir}/model_40/model_state.pdparams" \
--model_name_or_path rocketqa-zh-base-query-encoder \
--output_emb_size 256 \
--batch_size 128 \
--max_seq_length 64 \
--text_pair_file "recall/test.csv"
```
Parameter descriptions:
* `device`: run on cpu or gpu
* `params_path`: parameter file of the trained model
* `model_name_or_path`: pretrained model used to initialize the model and the `Tokenizer`
* `output_emb_size`: dimension of the text embedding output on top of the Transformer
* `text_pair_file`: dataset of text pairs to predict
Alternatively, run the shell script:
```
sh scripts/predict.sh
```
predict.sh contains both CPU and GPU variants; the GPU variant is the default.
It produces results like:
```
0.9717282652854919
0.9371012449264526
0.7968897223472595
0.30377304553985596
```
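Each score is the cosine similarity of the two L2-normalized sentence embeddings. A minimal sketch of that computation, matching what `predict.py` and `deploy/python/predict.py` do:
```
import numpy as np
from scipy import spatial

def pair_similarity(query_emb, title_emb):
    # 1 - cosine distance == cosine similarity.
    return float(1 - spatial.distance.cosine(query_emb, title_emb))

# toy example with two unit-length vectors
print(pair_similarity(np.array([0.6, 0.8]), np.array([0.8, 0.6])))  # 0.96
```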
<a name="部署"></a>
## 8. Deployment
### Dynamic-to-static export
First convert the dynamic-graph model into a static graph:
```
python export_model.py --params_path checkpoints/inbatch/model_40/model_state.pdparams \
--model_name_or_path rocketqa-zh-base-query-encoder \
--output_path=./output
```
Alternatively, run the shell script:
```
sh scripts/export_model.sh
```
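For reference, the conversion boils down to tracing the embedding model with `paddle.jit.save`. This is only a sketch of the idea, assuming the checkpoint and model name used elsewhere in this README; `export_model.py` wraps the equivalent logic behind command-line arguments:
```
import paddle
from paddlenlp.transformers import AutoModel
from base_model import SemanticIndexBaseStatic

pretrained = AutoModel.from_pretrained("rocketqa-zh-base-query-encoder")
model = SemanticIndexBaseStatic(pretrained, output_emb_size=256)
model.set_dict(paddle.load("checkpoints/inbatch/model_40/model_state.pdparams"))
model.eval()
# Trace the forward pass with dynamic batch size / sequence length and save the static graph.
paddle.jit.save(
    model,
    "./output/inference",
    input_spec=[
        paddle.static.InputSpec(shape=[None, None], dtype="int64"),  # input_ids
        paddle.static.InputSpec(shape=[None, None], dtype="int64"),  # token_type_ids
    ],
)
```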
### Paddle Inference prediction
Prediction can either extract embeddings or compute the similarity of two texts.
Edit the `id2corpus` / `corpus_list` samples:
```
# extract embeddings
id2corpus={0:'国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据'}
# compute similarity
corpus_list=[['中西方语言与文化的差异','中西方文化差异以及语言体现中西方文化,差异,语言体现'],
['中西方语言与文化的差异','飞桨致力于让深度学习技术的创新与应用更简单']]
```
Then run Paddle Inference:
```
python deploy/python/predict.py \
--model_dir=./output \
--model_name_or_path rocketqa-zh-base-query-encoder
```
Alternatively, run the shell script:
```
sh deploy.sh
```
The final output is the 256-dimensional feature vector and the predicted similarity of the sentence pairs:
```
(1, 256)
[[-0.0394925 -0.04474756 -0.065534 0.00939134 0.04359895 0.14659195
-0.0091779 -0.07303623 0.09413272 -0.01255222 -0.08685658 0.02762237
0.10138468 0.00962821 0.10888419 0.04553023 0.05898942 0.00694253
....
[0.959269642829895, 0.04725276678800583]
```
### Paddle Serving deployment
For detailed Paddle Serving documentation, see [Pipeline_Design](https://github.com/PaddlePaddle/Serving/blob/v0.7.0/doc/Python_Pipeline/Pipeline_Design_CN.md) and [Serving_Design](https://github.com/PaddlePaddle/Serving/blob/v0.7.0/doc/Serving_Design_CN.md). First convert the static-graph model into the Serving format:
```
python export_to_serving.py \
--dirname "output" \
--model_filename "inference.get_pooled_embedding.pdmodel" \
--params_filename "inference.get_pooled_embedding.pdiparams" \
--server_path "./serving_server" \
--client_path "./serving_client" \
--fetch_alias_names "output_embedding"
```
Parameter descriptions:
* `dirname`: path of the model to convert; both the Program structure file and the parameter file are stored in this directory
* `model_filename`: file name of the Inference Program structure of the model to convert; if set to None, `__model__` is used as the default file name
* `params_filename`: file name of the file holding all parameters of the model to convert; it only needs to be specified when all parameters are saved in a single binary file; if the parameters are stored in separate files, set it to None
* `server_path`: output path for the converted model files and server configuration; defaults to serving_server
* `client_path`: output path for the converted client configuration; defaults to serving_client
* `fetch_alias_names`: aliases for the model outputs; outputs such as output_embedding can be renamed; unset by default
* `feed_alias_names`: aliases for the model inputs; inputs such as input_ids can be renamed; unset by default
Alternatively, run the shell script:
```
sh scripts/export_to_serving.sh
```
Paddle Serving can be deployed in two ways: the Pipeline mode and the C++ mode. Both are described below.
#### Pipeline mode
Modify the `Tokenizer` used by the model:
```
self.tokenizer = AutoTokenizer.from_pretrained("rocketqa-zh-base-query-encoder")
```
Then start the Pipeline Server:
```
cd deploy/python
python web_service.py
```
Start the client to call the server.
First edit the samples to be predicted in rpc_client.py:
```
list_data = [
"国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据",
"试论翻译过程中的文化差异与语言空缺翻译过程,文化差异,语言空缺,文化对比"
]
```
Then run:
```
python deploy/python/rpc_client.py
```
The model output looks like:
```
{'0': '国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据', '1': '试论翻译过程中的文化差异与语言空缺翻译过程,文化差异,语言空缺,文化对比'}
PipelineClient::predict pack_data time:1641450851.3752182
PipelineClient::predict before time:1641450851.375738
['output_embedding']
(2, 256)
[[ 0.07830612 -0.14036864 0.03433796 -0.14967982 -0.03386067 0.06630666
0.01357943 0.03531194 0.02411093 0.02000859 0.05724002 -0.08119463
......
```
The client sent 2 texts and received 2 embedding vectors.
#### C++ mode
Start the C++ Serving server:
```
python -m paddle_serving_server.serve --model serving_server --port 9393 --gpu_id 2 --thread 5 --ir_optim True --use_trt --precision FP16
```
Or use the script:
```
sh deploy/cpp/start_server.sh
```
The client can use either HTTP or RPC. The RPC mode:
```
python deploy/cpp/rpc_client.py
```
The output looks like:
```
I0209 20:40:07.978225 20896 general_model.cpp:490] [client]logid=0,client_cost=395.695ms,server_cost=392.559ms.
time to cost :0.3960278034210205 seconds
{'output_embedding': array([[ 9.01343748e-02, -1.21870913e-01, 1.32834800e-02,
-1.57673359e-01, -2.60387752e-02, 6.98455423e-02,
1.58108603e-02, 3.89952064e-02, 3.22783105e-02,
3.49135026e-02, 7.66086206e-02, -9.12970975e-02,
6.25643134e-02, 7.21886680e-02, 7.03565404e-02,
5.44054210e-02, 3.25332815e-03, 5.01751155e-02,
......
```
The server returns the embedding vectors.
Alternatively, use the HTTP client:
```
python deploy/cpp/http_client.py
```
The output looks like:
```
(2, 64)
(2, 64)
outputs {
tensor {
float_data: 0.09013437479734421
float_data: -0.12187091261148453
float_data: 0.01328347995877266
float_data: -0.15767335891723633
......
```
The server returns the embedding vectors.
## FAQ
#### How can I continue supervised In-batch Negatives training from parameters trained with unsupervised SimCSE?
+ Load them with the `--init_from_ckpt` argument. Example:
```
python -u -m paddle.distributed.launch --gpus "0,1,2,3" \
train_batch_neg.py \
--device gpu \
--save_dir ./checkpoints/simcse_inbatch_negative \
--model_name_or_path rocketqa-zh-base-query-encoder \
--batch_size 64 \
--learning_rate 5E-5 \
--epochs 3 \
--output_emb_size 256 \
--save_steps 10 \
--max_seq_length 64 \
--margin 0.2 \
--train_set_file recall/train.csv \
--init_from_ckpt simcse/model_20000/model_state.pdparams
```
## Reference
[1] Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, Wen-tau Yih, Dense Passage Retrieval for Open-Domain Question Answering, Preprint 2020.
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# coding=UTF-8
import numpy as np
import hnswlib
from paddlenlp.utils.log import logger
def build_index(args, data_loader, model):
index = hnswlib.Index(space="ip", dim=args.output_emb_size if args.output_emb_size > 0 else 768)
# Initializing index
# max_elements - the maximum number of elements (capacity). Will throw an exception if exceeded
# during insertion of an element.
# The capacity can be increased by saving/loading the index, see below.
#
# ef_construction - controls index search speed/build speed tradeoff
#
# M - is tightly connected with internal dimensionality of the data. Strongly affects memory consumption (~M)
# Higher M leads to higher accuracy/run_time at fixed ef/efConstruction
index.init_index(max_elements=args.hnsw_max_elements, ef_construction=args.hnsw_ef, M=args.hnsw_m)
# Controlling the recall by setting ef:
# higher ef leads to better accuracy, but slower search
index.set_ef(args.hnsw_ef)
# Set number of threads used during batch search/construction
# By default using all available cores
index.set_num_threads(16)
logger.info("start build index..........")
all_embeddings = []
for text_embeddings in model.get_semantic_embedding(data_loader):
all_embeddings.append(text_embeddings.numpy())
all_embeddings = np.concatenate(all_embeddings, axis=0)
index.add_items(all_embeddings)
logger.info("Total index number:{}".format(index.get_current_count()))
return index
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import abc
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
class SemanticIndexBase(nn.Layer):
def __init__(self, pretrained_model, dropout=None, output_emb_size=None):
super().__init__()
self.ptm = pretrained_model
self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)
# if output_emb_size is not None, then add Linear layer to reduce embedding_size,
# we recommend set output_emb_size = 256 considering the trade-off between
# recall performance and efficiency
self.output_emb_size = output_emb_size
if output_emb_size > 0:
weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=0.02))
self.emb_reduce_linear = paddle.nn.Linear(
self.ptm.config.hidden_size, output_emb_size, weight_attr=weight_attr
)
def get_pooled_embedding(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
_, cls_embedding = self.ptm(input_ids, token_type_ids, position_ids, attention_mask)
if self.output_emb_size > 0:
cls_embedding = self.emb_reduce_linear(cls_embedding)
cls_embedding = self.dropout(cls_embedding)
cls_embedding = F.normalize(cls_embedding, p=2, axis=-1)
return cls_embedding
def get_semantic_embedding(self, data_loader):
self.eval()
with paddle.no_grad():
for batch_data in data_loader:
input_ids, token_type_ids = batch_data
text_embeddings = self.get_pooled_embedding(input_ids, token_type_ids=token_type_ids)
yield text_embeddings
def cosine_sim(
self,
query_input_ids,
title_input_ids,
query_token_type_ids=None,
query_position_ids=None,
query_attention_mask=None,
title_token_type_ids=None,
title_position_ids=None,
title_attention_mask=None,
):
query_cls_embedding = self.get_pooled_embedding(
query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask
)
title_cls_embedding = self.get_pooled_embedding(
title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask
)
cosine_sim = paddle.sum(query_cls_embedding * title_cls_embedding, axis=-1)
return cosine_sim
@abc.abstractmethod
def forward(self):
pass
class SemanticIndexBaseStatic(nn.Layer):
def __init__(self, pretrained_model, dropout=None, output_emb_size=None):
super().__init__()
self.ptm = pretrained_model
self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)
# if output_emb_size is not None, then add Linear layer to reduce embedding_size,
# we recommend set output_emb_size = 256 considering the trade-off between
# recall performance and efficiency
self.output_emb_size = output_emb_size
if output_emb_size > 0:
weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=0.02))
self.emb_reduce_linear = paddle.nn.Linear(
self.ptm.config.hidden_size, output_emb_size, weight_attr=weight_attr
)
@paddle.jit.to_static(
input_spec=[
paddle.static.InputSpec(shape=[None, None], dtype="int64"),
paddle.static.InputSpec(shape=[None, None], dtype="int64"),
]
)
def get_pooled_embedding(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
_, cls_embedding = self.ptm(input_ids, token_type_ids, position_ids, attention_mask)
if self.output_emb_size > 0:
cls_embedding = self.emb_reduce_linear(cls_embedding)
cls_embedding = self.dropout(cls_embedding)
cls_embedding = F.normalize(cls_embedding, p=2, axis=-1)
return cls_embedding
def get_semantic_embedding(self, data_loader):
self.eval()
with paddle.no_grad():
for batch_data in data_loader:
input_ids, token_type_ids = batch_data
text_embeddings = self.get_pooled_embedding(input_ids, token_type_ids=token_type_ids)
yield text_embeddings
def cosine_sim(
self,
query_input_ids,
title_input_ids,
query_token_type_ids=None,
query_position_ids=None,
query_attention_mask=None,
title_token_type_ids=None,
title_position_ids=None,
title_attention_mask=None,
):
query_cls_embedding = self.get_pooled_embedding(
query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask
)
title_cls_embedding = self.get_pooled_embedding(
title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask
)
cosine_sim = paddle.sum(query_cls_embedding * title_cls_embedding, axis=-1)
return cosine_sim
def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
_, cls_embedding = self.ptm(input_ids, token_type_ids, position_ids, attention_mask)
if self.output_emb_size > 0:
cls_embedding = self.emb_reduce_linear(cls_embedding)
cls_embedding = self.dropout(cls_embedding)
cls_embedding = F.normalize(cls_embedding, p=2, axis=-1)
return cls_embedding
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn.functional as F
from base_model import SemanticIndexBase
class SemanticIndexBatchNeg(SemanticIndexBase):
def __init__(self, pretrained_model, dropout=None, margin=0.3, scale=30, output_emb_size=None):
super().__init__(pretrained_model, dropout, output_emb_size)
self.margin = margin
        # Scale the cosine similarity to ease convergence
        self.scale = scale
def forward(
self,
query_input_ids,
title_input_ids,
query_token_type_ids=None,
query_position_ids=None,
query_attention_mask=None,
title_token_type_ids=None,
title_position_ids=None,
title_attention_mask=None,
):
query_cls_embedding = self.get_pooled_embedding(
query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask
)
title_cls_embedding = self.get_pooled_embedding(
title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask
)
cosine_sim = paddle.matmul(query_cls_embedding, title_cls_embedding, transpose_y=True)
        # Subtract the margin from the cosine similarity of the positive pairs (the diagonal)
margin_diag = paddle.full(
shape=[query_cls_embedding.shape[0]], fill_value=self.margin, dtype=paddle.get_default_dtype()
)
cosine_sim = cosine_sim - paddle.diag(margin_diag)
        # Scale the cosine similarities to ease training convergence
        cosine_sim *= self.scale
labels = paddle.arange(0, query_cls_embedding.shape[0], dtype="int64")
labels = paddle.reshape(labels, shape=[-1, 1])
loss = F.cross_entropy(input=cosine_sim, label=labels)
return loss
class SemanticIndexCacheNeg(SemanticIndexBase):
def __init__(self, pretrained_model, dropout=None, margin=0.3, scale=30, output_emb_size=None):
super().__init__(pretrained_model, dropout, output_emb_size)
self.margin = margin
        # Scale the cosine similarity to ease convergence
        self.scale = scale
def forward(
self,
query_input_ids,
title_input_ids,
query_token_type_ids=None,
query_position_ids=None,
query_attention_mask=None,
title_token_type_ids=None,
title_position_ids=None,
title_attention_mask=None,
):
query_cls_embedding = self.get_pooled_embedding(
query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask
)
title_cls_embedding = self.get_pooled_embedding(
title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask
)
cosine_sim = paddle.matmul(query_cls_embedding, title_cls_embedding, transpose_y=True)
        # Subtract the margin from the cosine similarity of the positive pairs (the diagonal)
margin_diag = paddle.full(shape=[query_cls_embedding.shape[0]], fill_value=self.margin, dtype=cosine_sim.dtype)
cosine_sim = cosine_sim - paddle.diag(margin_diag)
        # Scale the cosine similarities to ease training convergence
        cosine_sim *= self.scale
labels = paddle.arange(0, query_cls_embedding.shape[0], dtype="int64")
labels = paddle.reshape(labels, shape=[-1, 1])
return [cosine_sim, labels, query_cls_embedding, title_cls_embedding]
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import paddle
from paddlenlp.utils.log import logger
def create_dataloader(dataset, mode="train", batch_size=1, batchify_fn=None, trans_fn=None):
if trans_fn:
dataset = dataset.map(trans_fn)
shuffle = True if mode == "train" else False
if mode == "train":
batch_sampler = paddle.io.DistributedBatchSampler(dataset, batch_size=batch_size, shuffle=shuffle)
else:
batch_sampler = paddle.io.BatchSampler(dataset, batch_size=batch_size, shuffle=shuffle)
return paddle.io.DataLoader(dataset=dataset, batch_sampler=batch_sampler, collate_fn=batchify_fn, return_list=True)
def convert_example(example, tokenizer, max_seq_length=512, pad_to_max_seq_len=False):
"""
Builds model inputs from a sequence.
A BERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]``
Args:
example(obj:`list(str)`): The list of text to be converted to ids.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
max_seq_len(obj:`int`): The maximum total input sequence length after tokenization.
Sequences longer than this will be truncated, sequences shorter will be padded.
is_test(obj:`False`, defaults to `False`): Whether the example contains label or not.
Returns:
input_ids(obj:`list[int]`): The list of query token ids.
token_type_ids(obj: `list[int]`): List of query sequence pair mask.
"""
result = []
for key, text in example.items():
encoded_inputs = tokenizer(text=text, max_seq_len=max_seq_length, pad_to_max_seq_len=pad_to_max_seq_len)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
result += [input_ids, token_type_ids]
return result
def read_text_pair(data_path):
"""Reads data."""
with open(data_path, "r", encoding="utf-8") as f:
for line in f:
data = line.rstrip().split("\t")
if len(data) != 2:
continue
yield {"text_a": data[0], "text_b": data[1]}
def read_text_triplet(data_path):
"""Reads data."""
with open(data_path, "r", encoding="utf-8") as f:
for line in f:
data = line.rstrip().split("\t")
if len(data) != 3:
continue
yield {"text": data[0], "pos_sample": data[1], "neg_sample": data[2]}
# ANN - active learning ------------------------------------------------------
def get_latest_checkpoint(args):
"""
Return: (latest_checkpoint_path, global_step)
"""
if not os.path.exists(args.save_dir):
return args.init_from_ckpt, 0
subdirectories = list(next(os.walk(args.save_dir))[1])
def valid_checkpoint(checkpoint):
chk_path = os.path.join(args.save_dir, checkpoint)
scheduler_path = os.path.join(chk_path, "model_state.pdparams")
succeed_flag_file = os.path.join(chk_path, "succeed_flag_file")
return os.path.exists(scheduler_path) and os.path.exists(succeed_flag_file)
trained_steps = [int(s) for s in subdirectories if valid_checkpoint(s)]
if len(trained_steps) > 0:
return os.path.join(args.save_dir, str(max(trained_steps)), "model_state.pdparams"), max(trained_steps)
return args.init_from_ckpt, 0
# ANN - active learning ------------------------------------------------------
def get_latest_ann_data(ann_data_dir):
if not os.path.exists(ann_data_dir):
return None, -1
subdirectories = list(next(os.walk(ann_data_dir))[1])
def valid_checkpoint(step):
ann_data_file = os.path.join(ann_data_dir, step, "new_ann_data")
# succeed_flag_file is an empty file that indicates ann data has been generated
succeed_flag_file = os.path.join(ann_data_dir, step, "succeed_flag_file")
return os.path.exists(succeed_flag_file) and os.path.exists(ann_data_file)
ann_data_steps = [int(s) for s in subdirectories if valid_checkpoint(s)]
if len(ann_data_steps) > 0:
latest_ann_data_file = os.path.join(ann_data_dir, str(max(ann_data_steps)), "new_ann_data")
logger.info("Using latest ann_data_file:{}".format(latest_ann_data_file))
return latest_ann_data_file, max(ann_data_steps)
logger.info("no new ann_data, return (None, -1)")
return None, -1
def gen_id2corpus(corpus_file):
id2corpus = {}
with open(corpus_file, "r", encoding="utf-8") as f:
for idx, line in enumerate(f):
id2corpus[idx] = line.rstrip()
return id2corpus
def gen_text_file(similar_text_pair_file):
text2similar_text = {}
texts = []
with open(similar_text_pair_file, "r", encoding="utf-8") as f:
for line in f:
splited_line = line.rstrip().split("\t")
if len(splited_line) != 2:
continue
text, similar_text = line.rstrip().split("\t")
if not text or not similar_text:
continue
text2similar_text[text] = similar_text
texts.append({"text": text})
return texts, text2similar_text
# coding:utf-8
# pylint: disable=doc-string-missing
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import numpy as np
from paddle_serving_client import HttpClient
from paddlenlp.transformers import AutoTokenizer
def convert_example(example, tokenizer, max_seq_length=512, pad_to_max_seq_len=True):
list_input_ids = []
list_token_type_ids = []
for text in example:
encoded_inputs = tokenizer(text=text, max_seq_len=max_seq_length, pad_to_max_seq_len=pad_to_max_seq_len)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
list_input_ids.append(input_ids)
list_token_type_ids.append(token_type_ids)
return list_input_ids, list_token_type_ids
# Start the Python HTTP client
endpoint_list = ["127.0.0.1:9393"]
client = HttpClient()
client.load_client_config("serving_client")
client.connect(endpoint_list)
feed_names = client.feed_names_
fetch_names = client.fetch_names_
print(feed_names)
print(fetch_names)
# Create the tokenizer
tokenizer = AutoTokenizer.from_pretrained("rocketqa-zh-base-query-encoder")
max_seq_len = 64
# Preprocess the input data
list_data = ["国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据.", "面向生态系统服务的生态系统分类方案研发与应用"]
# for i in range(5):
# list_data.extend(list_data)
# print(len(list_data))
examples = convert_example(list_data, tokenizer, max_seq_length=max_seq_len)
print(examples)
feed_dict = {}
feed_dict["input_ids"] = np.array(examples[0])
feed_dict["token_type_ids"] = np.array(examples[1])
print(feed_dict["input_ids"].shape)
print(feed_dict["token_type_ids"].shape)
# batch=True enables batch prediction
b_start = time.time()
result = client.predict(feed=feed_dict, fetch=fetch_names, batch=True)
b_end = time.time()
print(result)
print("time to cost :{} seconds".format(b_end - b_start))
# coding:utf-8
# pylint: disable=doc-string-missing
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import numpy as np
from paddle_serving_client import Client
from paddlenlp.transformers import AutoTokenizer
def convert_example(example, tokenizer, max_seq_length=512, pad_to_max_seq_len=True):
list_input_ids = []
list_token_type_ids = []
for text in example:
encoded_inputs = tokenizer(text=text, max_seq_len=max_seq_length, pad_to_max_seq_len=pad_to_max_seq_len)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
list_input_ids.append(input_ids)
list_token_type_ids.append(token_type_ids)
return list_input_ids, list_token_type_ids
# Start the Python RPC client
endpoint_list = ["127.0.0.1:9393"]
client = Client()
client.load_client_config("serving_client")
client.connect(endpoint_list)
feed_names = client.feed_names_
fetch_names = client.fetch_names_
print(feed_names)
print(fetch_names)
# Create the tokenizer
tokenizer = AutoTokenizer.from_pretrained("rocketqa-zh-base-query-encoder")
max_seq_len = 64
# Preprocess the input data
list_data = ["国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据.", "面向生态系统服务的生态系统分类方案研发与应用"]
# for i in range(5):
# list_data.extend(list_data)
# print(len(list_data))
examples = convert_example(list_data, tokenizer, max_seq_length=max_seq_len)
print(examples)
feed_dict = {}
feed_dict["input_ids"] = np.array(examples[0])
feed_dict["token_type_ids"] = np.array(examples[1])
print(feed_dict["input_ids"].shape)
print(feed_dict["token_type_ids"].shape)
# batch=True enables batch prediction
b_start = time.time()
result = client.predict(feed=feed_dict, fetch=fetch_names, batch=True)
b_end = time.time()
print("time to cost :{} seconds".format(b_end - b_start))
print(result)
python -m paddle_serving_server.serve --model serving_server --port 9393 --gpu_id 2 --thread 5 --ir_optim True --use_trt --precision FP16
# worker_num: maximum concurrency. When build_dag_each_worker=True, the framework creates worker_num processes, each with its own gRPC server and DAG.
# When build_dag_each_worker=False, the framework sets max_workers=worker_num for the main thread's gRPC thread pool.
worker_num: 20
# build_dag_each_worker: False builds one DAG inside the process; True creates an independent DAG in each worker process.
build_dag_each_worker: false
dag:
    # op resource type: True for the thread model, False for the process model
    is_thread_op: False
    # profiling: True generates Timeline performance data (with some overhead); False disables it
    tracer:
        interval_s: 10
# HTTP port. rpc_port and http_port must not both be empty. When rpc_port is set and http_port is empty, http_port is not generated automatically.
http_port: 18082
# RPC port. rpc_port and http_port must not both be empty. When rpc_port is empty and http_port is set, rpc_port is automatically set to http_port + 1.
rpc_port: 8080
op:
    ernie:
        # concurrency: thread-level concurrency when is_thread_op=True, otherwise process-level concurrency
        concurrency: 1
        # when the op has no server_endpoints, the local service configuration is read from local_service_conf
        local_service_conf:
            # client type: brpc, grpc or local_predictor; local_predictor runs prediction inside the process without starting a Serving service
            client_type: local_predictor
            # ir_optim
            ir_optim: True
            # device_type: 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
            device_type: 1
            # device ids: "" or unset means CPU prediction; "0" or "0,1,2" selects the GPU cards to use
            devices: '2'
            # fetch list, using the alias_name of fetch_var in client_config; if unset, all outputs are returned
            fetch_list: ['output_embedding']
            # model path
            model_config: ../../serving_server/
python predict.py --model_dir=../../output
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import sys
import paddle
from paddle import inference
from scipy import spatial
from paddlenlp.data import Pad, Tuple
from paddlenlp.transformers import AutoTokenizer
from paddlenlp.utils.log import logger
sys.path.append(".")
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--model_dir", type=str, required=True, help="The directory to static model.")
parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--batch_size", default=15, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument('--model_name_or_path', default="rocketqa-zh-base-query-encoder", help="model name.")
parser.add_argument('--use_tensorrt', default=False, type=eval, choices=[True, False], help='Enable to use tensorrt to speed up.')
parser.add_argument("--precision", default="fp32", type=str, choices=["fp32", "fp16", "int8"], help='The tensorrt precision.')
parser.add_argument('--cpu_threads', default=10, type=int, help='Number of threads to predict when using cpu.')
parser.add_argument('--enable_mkldnn', default=False, type=eval, choices=[True, False], help='Enable to use mkldnn to speed up when using cpu.')
parser.add_argument("--benchmark", type=eval, default=False, help="To log some information about environment and running.")
parser.add_argument("--save_log_path", type=str, default="./log_output/", help="The file path to save log.")
args = parser.parse_args()
# yapf: enable
def convert_example(example, tokenizer, max_seq_length=512, pad_to_max_seq_len=False):
"""
Builds model inputs from a sequence.
A BERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]``
Args:
example(obj:`list(str)`): The list of text to be converted to ids.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
max_seq_len(obj:`int`): The maximum total input sequence length after tokenization.
Sequences longer than this will be truncated, sequences shorter will be padded.
is_test(obj:`False`, defaults to `False`): Whether the example contains label or not.
Returns:
input_ids(obj:`list[int]`): The list of query token ids.
token_type_ids(obj: `list[int]`): List of query sequence pair mask.
"""
result = []
for key, text in example.items():
encoded_inputs = tokenizer(text=text, max_seq_len=max_seq_length, pad_to_max_seq_len=pad_to_max_seq_len)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
result += [input_ids, token_type_ids]
return result
class Predictor(object):
def __init__(
self,
model_dir,
device="gpu",
max_seq_length=128,
batch_size=32,
use_tensorrt=False,
precision="fp32",
cpu_threads=10,
enable_mkldnn=False,
):
self.max_seq_length = max_seq_length
self.batch_size = batch_size
model_file = model_dir + "/inference.pdmodel"
params_file = model_dir + "/inference.pdiparams"
if not os.path.exists(model_file):
raise ValueError("not find model file path {}".format(model_file))
if not os.path.exists(params_file):
raise ValueError("not find params file path {}".format(params_file))
config = paddle.inference.Config(model_file, params_file)
if device == "gpu":
# set GPU configs accordingly
# such as initialize the gpu memory, enable tensorrt
config.enable_use_gpu(100, 0)
precision_map = {
"fp16": inference.PrecisionType.Half,
"fp32": inference.PrecisionType.Float32,
"int8": inference.PrecisionType.Int8,
}
precision_mode = precision_map[precision]
if args.use_tensorrt:
config.enable_tensorrt_engine(
max_batch_size=batch_size, min_subgraph_size=30, precision_mode=precision_mode
)
elif device == "cpu":
# set CPU configs accordingly,
# such as enable_mkldnn, set_cpu_math_library_num_threads
config.disable_gpu()
if args.enable_mkldnn:
# cache 10 different shapes for mkldnn to avoid memory leak
config.set_mkldnn_cache_capacity(10)
config.enable_mkldnn()
config.set_cpu_math_library_num_threads(args.cpu_threads)
elif device == "xpu":
# set XPU configs accordingly
config.enable_xpu(100)
config.switch_use_feed_fetch_ops(False)
self.predictor = paddle.inference.create_predictor(config)
self.input_handles = [self.predictor.get_input_handle(name) for name in self.predictor.get_input_names()]
self.output_handle = self.predictor.get_output_handle(self.predictor.get_output_names()[0])
if args.benchmark:
import auto_log
pid = os.getpid()
self.autolog = auto_log.AutoLogger(
model_name=args.model_name_or_path,
model_precision=precision,
batch_size=self.batch_size,
data_shape="dynamic",
save_path=args.save_log_path,
inference_config=config,
pids=pid,
process_name=None,
gpu_ids=0,
time_keys=["preprocess_time", "inference_time", "postprocess_time"],
warmup=0,
logger=logger,
)
def extract_embedding(self, data, tokenizer):
"""
Predicts the data labels.
Args:
data (obj:`List(str)`): The batch data whose each element is a raw text.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
Returns:
results(obj:`dict`): All the feature vectors.
"""
if args.benchmark:
self.autolog.times.start()
examples = []
for text in data:
input_ids, segment_ids = convert_example(text, tokenizer)
examples.append((input_ids, segment_ids))
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id), # input
Pad(axis=0, pad_val=tokenizer.pad_token_id), # segment
): fn(samples)
if args.benchmark:
self.autolog.times.stamp()
input_ids, segment_ids = batchify_fn(examples)
self.input_handles[0].copy_from_cpu(input_ids)
self.input_handles[1].copy_from_cpu(segment_ids)
self.predictor.run()
logits = self.output_handle.copy_to_cpu()
if args.benchmark:
self.autolog.times.stamp()
if args.benchmark:
self.autolog.times.end(stamp=True)
return logits
def predict(self, data, tokenizer):
"""
Predicts the data labels.
Args:
data (obj:`List(str)`): The batch data whose each element is a raw text.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
Returns:
results(obj:`dict`): All the predictions probs.
"""
if args.benchmark:
self.autolog.times.start()
examples = []
for idx, text in enumerate(data):
input_ids, segment_ids = convert_example({idx: text[0]}, tokenizer)
title_ids, title_segment_ids = convert_example({idx: text[1]}, tokenizer)
examples.append((input_ids, segment_ids, title_ids, title_segment_ids))
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id), # input
Pad(axis=0, pad_val=tokenizer.pad_token_id), # segment
Pad(axis=0, pad_val=tokenizer.pad_token_id), # segment
Pad(axis=0, pad_val=tokenizer.pad_token_id), # segment
): fn(samples)
if args.benchmark:
self.autolog.times.stamp()
query_ids, query_segment_ids, title_ids, title_segment_ids = batchify_fn(examples)
self.input_handles[0].copy_from_cpu(query_ids)
self.input_handles[1].copy_from_cpu(query_segment_ids)
self.predictor.run()
query_logits = self.output_handle.copy_to_cpu()
self.input_handles[0].copy_from_cpu(title_ids)
self.input_handles[1].copy_from_cpu(title_segment_ids)
self.predictor.run()
title_logits = self.output_handle.copy_to_cpu()
if args.benchmark:
self.autolog.times.stamp()
if args.benchmark:
self.autolog.times.end(stamp=True)
result = [float(1 - spatial.distance.cosine(arr1, arr2)) for arr1, arr2 in zip(query_logits, title_logits)]
return result
if __name__ == "__main__":
# Define predictor to do prediction.
predictor = Predictor(
args.model_dir,
args.device,
args.max_seq_length,
args.batch_size,
args.use_tensorrt,
args.precision,
args.cpu_threads,
args.enable_mkldnn,
)
    # ErnieTinyTokenizer is special for the ernie-tiny pretrained model.
output_emb_size = 256
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
id2corpus = {0: "国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据"}
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
res = predictor.extract_embedding(corpus_list, tokenizer)
print(res.shape)
print(res)
corpus_list = [["中西方语言与文化的差异", "中西方文化差异以及语言体现中西方文化,差异,语言体现"], ["中西方语言与文化的差异", "飞桨致力于让深度学习技术的创新与应用更简单"]]
res = predictor.predict(corpus_list, tokenizer)
print(res)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import numpy as np
from paddle_serving_server.pipeline import PipelineClient
client = PipelineClient()
client.connect(["127.0.0.1:8080"])
list_data = ["国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据", "试论翻译过程中的文化差异与语言空缺翻译过程,文化差异,语言空缺,文化对比"]
feed = {}
for i, item in enumerate(list_data):
feed[str(i)] = item
print(feed)
start_time = time.time()
ret = client.predict(feed_dict=feed)
end_time = time.time()
print("time to cost :{} seconds".format(end_time - start_time))
result = np.array(eval(ret.value[0]))
print(ret.key)
print(result.shape)
print(result)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from paddle_serving_server.web_service import Op, WebService
_LOGGER = logging.getLogger()
def convert_example(example, tokenizer, max_seq_length=512, pad_to_max_seq_len=False):
result = []
for text in example:
encoded_inputs = tokenizer(text=text, max_seq_len=max_seq_length, pad_to_max_seq_len=pad_to_max_seq_len)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
result += [input_ids, token_type_ids]
return result
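# For a single text this returns [input_ids, token_type_ids], which is how ErnieOp.preprocess
# below unpacks it.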
class ErnieOp(Op):
def init_op(self):
from paddlenlp.transformers import AutoTokenizer
self.tokenizer = AutoTokenizer.from_pretrained("rocketqa-zh-base-query-encoder")
def preprocess(self, input_dicts, data_id, log_id):
from paddlenlp.data import Pad, Tuple
((_, input_dict),) = input_dicts.items()
print("input dict", input_dict)
batch_size = len(input_dict.keys())
examples = []
for i in range(batch_size):
input_ids, segment_ids = convert_example([input_dict[str(i)]], self.tokenizer)
examples.append((input_ids, segment_ids))
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=self.tokenizer.pad_token_id, dtype="int64"), # input
Pad(axis=0, pad_val=self.tokenizer.pad_token_type_id, dtype="int64"), # segment
): fn(samples)
input_ids, segment_ids = batchify_fn(examples)
feed_dict = {}
feed_dict["input_ids"] = input_ids
feed_dict["token_type_ids"] = segment_ids
return feed_dict, False, None, ""
def postprocess(self, input_dicts, fetch_dict, data_id, log_id):
new_dict = {}
new_dict["output_embedding"] = str(fetch_dict["output_embedding"].tolist())
return new_dict, None, ""
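# Note: postprocess above returns the embedding as a stringified Python list; the RPC client
# shown earlier reconstructs it with eval() and np.array() on the returned value.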
class ErnieService(WebService):
def get_pipeline_response(self, read_op):
ernie_op = ErnieOp(name="ernie", input_ops=[read_op])
return ernie_op
ernie_service = ErnieService(name="ernie")
ernie_service.prepare_pipeline_config("config_nlp.yml")
ernie_service.run_service()
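# Starting the service (illustrative; the script name is an assumption, config_nlp.yml is as
# referenced above):
#   python web_service.py
# Once it is listening, the PipelineClient script earlier in this commit can query it.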
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import numpy as np
import time
parser = argparse.ArgumentParser()
parser.add_argument("--similar_text_pair", type=str,
default='', help="The full path of similar pair file")
parser.add_argument("--recall_result_file", type=str,
default='', help="The full path of recall result file")
parser.add_argument("--recall_num", type=int, default=10,
help="Most similar number of doc recalled from corpus per query")
args = parser.parse_args()
def recall(rs, N=10):
"""
Ratio of recalled Ground Truth at topN Recalled Docs
>>> rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
>>> recall(rs, N=1)
0.333333
>>> recall(rs, N=2)
>>> 0.6666667
>>> recall(rs, N=3)
>>> 1.0
Args:
rs: Iterator of recalled flag()
Returns:
Recall@N
"""
recall_flags = [np.sum(r[0:N]) for r in rs]
return np.mean(recall_flags)
if __name__ == "__main__":
text2similar = {}
with open(args.similar_text_pair, "r", encoding="utf-8") as f:
for line in f:
text, similar_text = line.rstrip().split("\t")
text2similar[text] = similar_text
rs = []
with open(args.recall_result_file, "r", encoding="utf-8") as f:
relevance_labels = []
for index, line in enumerate(f):
if index % args.recall_num == 0 and index != 0:
rs.append(relevance_labels)
relevance_labels = []
            text, recalled_text, cosine_sim = line.rstrip().split("\t")
            if text2similar[text] == recalled_text:
                relevance_labels.append(1)
            else:
                relevance_labels.append(0)
        # Flush the final group of labels; the loop above only appends on batch boundaries.
        if relevance_labels:
            rs.append(relevance_labels)
recall_N = []
recall_num = [1, 5, 10, 20, 50]
for topN in recall_num:
R = round(100 * recall(rs, N=topN), 3)
recall_N.append(str(R))
    res = []
    timestamp = time.strftime("%Y%m%d-%H%M%S", time.localtime())
    res.append(timestamp)
    for key, val in zip(recall_num, recall_N):
        print("recall@{}={}".format(key, val))
        res.append(str(val))
    with open("result.tsv", "a") as result:
        result.write("\t".join(res) + "\n")
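# Expected inputs (inferred from the parsing above): --similar_text_pair is a tab-separated
# file of "query<TAB>ground_truth_text" lines, and --recall_result_file holds
# "query<TAB>recalled_text<TAB>cosine_sim" lines grouped in blocks of --recall_num per query.
# Illustrative run (the file names here are assumptions):
#   python evaluate.py --similar_text_pair dev_pairs.tsv --recall_result_file recall_result.txt --recall_num 50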
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import paddle
from base_model import SemanticIndexBaseStatic
from paddlenlp.transformers import AutoModel, AutoTokenizer
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--params_path", type=str, required=True,
default='./checkpoint/model_900/model_state.pdparams', help="The path to model parameters to be loaded.")
parser.add_argument('--model_name_or_path', default="rocketqa-zh-base-query-encoder", help="Select model to train, defaults to rocketqa-zh-base-query-encoder.")
parser.add_argument("--output_path", type=str, default='./output',
help="The path of model parameter in static graph to be saved.")
args = parser.parse_args()
# yapf: enable
if __name__ == "__main__":
output_emb_size = 256
pretrained_model = AutoModel.from_pretrained(args.model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
model = SemanticIndexBaseStatic(pretrained_model, output_emb_size=output_emb_size)
if args.params_path and os.path.isfile(args.params_path):
state_dict = paddle.load(args.params_path)
model.set_dict(state_dict)
print("Loaded parameters from %s" % args.params_path)
else:
raise ValueError("Please set --params_path with correct pretrained model file")
model.eval()
# Convert to static graph with specific input description
model = paddle.jit.to_static(
model,
input_spec=[
paddle.static.InputSpec(shape=[None, None], dtype="int64"), # input_ids
paddle.static.InputSpec(shape=[None, None], dtype="int64"), # segment_ids
],
)
# Save in static graph model.
save_path = os.path.join(args.output_path, "inference")
paddle.jit.save(model, save_path)
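# paddle.jit.save with this prefix typically writes inference.pdmodel, inference.pdiparams
# and inference.pdiparams.info under --output_path; pass those file names to the serving
# export script below via its --model_filename / --params_filename flags.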
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import paddle_serving_client.io as serving_io
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--dirname", type=str, required=True,
default='./output', help="Path of saved model files. Program file and parameter files are saved in this directory.")
parser.add_argument("--model_filename", type=str, required=True,
default='inference.get_pooled_embedding.pdmodel', help="The name of file to load the inference program. If it is None, the default filename __model__ will be used.")
parser.add_argument("--params_filename", type=str, required=True,
default='inference.get_pooled_embedding.pdiparams', help="The name of file to load all parameters. It is only used for the case that all parameters were saved in a single binary file. If parameters were saved in separate files, set it as None. Default: None.")
parser.add_argument("--server_path", type=str, default='./serving_server',
help="The path of server parameter in static graph to be saved.")
parser.add_argument("--client_path", type=str, default='./serving_client',
help="The path of client parameter in static graph to be saved.")
parser.add_argument("--feed_alias_names", type=str, default=None,
help='set alias names for feed vars, split by comma \',\', you should run --show_proto to check the number of feed vars')
parser.add_argument("--fetch_alias_names", type=str, default=None,
help='set alias names for feed vars, split by comma \',\', you should run --show_proto to check the number of fetch vars')
parser.add_argument("--show_proto", type=bool, default=False,
help='If yes, you can preview the proto and then determine your feed var alias name and fetch var alias name.')
# yapf: enable
if __name__ == "__main__":
args = parser.parse_args()
serving_io.inference_model_to_serving(
dirname=args.dirname,
serving_server=args.server_path,
serving_client=args.client_path,
model_filename=args.model_filename,
params_filename=args.params_filename,
show_proto=args.show_proto,
feed_alias_names=args.feed_alias_names,
fetch_alias_names=args.fetch_alias_names,
)