#!/bin/bash
unset CUDA_VISIBLE_DEVICES
export CUDA_VISIBLE_DEVICES=0
python predict.py \
--device 'gpu' \
--params_path checkpoints/model_80000/model_state.pdparams \
--model_name_or_path rocketqa-base-cross-encoder \
--test_set data/test.csv \
--topk 10 \
--batch_size 128 \
--max_seq_length 384
#!/bin/bash
unset CUDA_VISIBLE_DEVICES
python -u -m paddle.distributed.launch --gpus "0,1,2,3" --log_dir="logs" train_ce.py \
--device gpu \
--train_set data/train.csv \
--test_file data/dev_pairwise.csv \
--save_dir ./checkpoints \
--model_name_or_path rocketqa-base-cross-encoder \
--batch_size 32 \
--save_steps 10000 \
--max_seq_len 384 \
--learning_rate 1E-5 \
--weight_decay 0.01 \
--warmup_proportion 0.0 \
--logging_steps 10 \
--seed 1 \
--epochs 3 \
--eval_step 1000
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import random
import time
from functools import partial
import numpy as np
import paddle
import paddle.nn.functional as F
from data import convert_example, create_dataloader, read_data
from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.datasets import load_dataset
from paddlenlp.trainer.argparser import strtobool
from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--save_dir", default='./checkpoint', type=str, help="The output directory where the model checkpoints will be written.")
parser.add_argument("--train_set", type=str, required=True, help="The full path of train_set_file.")
parser.add_argument("--test_file", type=str, required=True, help="The full path of test file")
parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
parser.add_argument("--epochs", default=3, type=int, help="Total number of training epochs to perform.")
parser.add_argument("--warmup_proportion", default=0.0, type=float, help="Linear warmup proportion over the training process.")
parser.add_argument("--valid_steps", default=100, type=int, help="The interval steps to evaluate model performance.")
parser.add_argument("--save_steps", default=100, type=int, help="The interval steps to save checkppoints.")
parser.add_argument("--logging_steps", default=10, type=int, help="The interval steps to logging.")
parser.add_argument("--init_from_ckpt", type=str, default=None, help="The path of checkpoint to be loaded.")
parser.add_argument("--seed", type=int, default=1000, help="random seed for initialization")
parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu', 'npu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument("--use_amp", type=strtobool, default=False, help="Enable mixed precision training.")
parser.add_argument("--scale_loss", type=float, default=2**15, help="The value of scale_loss for fp16.")
parser.add_argument('--model_name_or_path', default="rocketqa-base-cross-encoder", help="The pretrained model used for training")
parser.add_argument("--eval_step", default=200, type=int, help="Step interval for evaluation.")
args = parser.parse_args()
# yapf: enable
@paddle.no_grad()
def evaluate(model, metric, data_loader, phase="dev"):
"""
    Given a dataset, it evaluates the model and computes the metric.
Args:
model(obj:`paddle.nn.Layer`): A model to classify texts.
data_loader(obj:`paddle.io.DataLoader`): The dataset loader which generates batches.
metric(obj:`paddle.metric.Metric`): The evaluation metric.
"""
model.eval()
metric.reset()
for idx, batch in enumerate(data_loader):
input_ids, token_type_ids, labels = batch
pos_probs = model(input_ids=input_ids, token_type_ids=token_type_ids)
sim_score = F.softmax(pos_probs)
metric.update(preds=sim_score.numpy(), labels=labels)
print("eval_{} auc:{:.3}".format(phase, metric.accumulate()))
metric.reset()
model.train()
def set_seed(seed):
"""sets random seed"""
random.seed(seed)
np.random.seed(seed)
paddle.seed(seed)
def do_train():
paddle.set_device(args.device)
rank = paddle.distributed.get_rank()
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
dev_count = paddle.distributed.get_world_size()
set_seed(args.seed)
train_ds = load_dataset(read_data, data_path=args.train_set, lazy=False)
dev_ds = load_dataset(read_data, data_path=args.test_file, lazy=False)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
model = AutoModelForSequenceClassification.from_pretrained(args.model_name_or_path, num_classes=2)
trans_func = partial(convert_example, tokenizer=tokenizer, max_seq_length=args.max_seq_length, is_pair=True)
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # segment
Stack(dtype="int64"), # label
): [data for data in fn(samples)]
train_data_loader = create_dataloader(
train_ds, mode="train", batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func
)
dev_data_loader = create_dataloader(
dev_ds, mode="dev", batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func
)
if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
state_dict = paddle.load(args.init_from_ckpt)
model.set_dict(state_dict)
model = paddle.DataParallel(model)
num_training_examples = len(train_ds)
    # e.g. 4 GPUs in data-parallel training
max_train_steps = args.epochs * num_training_examples // args.batch_size // dev_count
warmup_steps = int(max_train_steps * args.warmup_proportion)
print("Device count: %d" % dev_count)
print("Num train examples: %d" % num_training_examples)
print("Max train steps: %d" % max_train_steps)
print("Num warmup steps: %d" % warmup_steps)
# Generate parameter names needed to perform weight decay.
# All bias and LayerNorm parameters are excluded.
decay_params = [p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"])]
optimizer = paddle.optimizer.AdamW(
learning_rate=args.learning_rate,
parameters=model.parameters(),
weight_decay=args.weight_decay,
apply_decay_param_fun=lambda x: x in decay_params,
grad_clip=paddle.nn.ClipGradByGlobalNorm(1.0),
)
criterion = paddle.nn.loss.CrossEntropyLoss()
metric = paddle.metric.Auc()
global_step = 0
tic_train = time.time()
for epoch in range(1, args.epochs + 1):
for step, batch in enumerate(train_data_loader, start=1):
input_ids, token_type_ids, labels = batch
logits = model(input_ids, token_type_ids)
loss = criterion(logits, labels)
probs = F.softmax(logits, axis=1)
acc = paddle.metric.accuracy(input=probs, label=labels)
loss.backward()
optimizer.step()
optimizer.clear_grad()
global_step += 1
if global_step % args.logging_steps == 0 and rank == 0:
time_diff = time.time() - tic_train
print(
"global step %d, epoch: %d, batch: %d, loss: %.5f, accuracy: %.5f, speed: %.2f step/s"
% (global_step, epoch, step, loss, acc, args.logging_steps / time_diff)
)
tic_train = time.time()
if global_step % args.eval_step == 0 and rank == 0:
evaluate(model, metric, dev_data_loader, "dev")
if global_step % args.save_steps == 0 and rank == 0:
save_dir = os.path.join(args.save_dir, "model_%d" % global_step)
model_to_save = model._layers if isinstance(model, paddle.DataParallel) else model
model_to_save.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
tic_train = time.time()
# save final checkpoint
save_dir = os.path.join(args.save_dir, "model_%d" % global_step)
model_to_save = model._layers if isinstance(model, paddle.DataParallel) else model
model_to_save.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
if __name__ == "__main__":
do_train()
**Table of Contents**
* [Background](#背景介绍)
* [ERNIE-Gram](#ERNIE-Gram)
    * [1. Technical Solution and Evaluation Metrics](#技术方案)
    * [2. Environment Requirements](#环境依赖)
    * [3. Code Structure](#代码结构)
    * [4. Data Preparation](#数据准备)
    * [5. Model Training](#模型训练)
    * [6. Evaluation](#评估)
    * [7. Prediction](#预测)
    * [8. Deployment](#部署)
<a name="背景介绍"></a>
# Background
This project trains a pair-wise ranking model based on ERNIE-Gram. The pair-wise matching model suits scenarios where the similarity of a text pair is fed as one feature into an upstream ranking module.
<a name="ERNIE-Gram"></a>
# ERNIE-Gram
<a name="技术方案"></a>
## 1. Technical Solution and Evaluation Metrics
### Technical Solution
A pair-wise single-tower matching model: the query and title are concatenated and fed into the ERNIE-Gram pretrained model, which is trained with `margin_ranking_loss`.
### Evaluation Metrics
AUC is used to evaluate the ranking quality of the model.
**Evaluation Results**
| Model | AUC |
| ------------ | ------------ |
| ERNIE-Gram | 0.801 |
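The pair-wise objective can be illustrated with Paddle's built-in `margin_ranking_loss` (a minimal sketch with made-up scores; the actual computation lives in `model.py`):
```python
import paddle
import paddle.nn.functional as F

# Sigmoid similarity scores of the (query, title) and (query, neg_title) pairs.
pos_sim = paddle.to_tensor([[0.8], [0.6]])
neg_sim = paddle.to_tensor([[0.3], [0.7]])

# label = 1 means pos_sim is expected to rank above neg_sim.
labels = paddle.full(shape=[2, 1], fill_value=1.0, dtype="float32")

# loss = mean(max(0, -labels * (pos_sim - neg_sim) + margin))
loss = F.margin_ranking_loss(pos_sim, neg_sim, labels, margin=0.1)
print(loss.numpy())  # only the second pair violates the margin of 0.1
```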
<a name="环境依赖"></a>
## 2. Environment Requirements and Installation
**Dependencies**
* python >= 3.x
* paddlepaddle >= 2.1.3
* paddlenlp >= 2.2
* pandas >= 0.25.1
* scipy >= 1.3.1
<a name="代码结构"></a>
## 3. Code Structure
The main code structure of this project is as follows:
```
ernie_matching/
├── deploy                         # deployment
    ├── cpp
        ├── rpc_client.py          # RPC client script
        ├── http_client.py         # HTTP client script
        └── start_server.sh        # script to start the C++ server
    └── python
        ├── deploy.sh              # bash script for inference deployment
        ├── config_nlp.yml         # Pipeline configuration file
        ├── web_service.py         # Pipeline server script
        ├── rpc_client.py          # Pipeline RPC client script
        └── predict.py             # Python inference deployment example
├── scripts
    ├── export_model.sh            # bash script for exporting dynamic-graph parameters to a static graph
    ├── export_to_serving.sh       # bash script for exporting the model to the Paddle Serving format
    ├── train_pairwise.sh          # bash script for training the pair-wise single-tower matching model
    ├── evaluate.sh                # bash script for evaluation
    ├── predict_pairwise.sh        # bash script for prediction with the pair-wise single-tower matching model
├── export_model.py                # script for exporting dynamic-graph parameters to a static graph
├── export_to_serving.py           # script for exporting the model to the Paddle Serving format
├── model.py                       # network definition of the pair-wise matching model
├── data.py                        # conversion logic for pair-wise training samples and random negative sampling
├── train_pairwise.py              # training script for the pair-wise single-tower matching model
├── evaluate.py                    # evaluation script
├── predict_pairwise.py            # prediction script for the pair-wise single-tower matching model; outputs the similarity of each text pair
```
<a name="数据准备"></a>
## 4. Data Preparation
### Dataset Description
Sample data (tab-separated):
```
个人所得税税务筹划 基于新个税视角下的个人所得税纳税筹划分析新个税;个人所得税;纳税筹划 个人所得税工资薪金税务筹划研究个人所得税,工资薪金,税务筹划
液压支架底座受力分析 ZY4000/09/19D型液压支架的有限元分析液压支架,有限元分析,两端加载,偏载,扭转 基于ANSYS的液压支架多工况受力分析液压支架,四种工况,仿真分析,ANSYS,应力集中,优化
迟发性血管痉挛 西洛他唑治疗动脉瘤性蛛网膜下腔出血后脑血管痉挛的Meta分析西洛他唑,蛛网膜下腔出血,脑血管痉挛,Meta分析 西洛他唑治疗动脉瘤性蛛网膜下腔出血后脑血管痉挛的Meta分析西洛他唑,蛛网膜下腔出血,脑血管痉挛,Meta分析
氧化亚硅 复合溶胶-凝胶一锅法制备锂离子电池氧化亚硅/碳复合负极材料氧化亚硅,溶胶-凝胶法,纳米颗粒,负极,锂离子电池 负载型聚酰亚胺-二氧化硅-银杂化膜的制备和表征聚酰亚胺,二氧化硅,银,杂化膜,促进传输
```
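Each line of the pair-wise training set can be parsed as sketched below (an illustrative helper, not part of the project; it assumes a headerless file with three tab-separated columns: query, positive title, negative title):
```python
def read_pairwise_examples(path):
    """Yield {'query', 'title', 'neg_title'} dicts from a 3-column TSV file (illustrative sketch)."""
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.rstrip("\n").split("\t")
            if len(fields) != 3:  # skip malformed lines
                continue
            yield {"query": fields[0], "title": fields[1], "neg_title": fields[2]}

# Usage: print the first parsed example.
# print(next(read_pairwise_examples("data/train_pairwise.csv")))
```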
### Dataset Download
- [literature_search_data](https://bj.bcebos.com/v1/paddlenlp/data/literature_search_data.zip)
```
├── milvus                         # dataset for building the Milvus index
    ├── milvus_data.csv            # data used to build the recall corpus
├── recall                         # recall (semantic index) dataset
    ├── corpus.csv                 # recall corpus used for testing
    ├── dev.csv                    # recall dev set
    ├── test.csv                   # recall test set
    ├── train.csv                  # recall train set
    ├── train_unsupervised.csv     # unsupervised train set
├── sort                           # ranking dataset
    ├── test_pairwise.csv          # ranking test set
    ├── dev_pairwise.csv           # ranking dev set
    └── train_pairwise.csv         # ranking train set
```
<a name="模型训练"></a>
## 5. Model Training
**Ranking model download:**
|Model|Training Configuration|Hardware|MD5|
| ------------ | ------------ | ------------ |-----------|
|[ERNIE-Gram-Sort](https://bj.bcebos.com/v1/paddlenlp/models/ernie_gram_sort.zip)|<div style="width: 150pt">epoch:3 lr:5E-5 bs:64 max_len:64 </div>|<div style="width: 100pt">4x V100-16G</div>|d24ece68b7c3626ce6a24baa58dd297d|
### Training Environment
- NVIDIA Driver Version: 440.64.00
- Ubuntu 16.04.6 LTS (Docker)
- Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
### Single-GPU / Multi-GPU Training
The command below runs single-machine multi-GPU training on GPUs 0, 1, 2, 3 based on ERNIE-Gram. The dataset is fairly large, so training takes about 20 hours and 10 minutes. For single-GPU training, simply set the `--gpus` argument to a single card id.
The training command is:
```
python -u -m paddle.distributed.launch --gpus "0,1,2,3" train_pairwise.py \
--device gpu \
--save_dir ./checkpoints \
--batch_size 32 \
--learning_rate 2E-5 \
--margin 0.1 \
--eval_step 100 \
--train_file data/train_pairwise.csv \
--test_file data/dev_pairwise.csv
```
Alternatively, run the bash script:
```
sh scripts/train_pairwise.sh
```
<a name="评估"></a>
## 6. Evaluation
```
unset CUDA_VISIBLE_DEVICES
python -u -m paddle.distributed.launch --gpus "0" evaluate.py \
--device gpu \
--batch_size 32 \
--learning_rate 2E-5 \
--init_from_ckpt "./checkpoints/model_30000/model_state.pdparams" \
--test_file data/dev_pairwise.csv
```
Alternatively, run the bash script:
```
sh scripts/evaluate.sh
```
A successful run prints the following metric:
```
eval_dev auc:0.796
```
<a name="预测"></a>
## 7. Prediction
### Prepare Prediction Data
The input is a tab-separated tsv file in which each line contains a text pair and its semantic-index similarity score, for example:
```
中西方语言与文化的差异 第二语言习得的一大障碍就是文化差异。 0.5160342454910278
中西方语言与文化的差异 跨文化视角下中国文化对外传播路径琐谈跨文化,中国文化,传播,翻译 0.5145505666732788
中西方语言与文化的差异 从中西方民族文化心理的差异看英汉翻译语言,文化,民族文化心理,思维方式,翻译 0.5141439437866211
中西方语言与文化的差异 中英文化差异对翻译的影响中英文化,差异,翻译的影响 0.5138794183731079
中西方语言与文化的差异 浅谈文化与语言习得文化,语言,文化与语言的关系,文化与语言习得意识,跨文化交际 0.5131710171699524
```
### Run Prediction
Using the demo data above, run the following command to compute the semantic similarity of each text pair with the open-sourced ERNIE-Gram model:
```shell
python -u -m paddle.distributed.launch --gpus "0" \
predict_pairwise.py \
--device gpu \
--params_path "./checkpoints/model_30000/model_state.pdparams"\
--batch_size 128 \
--max_seq_length 64 \
--input_file 'sort/test_pairwise.csv'
```
Alternatively, run:
```
sh scripts/predict_pairwise.sh
```
The output contains the query, the title, and the predicted probability:
```
{'query': '中西方语言与文化的差异', 'title': '第二语言习得的一大障碍就是文化差异。', 'pred_prob': 0.85112214}
{'query': '中西方语言与文化的差异', 'title': '跨文化视角下中国文化对外传播路径琐谈跨文化,中国文化,传播,翻译', 'pred_prob': 0.78629625}
{'query': '中西方语言与文化的差异', 'title': '从中西方民族文化心理的差异看英汉翻译语言,文化,民族文化心理,思维方式,翻译', 'pred_prob': 0.91767526}
{'query': '中西方语言与文化的差异', 'title': '中英文化差异对翻译的影响中英文化,差异,翻译的影响', 'pred_prob': 0.8601749}
{'query': '中西方语言与文化的差异', 'title': '浅谈文化与语言习得文化,语言,文化与语言的关系,文化与语言习得意识,跨文化交际', 'pred_prob': 0.8944413}
```
<a name="部署"></a>
## 8. Deployment
### Export to Static Graph
First, convert the dynamic-graph model into a static graph:
```
python export_model.py --params_path checkpoints/model_30000/model_state.pdparams \
--output_path=./output \
--model_name_or_path ernie-3.0-medium-zh
```
Alternatively, run the bash script:
```
sh scripts/export_model.sh
```
### Paddle Inference
Run inference with Paddle Inference:
```
python deploy/python/predict.py --model_dir ./output \
--input_file sort/test_pairwise.csv \
--model_name_or_path ernie-3.0-medium-zh
```
Alternatively, run the bash script:
```
sh deploy/python/deploy.sh
```
The output shows each sample's query, title, and the corresponding probability:
```
Data: {'query': '中西方语言与文化的差异', 'title': '第二语言习得的一大障碍就是文化差异。'} prob: [0.8511221]
Data: {'query': '中西方语言与文化的差异', 'title': '跨文化视角下中国文化对外传播路径琐谈跨文化,中国文化,传播,翻译'} prob: [0.7862964]
Data: {'query': '中西方语言与文化的差异', 'title': '从中西方民族文化心理的差异看英汉翻译语言,文化,民族文化心理,思维方式,翻译'} prob: [0.91767514]
Data: {'query': '中西方语言与文化的差异', 'title': '中英文化差异对翻译的影响中英文化,差异,翻译的影响'} prob: [0.8601747]
Data: {'query': '中西方语言与文化的差异', 'title': '浅谈文化与语言习得文化,语言,文化与语言的关系,文化与语言习得意识,跨文化交际'} prob: [0.8944413]
```
### Paddle Serving Deployment
For detailed Paddle Serving documentation, see [Pipeline_Design](https://github.com/PaddlePaddle/Serving/blob/v0.7.0/doc/Python_Pipeline/Pipeline_Design_CN.md) and [Serving_Design](https://github.com/PaddlePaddle/Serving/blob/v0.7.0/doc/Serving_Design_CN.md). First, convert the static-graph model into the Serving format:
```
python export_to_serving.py \
--dirname "output" \
--model_filename "inference.predict.pdmodel" \
--params_filename "inference.predict.pdiparams" \
--server_path "serving_server" \
--client_path "serving_client" \
--fetch_alias_names "predict"
```
Parameter descriptions:
* `dirname`: Path of the model files to convert; the Program file and the parameter files are both stored in this directory.
* `model_filename`: Name of the file storing the Inference Program structure of the model to convert. If set to None, `__model__` is used as the default file name.
* `params_filename`: Name of the file storing all model parameters. It only needs to be specified when all parameters are saved in a single binary file; if the parameters are stored in separate files, set it to None.
* `server_path`: Output path of the converted server-side model and configuration files. Defaults to serving_server.
* `client_path`: Output path of the converted client-side configuration files. Defaults to serving_client.
* `fetch_alias_names`: Alias names for model outputs; each output can be renamed. Not set by default.
* `feed_alias_names`: Alias names for model inputs, such as input_ids; each input can be renamed. Not set by default.
Alternatively, run the bash script:
```
sh scripts/export_to_serving.sh
```
Paddle Serving can be deployed in two ways: the Pipeline mode and the C++ mode. Both are described below.
#### Pipeline Mode
Modify the `Tokenizer` (in `web_service.py`):
```
self.tokenizer = AutoTokenizer.from_pretrained('ernie-3.0-medium-zh')
```
Start the Pipeline Server:
```
python web_service.py
```
Start a client to call the server.
First, modify the sample to be predicted in rpc_client.py:
```
list_data = [{"query":"中西方语言与文化的差异","title":"第二语言习得的一大障碍就是文化差异。"}]`
```
Then run:
```
python rpc_client.py
```
The output of the model is:
```
PipelineClient::predict pack_data time:1656912047.5986433
PipelineClient::predict before time:1656912047.599081
time to cost :0.012039899826049805 seconds
(1, 1)
[[0.85112208]]
```
The client sent one text pair and got back its similarity probability.
#### C++ Mode
Start the C++ Serving server:
```
python -m paddle_serving_server.serve --model serving_server --port 8600 --gpu_id 0 --thread 5 --ir_optim True
```
Or use the script:
```
sh deploy/cpp/start_server.sh
```
The client can use either HTTP or RPC. For RPC:
```
python deploy/cpp/rpc_client.py
```
The output is:
```
I0704 05:19:00.443437 1987 general_model.cpp:490] [client]logid=0,client_cost=8.477ms,server_cost=6.458ms.
time to cost :0.008707761764526367 seconds
{'predict': array([[0.8511221]], dtype=float32)}
```
The server returns the similarity result.
Alternatively, use the HTTP client:
```
python deploy/cpp/http_client.py
```
The output is:
```
time to cost :0.006819009780883789 seconds
[0.8511220812797546]
```
The server returns the similarity result.
You can also send an HTTP request with curl:
```
curl -XPOST http://0.0.0.0:8600/GeneralModelService/inference -d ' {"tensor":[{"int64_data":[ 1, 12, 213, 58, 405, 545, 54, 68, 73,
5, 859, 712, 2, 131, 177, 405, 545, 489,
116, 5, 7, 19, 843, 1767, 113, 10, 68,
73, 859, 712, 12043, 2],"elem_type":0,"name":"input_ids","alias_name":"input_ids","shape":[1,32]},
{"int64_data":[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1],"elem_type":0,"name":"token_type_ids","alias_name":"token_type_ids","shape":[1,32]}
],
"fetch_var_names":["sigmoid_2.tmp_0"],
"log_id":0
}'
```
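The same request can be sent from Python with the `requests` library (a sketch; it reuses the token ids from the curl example above and assumes the C++ server is listening locally on port 8600):
```python
import requests

# Token ids taken from the curl example above (an already tokenized query/title pair).
input_ids = [1, 12, 213, 58, 405, 545, 54, 68, 73, 5, 859, 712, 2, 131, 177, 405,
             545, 489, 116, 5, 7, 19, 843, 1767, 113, 10, 68, 73, 859, 712, 12043, 2]
token_type_ids = [0] * 13 + [1] * 19  # 0 for the query segment, 1 for the title segment

payload = {
    "tensor": [
        {"int64_data": input_ids, "elem_type": 0, "name": "input_ids",
         "alias_name": "input_ids", "shape": [1, 32]},
        {"int64_data": token_type_ids, "elem_type": 0, "name": "token_type_ids",
         "alias_name": "token_type_ids", "shape": [1, 32]},
    ],
    "fetch_var_names": ["sigmoid_2.tmp_0"],
    "log_id": 0,
}
resp = requests.post("http://127.0.0.1:8600/GeneralModelService/inference", json=payload)
print(resp.json())
```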
## Reference
[1] Xiao, Dongling, Yu-Kun Li, Han Zhang, Yu Sun, Hao Tian, Hua Wu, and Haifeng Wang. “ERNIE-Gram: Pre-Training with Explicitly N-Gram Masked Language Modeling for Natural Language Understanding.” ArXiv:2010.12148 [Cs].
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import numpy as np
from paddlenlp.datasets import MapDataset
def create_dataloader(dataset, mode="train", batch_size=1, batchify_fn=None, trans_fn=None):
if trans_fn:
dataset = dataset.map(trans_fn)
shuffle = True if mode == "train" else False
if mode == "train":
batch_sampler = paddle.io.DistributedBatchSampler(dataset, batch_size=batch_size, shuffle=shuffle)
else:
batch_sampler = paddle.io.BatchSampler(dataset, batch_size=batch_size, shuffle=shuffle)
return paddle.io.DataLoader(dataset=dataset, batch_sampler=batch_sampler, collate_fn=batchify_fn, return_list=True)
def read_text_pair(data_path):
"""Reads data."""
with open(data_path, "r", encoding="utf-8") as f:
for line in f:
data = line.rstrip().split("\t")
if len(data) != 3:
continue
yield {"query": data[0], "title": data[1]}
def convert_pointwise_example(example, tokenizer, max_seq_length=512, is_test=False):
query, title = example["query"], example["title"]
encoded_inputs = tokenizer(text=query, text_pair=title, max_seq_len=max_seq_length)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
if not is_test:
label = np.array([example["label"]], dtype="int64")
return input_ids, token_type_ids, label
else:
return input_ids, token_type_ids
def convert_pairwise_example(example, tokenizer, max_seq_length=512, phase="train"):
if phase == "train":
query, pos_title, neg_title = example["query"], example["title"], example["neg_title"]
pos_inputs = tokenizer(text=query, text_pair=pos_title, max_seq_len=max_seq_length)
neg_inputs = tokenizer(text=query, text_pair=neg_title, max_seq_len=max_seq_length)
pos_input_ids = pos_inputs["input_ids"]
pos_token_type_ids = pos_inputs["token_type_ids"]
neg_input_ids = neg_inputs["input_ids"]
neg_token_type_ids = neg_inputs["token_type_ids"]
return (pos_input_ids, pos_token_type_ids, neg_input_ids, neg_token_type_ids)
else:
query, title = example["query"], example["title"]
inputs = tokenizer(text=query, text_pair=title, max_seq_len=max_seq_length)
input_ids = inputs["input_ids"]
token_type_ids = inputs["token_type_ids"]
if phase == "eval":
return input_ids, token_type_ids, example["label"]
elif phase == "predict":
return input_ids, token_type_ids
else:
raise ValueError("not supported phase:{}".format(phase))
def gen_pair(dataset, pool_size=100):
"""
Generate triplet randomly based on dataset
Args:
dataset: A `MapDataset` or `IterDataset` or a tuple of those.
Each example is composed of 2 texts: example["query"], example["title"]
        pool_size: the size of the pool from which negative examples are randomly sampled
Return:
dataset: A `MapDataset` or `IterDataset` or a tuple of those.
            Each example is composed of 3 texts: example["query"], example["title"], example["neg_title"]
"""
if len(dataset) < pool_size:
pool_size = len(dataset)
new_examples = []
pool = []
tmp_examples = []
for example in dataset:
label = example["label"]
# Filter negative example
if label == 0:
continue
tmp_examples.append(example)
pool.append(example["title"])
if len(pool) >= pool_size:
np.random.shuffle(pool)
for idx, example in enumerate(tmp_examples):
example["neg_title"] = pool[idx]
new_examples.append(example)
tmp_examples = []
pool = []
else:
continue
return MapDataset(new_examples)
# coding:utf-8
# pylint: disable=doc-string-missing
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import numpy as np
from paddle_serving_client.httpclient import HttpClient
import paddlenlp as ppnlp
def convert_example(example, tokenizer, max_seq_length=512):
query, title = example["query"], example["title"]
encoded_inputs = tokenizer(text=query, text_pair=title, max_seq_len=max_seq_length)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
return input_ids, token_type_ids
# Start the Python client
endpoint_list = ["127.0.0.1:8600"]
client = HttpClient()
client.load_client_config("serving_client")
client.connect(endpoint_list)
feed_names = client.feed_names_
fetch_names = client.fetch_names_
# Create the tokenizer
tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained("ernie-gram-zh")
max_seq_len = 64
# Preprocess the data
list_data = [{"query": "中西方语言与文化的差异", "title": "第二语言习得的一大障碍就是文化差异。"}]
input_ids, token_type_ids = [], []
for example in list_data:
input_id, token_type_id = convert_example(example, tokenizer, max_seq_length=max_seq_len)
input_ids.append(input_id)
token_type_ids.append(token_type_id)
feed_dict = {}
feed_dict["input_ids"] = np.array(input_ids)
feed_dict["token_type_ids"] = np.array(token_type_ids)
# batch=True means batch prediction
b_start = time.time()
result = client.predict(feed=feed_dict, fetch=fetch_names, batch=True)
b_end = time.time()
print(result)
print("time to cost :{} seconds".format(b_end - b_start))
print(result.outputs[0].tensor[0].float_data)
# coding:utf-8
# pylint: disable=doc-string-missing
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import numpy as np
from paddle_serving_client import Client
import paddlenlp as ppnlp
def convert_example(example, tokenizer, max_seq_length=512):
query, title = example["query"], example["title"]
encoded_inputs = tokenizer(text=query, text_pair=title, max_seq_len=max_seq_length)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
return input_ids, token_type_ids
# Start the Python client
endpoint_list = ["127.0.0.1:8600"]
client = Client()
client.load_client_config("serving_client")
client.connect(endpoint_list)
feed_names = client.feed_names_
fetch_names = client.fetch_names_
# Create the tokenizer
tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained("ernie-gram-zh")
max_seq_len = 64
# Preprocess the data
list_data = [{"query": "中西方语言与文化的差异", "title": "第二语言习得的一大障碍就是文化差异。"}]
input_ids, token_type_ids = [], []
for example in list_data:
input_id, token_type_id = convert_example(example, tokenizer, max_seq_length=max_seq_len)
input_ids.append(input_id)
token_type_ids.append(token_type_id)
feed_dict = {}
feed_dict["input_ids"] = np.array(input_ids)
feed_dict["token_type_ids"] = np.array(token_type_ids)
# batch=True means batch prediction
b_start = time.time()
result = client.predict(feed=feed_dict, fetch=fetch_names, batch=True)
b_end = time.time()
print("time to cost :{} seconds".format(b_end - b_start))
print(result)
python -m paddle_serving_server.serve --model serving_server --port 8600 --gpu_id 0 --thread 5 --ir_optim True
# worker_num: maximum concurrency. When build_dag_each_worker=True, the framework creates worker_num processes, each with its own gRPC server and DAG.
# When build_dag_each_worker=False, the framework sets max_workers=worker_num for the main thread's gRPC thread pool.
worker_num: 20
# build_dag_each_worker: False builds one DAG inside the current process; True builds an independent DAG in each worker process.
build_dag_each_worker: false
dag:
    # Op resource type: True for the thread model, False for the process model.
    is_thread_op: False
    # Profiling: True generates Timeline performance data (with some overhead); False disables it.
    tracer:
        interval_s: 10
# HTTP port. rpc_port and http_port must not both be empty. When rpc_port is valid and http_port is empty, no http_port is generated automatically.
http_port: 8088
# RPC port. rpc_port and http_port must not both be empty. When rpc_port is empty and http_port is not, rpc_port is automatically set to http_port + 1.
rpc_port: 8089
op:
    ernie:
        # Concurrency: thread-level when is_thread_op=True, otherwise process-level.
        concurrency: 1
        # When the op has no server_endpoints, the local service configuration is read from local_service_conf.
        local_service_conf:
            # Client type: brpc, grpc or local_predictor. local_predictor runs prediction in-process without starting a Serving service.
            client_type: local_predictor
            # ir_optim
            ir_optim: True
            # device_type: 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
            device_type: 1
            # Compute device IDs. Empty or unset means CPU prediction; "0" or "0,1,2" means GPU prediction on the listed cards.
            devices: "0"
            # Fetch list, using the alias_name of fetch_var in client_config. If unset, all outputs are returned.
            fetch_list: ['predict']
            # Model path
            model_config: ../../serving_server/
python deploy/python/predict.py --model_dir ./output \
--input_file sort/test_pairwise.csv \
--model_name_or_path ernie-3.0-medium-zh
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import sys
import numpy as np
import paddle
from paddle import inference
from paddlenlp.data import Pad, Tuple
from paddlenlp.datasets import load_dataset
from paddlenlp.transformers import AutoTokenizer
from paddlenlp.utils.log import logger
sys.path.append(".")
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--model_dir", type=str, required=True, help="The directory to static model.")
parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument("--input_file", type=str, required=True, help="The test set file.")
parser.add_argument('--use_tensorrt', default=False, type=eval, choices=[True, False], help='Enable to use tensorrt to speed up.')
parser.add_argument("--precision", default="fp32", type=str, choices=["fp32", "fp16", "int8"], help='The tensorrt precision.')
parser.add_argument('--cpu_threads', default=10, type=int, help='Number of threads to predict when using cpu.')
parser.add_argument('--enable_mkldnn', default=False, type=eval, choices=[True, False], help='Enable to use mkldnn to speed up when using cpu.')
parser.add_argument("--benchmark", type=eval, default=False, help="To log some information about environment and running.")
parser.add_argument("--save_log_path", type=str, default="./log_output/", help="The file path to save log.")
parser.add_argument('--model_name_or_path', default="ernie-3.0-medium-zh", help="The pretrained model used for training")
args = parser.parse_args()
# yapf: enable
def read_text_pair(data_path):
"""Reads data."""
with open(data_path, "r", encoding="utf-8") as f:
for line in f:
data = line.rstrip().split("\t")
if len(data) != 3:
continue
yield {"query": data[0], "title": data[1]}
def convert_example(example, tokenizer, max_seq_length=512, is_test=False):
query, title = example["query"], example["title"]
encoded_inputs = tokenizer(text=query, text_pair=title, max_seq_len=max_seq_length)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
if not is_test:
label = np.array([example["label"]], dtype="int64")
return input_ids, token_type_ids, label
else:
return input_ids, token_type_ids
class Predictor(object):
def __init__(
self,
model_dir,
device="gpu",
max_seq_length=128,
batch_size=32,
use_tensorrt=False,
precision="fp32",
cpu_threads=10,
enable_mkldnn=False,
):
self.max_seq_length = max_seq_length
self.batch_size = batch_size
model_file = model_dir + "/inference.predict.pdmodel"
params_file = model_dir + "/inference.predict.pdiparams"
if not os.path.exists(model_file):
raise ValueError("not find model file path {}".format(model_file))
if not os.path.exists(params_file):
raise ValueError("not find params file path {}".format(params_file))
config = paddle.inference.Config(model_file, params_file)
if device == "gpu":
# set GPU configs accordingly
# such as initialize the gpu memory, enable tensorrt
config.enable_use_gpu(100, 0)
precision_map = {
"fp16": inference.PrecisionType.Half,
"fp32": inference.PrecisionType.Float32,
"int8": inference.PrecisionType.Int8,
}
precision_mode = precision_map[precision]
if args.use_tensorrt:
config.enable_tensorrt_engine(
max_batch_size=batch_size, min_subgraph_size=30, precision_mode=precision_mode
)
elif device == "cpu":
# set CPU configs accordingly,
# such as enable_mkldnn, set_cpu_math_library_num_threads
config.disable_gpu()
if args.enable_mkldnn:
# cache 10 different shapes for mkldnn to avoid memory leak
config.set_mkldnn_cache_capacity(10)
config.enable_mkldnn()
config.set_cpu_math_library_num_threads(args.cpu_threads)
elif device == "xpu":
# set XPU configs accordingly
config.enable_xpu(100)
config.switch_use_feed_fetch_ops(False)
self.predictor = paddle.inference.create_predictor(config)
self.input_handles = [self.predictor.get_input_handle(name) for name in self.predictor.get_input_names()]
self.output_handle = self.predictor.get_output_handle(self.predictor.get_output_names()[0])
if args.benchmark:
import auto_log
pid = os.getpid()
self.autolog = auto_log.AutoLogger(
model_name="ernie-tiny",
model_precision=precision,
batch_size=self.batch_size,
data_shape="dynamic",
save_path=args.save_log_path,
inference_config=config,
pids=pid,
process_name=None,
gpu_ids=0,
time_keys=["preprocess_time", "inference_time", "postprocess_time"],
warmup=0,
logger=logger,
)
def predict(self, data, tokenizer):
"""
Predicts the data labels.
Args:
data (obj:`List(str)`): The batch data whose each element is a raw text.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
label_map(obj:`dict`): The label id (key) to label str (value) map.
Returns:
results(obj:`dict`): All the predictions labels.
"""
if args.benchmark:
self.autolog.times.start()
examples = []
for text in data:
input_ids, segment_ids = convert_example(text, tokenizer, max_seq_length=self.max_seq_length, is_test=True)
examples.append((input_ids, segment_ids))
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # input
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # segment
): fn(samples)
if args.benchmark:
self.autolog.times.stamp()
input_ids, segment_ids = batchify_fn(examples)
self.input_handles[0].copy_from_cpu(input_ids)
self.input_handles[1].copy_from_cpu(segment_ids)
self.predictor.run()
sim_score = self.output_handle.copy_to_cpu()
if args.benchmark:
self.autolog.times.stamp()
if args.benchmark:
self.autolog.times.end(stamp=True)
return sim_score
if __name__ == "__main__":
# Define predictor to do prediction.
predictor = Predictor(
args.model_dir,
args.device,
args.max_seq_length,
args.batch_size,
args.use_tensorrt,
args.precision,
args.cpu_threads,
args.enable_mkldnn,
)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
test_ds = load_dataset(read_text_pair, data_path=args.input_file, lazy=False)
data = [{"query": d["query"], "title": d["title"]} for d in test_ds]
batches = [data[idx : idx + args.batch_size] for idx in range(0, len(data), args.batch_size)]
results = []
for batch_data in batches:
results.extend(predictor.predict(batch_data, tokenizer))
for idx, text in enumerate(data):
print("Data: {} \t prob: {}".format(text, results[idx]))
if args.benchmark:
predictor.autolog.report()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import numpy as np
from paddle_serving_server.pipeline import PipelineClient
client = PipelineClient()
client.connect(["127.0.0.1:8089"])
list_data = [{"query": "中西方语言与文化的差异", "title": "第二语言习得的一大障碍就是文化差异。"}]
feed = {}
for i, item in enumerate(list_data):
feed[str(i)] = str(item)
print(feed)
start_time = time.time()
ret = client.predict(feed_dict=feed)
end_time = time.time()
print("time to cost :{} seconds".format(end_time - start_time))
result = np.array(eval(ret.value[0]))
print(result.shape)
print(result)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
from paddle_serving_server.web_service import Op, WebService
def convert_example(example, tokenizer, max_seq_length=512):
query, title = example["query"], example["title"]
encoded_inputs = tokenizer(text=query, text_pair=title, max_seq_len=max_seq_length)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
return input_ids, token_type_ids
class ErnieOp(Op):
def init_op(self):
from paddlenlp.transformers import AutoTokenizer
self.tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-medium-zh")
def preprocess(self, input_dicts, data_id, log_id):
from paddlenlp.data import Pad, Tuple
((_, input_dict),) = input_dicts.items()
print("input dict", input_dict)
batch_size = len(input_dict.keys())
examples = []
for i in range(batch_size):
example = json.loads(input_dict[str(i)].replace("'", '"'))
input_ids, segment_ids = convert_example(example, self.tokenizer)
examples.append((input_ids, segment_ids))
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=self.tokenizer.pad_token_id, dtype="int64"), # input
Pad(axis=0, pad_val=self.tokenizer.pad_token_type_id, dtype="int64"), # segment
): fn(samples)
input_ids, segment_ids = batchify_fn(examples)
feed_dict = {}
feed_dict["input_ids"] = input_ids
feed_dict["token_type_ids"] = segment_ids
return feed_dict, False, None, ""
def postprocess(self, input_dicts, fetch_dict, data_id, log_id):
new_dict = {}
new_dict["predict"] = str(fetch_dict["predict"].tolist())
return new_dict, None, ""
class ErnieService(WebService):
def get_pipeline_response(self, read_op):
ernie_op = ErnieOp(name="ernie", input_ops=[read_op])
return ernie_op
ernie_service = ErnieService(name="ernie")
ernie_service.prepare_pipeline_config("config_nlp.yml")
ernie_service.run_service()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import random
from functools import partial
import numpy as np
import paddle
import pandas as pd
from data import convert_pairwise_example as convert_example
from data import create_dataloader
from model import PairwiseMatching
from tqdm import tqdm
from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.datasets import load_dataset
from paddlenlp.transformers import AutoModel, AutoTokenizer
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--margin", default=0.1, type=float, help="Margin for pos_score and neg_score.")
parser.add_argument("--test_file", type=str, required=True, help="The full path of test file")
parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument('--model_name_or_path', default="ernie-3.0-medium-zh", help="The pretrained model used for training")
parser.add_argument("--init_from_ckpt", type=str, default=None, help="The path of checkpoint to be loaded.")
parser.add_argument("--seed", type=int, default=1000, help="Random seed for initialization.")
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
args = parser.parse_args()
# yapf: enable
def set_seed(seed):
"""sets random seed"""
random.seed(seed)
np.random.seed(seed)
paddle.seed(seed)
@paddle.no_grad()
def evaluate(model, metric, data_loader, phase="dev"):
"""
    Given a dataset, it evaluates the model and computes the metric.
Args:
model(obj:`paddle.nn.Layer`): A model to classify texts.
data_loader(obj:`paddle.io.DataLoader`): The dataset loader which generates batches.
metric(obj:`paddle.metric.Metric`): The evaluation metric.
"""
model.eval()
metric.reset()
for idx, batch in enumerate(data_loader):
input_ids, token_type_ids, labels = batch
pos_probs = model.predict(input_ids=input_ids, token_type_ids=token_type_ids)
neg_probs = 1.0 - pos_probs
preds = np.concatenate((neg_probs, pos_probs), axis=1)
metric.update(preds=preds, labels=labels)
print("eval_{} auc:{:.3}".format(phase, metric.accumulate()))
metric.reset()
model.train()
# Build the reader functions for the raw data
def read(src_path, is_predict=False):
data = pd.read_csv(src_path, sep="\t")
for index, row in tqdm(data.iterrows()):
query = row["query"]
title = row["title"]
neg_title = row["neg_title"]
yield {"query": query, "title": title, "neg_title": neg_title}
def read_test(src_path, is_predict=False):
data = pd.read_csv(src_path, sep="\t")
for index, row in tqdm(data.iterrows()):
query = row["query"]
title = row["title"]
label = row["label"]
yield {"query": query, "title": title, "label": label}
def main():
paddle.set_device(args.device)
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
set_seed(args.seed)
dev_ds = load_dataset(read_test, src_path=args.test_file, lazy=False)
print(dev_ds[0])
pretrained_model = AutoModel.from_pretrained(args.model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
trans_func_eval = partial(convert_example, tokenizer=tokenizer, max_seq_length=args.max_seq_length, phase="eval")
batchify_fn_eval = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # pair_input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # pair_segment
Stack(dtype="int64"), # label
): [data for data in fn(samples)]
dev_data_loader = create_dataloader(
dev_ds, mode="dev", batch_size=args.batch_size, batchify_fn=batchify_fn_eval, trans_fn=trans_func_eval
)
model = PairwiseMatching(pretrained_model, margin=args.margin)
if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
state_dict = paddle.load(args.init_from_ckpt)
model.set_dict(state_dict)
metric = paddle.metric.Auc()
evaluate(model, metric, dev_data_loader, "dev")
if __name__ == "__main__":
main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import paddle
from model import PairwiseMatching
from paddlenlp.transformers import AutoModel, AutoTokenizer
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--params_path", type=str, required=True, default='./checkpoint/model_900/model_state.pdparams', help="The path to model parameters to be loaded.")
parser.add_argument("--output_path", type=str, default='./output', help="The path of model parameter in static graph to be saved.")
parser.add_argument('--model_name_or_path', default="ernie-3.0-medium-zh", help="The pretrained model used for training")
args = parser.parse_args()
# yapf: enable
if __name__ == "__main__":
pretrained_model = AutoModel.from_pretrained(args.model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
model = PairwiseMatching(pretrained_model)
if args.params_path and os.path.isfile(args.params_path):
state_dict = paddle.load(args.params_path)
model.set_dict(state_dict)
print("Loaded parameters from %s" % args.params_path)
else:
raise ValueError("Please set --params_path with correct pretrained model file")
model.eval()
# Convert to static graph with specific input description
model = paddle.jit.to_static(
model,
input_spec=[
paddle.static.InputSpec(shape=[None, None], dtype="int64"), # input_ids
paddle.static.InputSpec(shape=[None, None], dtype="int64"), # segment_ids
],
)
# Save in static graph model.
save_path = os.path.join(args.output_path, "inference")
paddle.jit.save(model, save_path)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import paddle_serving_client.io as serving_io
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--dirname", type=str, required=True,
default='./output', help="Path of saved model files. Program file and parameter files are saved in this directory.")
parser.add_argument("--model_filename", type=str, required=True,
default='inference.get_pooled_embedding.pdmodel', help="The name of file to load the inference program. If it is None, the default filename __model__ will be used.")
parser.add_argument("--params_filename", type=str, required=True,
default='inference.get_pooled_embedding.pdiparams', help="The name of file to load all parameters. It is only used for the case that all parameters were saved in a single binary file. If parameters were saved in separate files, set it as None. Default: None.")
parser.add_argument("--server_path", type=str, default='./serving_server',
help="The path of server parameter in static graph to be saved.")
parser.add_argument("--client_path", type=str, default='./serving_client',
help="The path of client parameter in static graph to be saved.")
parser.add_argument("--feed_alias_names", type=str, default=None,
help='set alias names for feed vars, split by comma \',\', you should run --show_proto to check the number of feed vars')
parser.add_argument("--fetch_alias_names", type=str, default=None,
help='set alias names for feed vars, split by comma \',\', you should run --show_proto to check the number of fetch vars')
parser.add_argument("--show_proto", type=bool, default=False,
help='If yes, you can preview the proto and then determine your feed var alias name and fetch var alias name.')
# yapf: enable
if __name__ == "__main__":
args = parser.parse_args()
serving_io.inference_model_to_serving(
dirname=args.dirname,
serving_server=args.server_path,
serving_client=args.client_path,
model_filename=args.model_filename,
params_filename=args.params_filename,
show_proto=args.show_proto,
feed_alias_names=args.feed_alias_names,
fetch_alias_names=args.fetch_alias_names,
)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
class PairwiseMatching(nn.Layer):
def __init__(self, pretrained_model, dropout=None, margin=0.1):
super().__init__()
self.ptm = pretrained_model
self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)
self.margin = margin
# hidden_size -> 1, calculate similarity
self.similarity = nn.Linear(self.ptm.config["hidden_size"], 1)
@paddle.jit.to_static(
input_spec=[
paddle.static.InputSpec(shape=[None, None], dtype="int64"),
paddle.static.InputSpec(shape=[None, None], dtype="int64"),
]
)
def predict(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
_, cls_embedding = self.ptm(input_ids, token_type_ids, position_ids, attention_mask)
cls_embedding = self.dropout(cls_embedding)
sim_score = self.similarity(cls_embedding)
sim_score = F.sigmoid(sim_score)
return sim_score
def forward(
self,
pos_input_ids,
neg_input_ids,
pos_token_type_ids=None,
neg_token_type_ids=None,
pos_position_ids=None,
neg_position_ids=None,
pos_attention_mask=None,
neg_attention_mask=None,
):
_, pos_cls_embedding = self.ptm(pos_input_ids, pos_token_type_ids, pos_position_ids, pos_attention_mask)
_, neg_cls_embedding = self.ptm(neg_input_ids, neg_token_type_ids, neg_position_ids, neg_attention_mask)
pos_embedding = self.dropout(pos_cls_embedding)
neg_embedding = self.dropout(neg_cls_embedding)
pos_sim = self.similarity(pos_embedding)
neg_sim = self.similarity(neg_embedding)
pos_sim = F.sigmoid(pos_sim)
neg_sim = F.sigmoid(neg_sim)
labels = paddle.full(shape=[pos_cls_embedding.shape[0]], fill_value=1.0, dtype="float32")
loss = F.margin_ranking_loss(pos_sim, neg_sim, labels, margin=self.margin)
return loss
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from functools import partial
import numpy as np
import paddle
from data import convert_pairwise_example as convert_example
from data import create_dataloader, read_text_pair
from model import PairwiseMatching
from paddlenlp.data import Pad, Tuple
from paddlenlp.datasets import load_dataset
from paddlenlp.transformers import AutoModel, AutoTokenizer
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--input_file", type=str, required=True, help="The full path of input file")
parser.add_argument("--params_path", type=str, required=True, help="The path to model parameters to be loaded.")
parser.add_argument("--max_seq_length", default=64, type=int, help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument('--model_name_or_path', default="ernie-3.0-medium-zh", help="The pretrained model used for training")
parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
args = parser.parse_args()
# yapf: enable
def predict(model, data_loader):
"""
Predicts the data labels.
Args:
model (obj:`SemanticIndexBase`): A model to extract text embedding or calculate similarity of text pair.
data_loader (obj:`List(Example)`): The processed data ids of text pair: [query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids]
Returns:
results(obj:`List`): cosine similarity of text pairs.
"""
batch_probs = []
model.eval()
with paddle.no_grad():
for batch_data in data_loader:
input_ids, token_type_ids = batch_data
batch_prob = model.predict(input_ids=input_ids, token_type_ids=token_type_ids).numpy()
batch_probs.append(batch_prob)
if len(batch_prob) == 1:
batch_probs = np.array(batch_probs)
else:
batch_probs = np.concatenate(batch_probs, axis=0)
return batch_probs
if __name__ == "__main__":
paddle.set_device(args.device)
pretrained_model = AutoModel.from_pretrained(args.model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
trans_func = partial(convert_example, tokenizer=tokenizer, max_seq_length=args.max_seq_length, phase="predict")
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # input_ids
Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # segment_ids
): [data for data in fn(samples)]
valid_ds = load_dataset(read_text_pair, data_path=args.input_file, lazy=False)
valid_data_loader = create_dataloader(
valid_ds, mode="predict", batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func
)
model = PairwiseMatching(pretrained_model)
if args.params_path and os.path.isfile(args.params_path):
state_dict = paddle.load(args.params_path)
model.set_dict(state_dict)
print("Loaded parameters from %s" % args.params_path)
else:
raise ValueError("Please set --params_path with correct pretrained model file")
y_probs = predict(model, valid_data_loader)
valid_ds = load_dataset(read_text_pair, data_path=args.input_file, lazy=False)
for idx, prob in enumerate(y_probs):
text_pair = valid_ds[idx]
text_pair["pred_prob"] = prob[0]
print(text_pair)
unset CUDA_VISIBLE_DEVICES
# gpu
python -u -m paddle.distributed.launch --gpus "0" evaluate.py \
--device gpu \
--batch_size 32 \
--learning_rate 2E-5 \
--init_from_ckpt "./checkpoints/model_30000/model_state.pdparams" \
--test_file sort/dev_pairwise.csv
# cpu
# python evaluate.py \
# --device cpu \
# --batch_size 32 \
# --learning_rate 2E-5 \
# --init_from_ckpt "./checkpoints/model_30000/model_state.pdparams" \
# --test_file sort/dev_pairwise.csv
python export_model.py --params_path checkpoints/model_30000/model_state.pdparams \
--output_path=./output \
--model_name_or_path ernie-3.0-medium-zh