Commit 10f294ff authored by yuguo-Jack

llama_paddle

parent 7c64e6ec
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import hnswlib
import numpy as np
from paddlenlp.utils.log import logger
def build_index(args, data_loader, model):
index = hnswlib.Index(space="ip", dim=args.output_emb_size if args.output_emb_size > 0 else 768)
# Initializing index
# max_elements - the maximum number of elements (capacity). Will throw an exception if exceeded
# during insertion of an element.
# The capacity can be increased by saving/loading the index, see below.
#
# ef_construction - controls index search speed/build speed tradeoff
#
# M - is tightly connected with internal dimensionality of the data. Strongly affects memory consumption (~M)
# Higher M leads to higher accuracy/run_time at fixed ef/efConstruction
index.init_index(max_elements=args.hnsw_max_elements, ef_construction=args.hnsw_ef, M=args.hnsw_m)
# Controlling the recall by setting ef:
# higher ef leads to better accuracy, but slower search
index.set_ef(args.hnsw_ef)
# Set number of threads used during batch search/construction
# By default using all available cores
index.set_num_threads(16)
logger.info("start build index..........")
all_embeddings = []
for text_embeddings in model.get_semantic_embedding(data_loader):
all_embeddings.append(text_embeddings.numpy())
all_embeddings = np.concatenate(all_embeddings, axis=0)
index.add_items(all_embeddings)
logger.info("Total index number:{}".format(index.get_current_count()))
return index
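if __name__ == "__main__":
    # A self-contained sketch (random vectors; the sizes here are assumptions,
    # not values from this repo) of the query side of the index built above:
    # knn_query returns (labels, distances), and with the "ip" space the
    # similarity used by the recall script below is recovered as 1.0 - distance.
    dim = 256
    demo_index = hnswlib.Index(space="ip", dim=dim)
    demo_index.init_index(max_elements=1000, ef_construction=100, M=16)
    demo_index.set_ef(100)
    demo_embeddings = np.random.random((100, dim)).astype("float32")
    demo_index.add_items(demo_embeddings)
    labels, distances = demo_index.knn_query(demo_embeddings[:2], k=5)
    print(labels.shape, distances.shape)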
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
search_param = {"nprobe": 20}
collection_name = "faq_finance"
partition_tag = "partition_1"
MILVUS_HOST = "10.21.226.175"
MILVUS_PORT = 8530
data_dim = 256
top_k = 10
embedding_name = "embeddings"
index_config = {
"index_type": "IVF_FLAT",
"metric_type": "L2",
"params": {"nlist": 1000},
}
search_params = {
"metric_type": "L2",
"params": {"nprobe": top_k},
}
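# Note (general Milvus guidance, an addition rather than part of the original
# config): nlist is the number of coarse clusters IVF_FLAT builds at index time,
# and nprobe is how many of them are scanned per query, so a larger nprobe trades
# query speed for recall. Here nprobe is reused from top_k; tune it separately if
# recall at larger top_k matters.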
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import numpy as np
import paddle
def gen_id2corpus(corpus_file):
id2corpus = {}
with open(corpus_file, "r", encoding="utf-8") as f:
for idx, line in enumerate(f):
id2corpus[idx] = line.rstrip()
return id2corpus
def create_dataloader(dataset, mode="train", batch_size=1, batchify_fn=None, trans_fn=None):
if trans_fn:
dataset = dataset.map(trans_fn)
shuffle = mode == "train"
if mode == "train":
batch_sampler = paddle.io.DistributedBatchSampler(dataset, batch_size=batch_size, shuffle=shuffle)
else:
batch_sampler = paddle.io.BatchSampler(dataset, batch_size=batch_size, shuffle=shuffle)
return paddle.io.DataLoader(dataset=dataset, batch_sampler=batch_sampler, collate_fn=batchify_fn, return_list=True)
def convert_example(example, tokenizer, max_seq_length=512, do_evaluate=False):
"""
Builds model inputs from a sequence.
A BERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]``
Args:
example(obj:`dict`): The dict of texts to be converted to ids.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
max_seq_len(obj:`int`): The maximum total input sequence length after tokenization.
Sequences longer than this will be truncated, sequences shorter will be padded.
do_evaluate(obj:`bool`, defaults to `False`): Whether the example contains a label or not.
Returns:
input_ids(obj:`list[int]`): The list of query token ids.
token_type_ids(obj: `list[int]`): List of query sequence pair mask.
"""
result = []
for key, text in example.items():
if "label" in key:
# do_evaluate
result += [example["label"]]
else:
# do_train
encoded_inputs = tokenizer(text=text, max_seq_len=max_seq_length)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
result += [input_ids, token_type_ids]
return result
def convert_example_test(example, tokenizer, max_seq_length=512, pad_to_max_seq_len=False):
"""
Builds model inputs from a sequence.
A BERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]``
Args:
example(obj:`dict`): The dict of texts to be converted to ids.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
max_seq_len(obj:`int`): The maximum total input sequence length after tokenization.
Sequences longer than this will be truncated, sequences shorter will be padded.
pad_to_max_seq_len(obj:`bool`, defaults to `False`): Whether to pad each sequence to the maximum length.
Returns:
input_ids(obj:`list[int]`): The list of query token ids.
token_type_ids(obj: `list[int]`): List of query sequence pair mask.
"""
result = []
for key, text in example.items():
encoded_inputs = tokenizer(text=text, max_seq_len=max_seq_length, pad_to_max_seq_len=pad_to_max_seq_len)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
result += [input_ids, token_type_ids]
return result
def read_simcse_text(data_path):
"""Reads data."""
with open(data_path, "r", encoding="utf-8") as f:
for line in f:
data = line.rstrip()
yield {"text_a": data, "text_b": data}
def read_text_pair(data_path, is_test=False):
"""Reads data."""
with open(data_path, "r", encoding="utf-8") as f:
for line in f:
data = line.rstrip().split("\t")
if is_test:
if len(data) != 3:
continue
yield {"text_a": data[0], "text_b": data[1], "label": data[2]}
else:
if len(data) != 2:
continue
yield {"text_a": data[0], "text_b": data[1]}
def gen_text_file(similar_text_pair_file):
text2similar_text = {}
texts = []
with open(similar_text_pair_file, "r", encoding="utf-8") as f:
for line in f:
split_line = line.rstrip().split("\t")
if len(split_line) != 2:
continue
text, similar_text = split_line
if not text or not similar_text:
continue
text2similar_text[text] = similar_text
texts.append({"text": text})
return texts, text2similar_text
def word_repetition(input_ids, token_type_ids, dup_rate=0.32):
"""Word Repetition strategy."""
input_ids = input_ids.numpy().tolist()
token_type_ids = token_type_ids.numpy().tolist()
batch_size, seq_len = len(input_ids), len(input_ids[0])
repetitied_input_ids = []
repetitied_token_type_ids = []
rep_seq_len = seq_len
for batch_id in range(batch_size):
cur_input_id = input_ids[batch_id]
actual_len = np.count_nonzero(cur_input_id)
dup_word_index = []
# Skip word repetition when the actual sequence length is 5 or less
if actual_len > 5:
dup_len = random.randint(a=0, b=max(2, int(dup_rate * actual_len)))
# Skip cls and sep position
dup_word_index = random.sample(list(range(1, actual_len - 1)), k=dup_len)
r_input_id = []
r_token_type_id = []
for idx, word_id in enumerate(cur_input_id):
# Insert duplicate word
if idx in dup_word_index:
r_input_id.append(word_id)
r_token_type_id.append(token_type_ids[batch_id][idx])
r_input_id.append(word_id)
r_token_type_id.append(token_type_ids[batch_id][idx])
after_dup_len = len(r_input_id)
repetitied_input_ids.append(r_input_id)
repetitied_token_type_ids.append(r_token_type_id)
if after_dup_len > rep_seq_len:
rep_seq_len = after_dup_len
# Padding the data to the same length
for batch_id in range(batch_size):
after_dup_len = len(repetitied_input_ids[batch_id])
pad_len = rep_seq_len - after_dup_len
repetitied_input_ids[batch_id] += [0] * pad_len
repetitied_token_type_ids[batch_id] += [0] * pad_len
return paddle.to_tensor(repetitied_input_ids, dtype="int64"), paddle.to_tensor(
repetitied_token_type_ids, dtype="int64"
)
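if __name__ == "__main__":
    # A minimal, self-contained sketch (toy ids, not from a real tokenizer) of
    # the Word Repetition augmentation above: roughly dup_rate of the
    # non-special tokens are duplicated in place, then the batch is re-padded
    # to a common length.
    toy_input_ids = paddle.to_tensor([[101, 5, 6, 7, 8, 9, 102]], dtype="int64")
    toy_token_type_ids = paddle.zeros_like(toy_input_ids)
    dup_ids, dup_type_ids = word_repetition(toy_input_ids, toy_token_type_ids, dup_rate=0.32)
    print(dup_ids.numpy(), dup_type_ids.numpy())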
# worker_num: maximum concurrency. When build_dag_each_worker=True, the framework creates worker_num processes, each building its own gRPC server and DAG
# When build_dag_each_worker=False, the framework sets the main thread gRPC thread pool's max_workers to worker_num
worker_num: 20
# build_dag_each_worker: False builds a single DAG inside the process; True builds an independent DAG in each worker process
build_dag_each_worker: false
dag:
# op resource type: True for the thread model, False for the process model
is_thread_op: False
# Profiling: True generates Timeline performance data (with some performance overhead); False disables it
tracer:
interval_s: 10
# HTTP port; rpc_port and http_port must not both be empty. When rpc_port is available and http_port is empty, http_port is not generated automatically
http_port: 8090
# RPC port; rpc_port and http_port must not both be empty. When rpc_port is empty and http_port is not, rpc_port is automatically set to http_port + 1
rpc_port: 8080
op:
ernie:
# Concurrency: thread-level concurrency when is_thread_op=True, otherwise process-level concurrency
concurrency: 1
# When the op config has no server_endpoints, the local service config is read from local_service_conf
local_service_conf:
# Client type: brpc, grpc, or local_predictor. local_predictor does not start a Serving service; prediction runs in-process
client_type: local_predictor
# ir_optim
ir_optim: True
# device_type: 0=cpu, 1=gpu, 2=tensorRT, 3=arm cpu, 4=kunlun xpu
device_type: 1
# Compute device IDs: "" or unset means CPU prediction; "0" or "0,1,2" means GPU prediction on the listed cards
devices: '2'
# Fetch list, keyed by the alias_name of fetch_var in client_config; if unset, all outputs are returned
fetch_list: ['output_embedding']
# Model path
model_config: ../../serving_server/
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import numpy as np
import requests
headers = {"Content-type": "application/json"}
url = "http://10.21.226.175:8080/ernie/prediction" # XXX取决于服务端YourService的初始化name参数
data = {"feed": ["买了社保,是不是就不用买商业保险了?"], "fetch": ["output_embedding"]}
data = json.dumps(data)
print(data)
r = requests.post(url=url, headers=headers, data=data)
print(r.json())
json_data = r.json()
data = np.array(json_data["result"]["output_embedding"])
print(data.shape)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import numpy as np
from paddle_serving_server.pipeline import PipelineClient
client = PipelineClient()
client.connect(["127.0.0.1:8080"])
list_data = ["买了社保,是不是就不用买商业保险了?"]
feed = {}
for i, item in enumerate(list_data):
feed[str(i)] = item
print(feed)
start_time = time.time()
ret = client.predict(feed_dict=feed)
end_time = time.time()
print("time to cost :{} seconds".format(end_time - start_time))
result = np.array(eval(ret.value[0]))
print(ret.key)
print(result.shape)
print(result)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from paddle_serving_server.web_service import Op, WebService
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument('--model_name_or_path', default="rocketqa-zh-base-query-encoder", help="The tokenizer name for the model")
args = parser.parse_args()
# yapf: enable
def convert_example(example, tokenizer, max_seq_length=512, pad_to_max_seq_len=False):
result = []
for text in example:
encoded_inputs = tokenizer(text=text, max_seq_len=max_seq_length, pad_to_max_seq_len=pad_to_max_seq_len)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
result += [input_ids, token_type_ids]
return result
class ErnieOp(Op):
def init_op(self):
from paddlenlp.transformers import AutoTokenizer
self.tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
def preprocess(self, input_dicts, data_id, log_id):
from paddlenlp.data import Pad, Tuple
((_, input_dict),) = input_dicts.items()
print("input dict", input_dict)
batch_size = len(input_dict.keys())
examples = []
for i in range(batch_size):
input_ids, segment_ids = convert_example([input_dict[str(i)]], self.tokenizer)
examples.append((input_ids, segment_ids))
def batchify_fn(
samples,
fn=Tuple(
Pad(axis=0, pad_val=self.tokenizer.pad_token_id, dtype="int64"), # input
Pad(axis=0, pad_val=self.tokenizer.pad_token_type_id, dtype="int64"), # segment
),
):
return fn(samples)
input_ids, segment_ids = batchify_fn(examples)
feed_dict = {}
feed_dict["input_ids"] = input_ids
feed_dict["token_type_ids"] = segment_ids
return feed_dict, False, None, ""
def postprocess(self, input_dicts, fetch_dict, data_id, log_id):
new_dict = {}
new_dict["output_embedding"] = str(fetch_dict["output_embedding"].tolist())
return new_dict, None, ""
class ErnieService(WebService):
def get_pipeline_response(self, read_op):
ernie_op = ErnieOp(name="ernie", input_ops=[read_op])
return ernie_op
if __name__ == "__main__":
ernie_service = ErnieService(name="ernie")
ernie_service.prepare_pipeline_config("config_nlp.yml")
ernie_service.run_service()
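# A launch sketch (the script name is an assumption; the config file name comes
# from the prepare_pipeline_config call above):
#   python web_service.py --model_name_or_path rocketqa-zh-base-query-encoder
# config_nlp.yml above sets http_port: 8090 and rpc_port: 8080; the
# PipelineClient snippet earlier connects to the latter.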
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from paddle_serving_server.web_service import WebService
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument('--model_name_or_path', default="rocketqa-zh-base-query-encoder", help="The tokenizer name for the model")
args = parser.parse_args()
# yapf: enable
def convert_example(example, tokenizer, max_seq_length=512, pad_to_max_seq_len=False):
result = []
for text in example:
encoded_inputs = tokenizer(text=text, max_seq_len=max_seq_length, pad_to_max_seq_len=pad_to_max_seq_len)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
result += [input_ids, token_type_ids]
return result
class ErnieService(WebService):
def init_service(self):
from paddlenlp.transformers import AutoTokenizer
self.tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
def preprocess(self, feed=[], fetch=[]):
from paddlenlp.data import Pad, Tuple
print("input dict", feed)
batch_size = len(feed)
is_batch = True
examples = []
for i in range(batch_size):
input_ids, segment_ids = convert_example([feed[i]], self.tokenizer)
examples.append((input_ids, segment_ids))
def batchify_fn(
samples,
fn=Tuple(
Pad(axis=0, pad_val=self.tokenizer.pad_token_id, dtype="int64"), # input
Pad(axis=0, pad_val=self.tokenizer.pad_token_type_id, dtype="int64"), # segment
),
):
return fn(samples)
input_ids, segment_ids = batchify_fn(examples)
feed_dict = {}
feed_dict["input_ids"] = input_ids
feed_dict["token_type_ids"] = segment_ids
return feed_dict, fetch, is_batch
def postprocess(self, feed=[], fetch=[], fetch_map=None):
for key in fetch_map:
fetch_map[key] = fetch_map[key].tolist()
return fetch_map
if __name__ == "__main__":
ernie_service = ErnieService(name="ernie")
ernie_service.load_model_config("../../serving_server")
ernie_service.prepare_server(workdir="workdir", port=8080)
ernie_service.init_service()
ernie_service.run_debugger_service()
ernie_service.run_web_service()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import numpy as np
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--similar_text_pair", type=str, default='', help="The full path of similat pair file")
parser.add_argument("--recall_result_file", type=str, default='', help="The full path of recall result file")
parser.add_argument("--recall_num", type=int, default=10, help="Most similair number of doc recalled from corpus per query")
args = parser.parse_args()
# yapf: enable
def recall(rs, N=10):
"""
Ratio of recalled Ground Truth at topN Recalled Docs
>>> rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
>>> recall(rs, N=1)
0.333333
>>> recall(rs, N=2)
0.6666667
>>> recall(rs, N=3)
1.0
Args:
rs: Iterator of recalled flags
Returns:
Recall@N
"""
recall_flags = [np.sum(r[0:N]) for r in rs]
return np.mean(recall_flags)
if __name__ == "__main__":
text2similar = {}
with open(args.similar_text_pair, "r", encoding="utf-8") as f:
for line in f:
text, similar_text = line.rstrip().split("\t")
text2similar[text] = similar_text
rs = []
with open(args.recall_result_file, "r", encoding="utf-8") as f:
relevance_labels = []
for index, line in enumerate(f):
if index % args.recall_num == 0 and index != 0:
rs.append(relevance_labels)
relevance_labels = []
text, recalled_text, cosine_sim = line.rstrip().split("\t")
if text2similar[text] == recalled_text:
relevance_labels.append(1)
else:
relevance_labels.append(0)
recall_N = []
recall_num = [1, 5, 10]
res = []
for topN in recall_num:
R = round(100 * recall(rs, N=topN), 3)
recall_N.append(str(R))
for key, val in zip(recall_num, recall_N):
print("recall@{}={}".format(key, val))
res.append(str(val))
with open("result.tsv", "a", encoding="utf-8") as result:
result.write("\t".join(res) + "\n")
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import paddle
from model import SimCSE
from paddlenlp.transformers import AutoModel, AutoTokenizer
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--params_path", type=str, required=True,
default='./checkpoint/model_50/model_state.pdparams', help="The path to model parameters to be loaded.")
parser.add_argument("--output_path", type=str, default='./output',
help="The path of model parameter in static graph to be saved.")
parser.add_argument('--model_name_or_path', default="rocketqa-zh-base-query-encoder", help="The pretrained model used for training")
parser.add_argument("--output_emb_size", default=256, type=int, help="Output_embedding_size, 0 means use hidden_size as output embedding size.")
args = parser.parse_args()
# yapf: enable
if __name__ == "__main__":
pretrained_model = AutoModel.from_pretrained(args.model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
model = SimCSE(pretrained_model, output_emb_size=args.output_emb_size)
if args.params_path and os.path.isfile(args.params_path):
state_dict = paddle.load(args.params_path)
model.set_dict(state_dict)
print("Loaded parameters from %s" % args.params_path)
else:
raise ValueError("Please set --params_path with correct pretrained model file")
model.eval()
# Convert to static graph with specific input description
model = paddle.jit.to_static(
model,
input_spec=[
paddle.static.InputSpec(shape=[None, None], dtype="int64"), # input_ids
paddle.static.InputSpec(shape=[None, None], dtype="int64"), # segment_ids
],
)
# Save in static graph model.
save_path = os.path.join(args.output_path, "inference")
paddle.jit.save(model, save_path)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import paddle_serving_client.io as serving_io
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--dirname", type=str, required=True,
default='./output', help="Path of saved model files. Program file and parameter files are saved in this directory.")
parser.add_argument("--model_filename", type=str, required=True,
default='inference.get_pooled_embedding.pdmodel', help="The name of file to load the inference program. If it is None, the default filename __model__ will be used.")
parser.add_argument("--params_filename", type=str, required=True,
default='inference.get_pooled_embedding.pdiparams', help="The name of file to load all parameters. It is only used for the case that all parameters were saved in a single binary file. If parameters were saved in separate files, set it as None. Default: None.")
parser.add_argument("--server_path", type=str, default='./serving_server',
help="The path of server parameter in static graph to be saved.")
parser.add_argument("--client_path", type=str, default='./serving_client',
help="The path of client parameter in static graph to be saved.")
parser.add_argument("--feed_alias_names", type=str, default=None,
help='set alias names for feed vars, split by comma \',\', you should run --show_proto to check the number of feed vars')
parser.add_argument("--fetch_alias_names", type=str, default=None,
help='set alias names for fetch vars, split by comma \',\', you should run --show_proto to check the number of fetch vars')
parser.add_argument("--show_proto", type=bool, default=False,
help='If yes, you can preview the proto and then determine your feed var alias name and fetch var alias name.')
# yapf: enable
if __name__ == "__main__":
args = parser.parse_args()
serving_io.inference_model_to_serving(
dirname=args.dirname,
serving_server=args.server_path,
serving_client=args.client_path,
model_filename=args.model_filename,
params_filename=args.params_filename,
show_proto=args.show_proto,
feed_alias_names=args.feed_alias_names,
fetch_alias_names=args.fetch_alias_names,
)
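# A usage sketch (the script name is an assumption; the file names follow the
# export_model.py output above):
#   python export_to_serving.py \
#       --dirname ./output \
#       --model_filename inference.get_pooled_embedding.pdmodel \
#       --params_filename inference.get_pooled_embedding.pdiparams \
#       --server_path ./serving_server \
#       --client_path ./serving_client \
#       --fetch_alias_names output_embedding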
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import numpy as np
import paddle
from paddle import inference
from tqdm import tqdm
from paddlenlp.data import Pad, Tuple
from paddlenlp.transformers import AutoTokenizer
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--model_dir", type=str, required=True, help="The directory to static model.")
parser.add_argument("--corpus_file", type=str, required=True, help="The corpus_file path.")
parser.add_argument('--model_name_or_path', default="rocketqa-zh-base-query-encoder", help="The pretrained model used for training")
parser.add_argument("--max_seq_length", default=64, type=int, help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument('--use_tensorrt', default=False, type=eval, choices=[True, False], help='Enable to use tensorrt to speed up.')
parser.add_argument("--precision", default="fp32", type=str, choices=["fp32", "fp16", "int8"], help='The tensorrt precision.')
parser.add_argument('--cpu_threads', default=10, type=int, help='Number of threads to predict when using cpu.')
parser.add_argument('--enable_mkldnn', default=False, type=eval, choices=[True, False], help='Enable to use mkldnn to speed up when using cpu.')
args = parser.parse_args()
# yapf: enable
def convert_example(example, tokenizer, max_seq_length=512, pad_to_max_seq_len=False):
"""
Builds model inputs from a sequence.
A BERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]``
Args:
example(obj:`dict`): The dict of texts to be converted to ids.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
max_seq_len(obj:`int`): The maximum total input sequence length after tokenization.
Sequences longer than this will be truncated, sequences shorter will be padded.
pad_to_max_seq_len(obj:`bool`, defaults to `False`): Whether to pad each sequence to the maximum length.
Returns:
input_ids(obj:`list[int]`): The list of query token ids.
token_type_ids(obj: `list[int]`): List of query sequence pair mask.
"""
result = []
for key, text in example.items():
encoded_inputs = tokenizer(text=text, max_seq_len=max_seq_length, pad_to_max_seq_len=pad_to_max_seq_len)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
result += [input_ids, token_type_ids]
return result
class Predictor(object):
def __init__(
self,
model_dir,
device="gpu",
max_seq_length=128,
batch_size=32,
use_tensorrt=False,
precision="fp32",
cpu_threads=10,
enable_mkldnn=False,
):
self.max_seq_length = max_seq_length
self.batch_size = batch_size
model_file = model_dir + "/inference.get_pooled_embedding.pdmodel"
params_file = model_dir + "/inference.get_pooled_embedding.pdiparams"
if not os.path.exists(model_file):
raise ValueError("Model file not found: {}".format(model_file))
if not os.path.exists(params_file):
raise ValueError("Params file not found: {}".format(params_file))
config = paddle.inference.Config(model_file, params_file)
if device == "gpu":
# set GPU configs accordingly
# such as initialize the gpu memory, enable tensorrt
config.enable_use_gpu(100, 0)
precision_map = {
"fp16": inference.PrecisionType.Half,
"fp32": inference.PrecisionType.Float32,
"int8": inference.PrecisionType.Int8,
}
precision_mode = precision_map[precision]
if args.use_tensorrt:
config.enable_tensorrt_engine(
max_batch_size=batch_size, min_subgraph_size=30, precision_mode=precision_mode
)
elif device == "cpu":
# set CPU configs accordingly,
# such as enable_mkldnn, set_cpu_math_library_num_threads
config.disable_gpu()
if args.enable_mkldnn:
# cache 10 different shapes for mkldnn to avoid memory leak
config.set_mkldnn_cache_capacity(10)
config.enable_mkldnn()
config.set_cpu_math_library_num_threads(args.cpu_threads)
elif device == "xpu":
# set XPU configs accordingly
config.enable_xpu(100)
config.switch_use_feed_fetch_ops(False)
self.predictor = paddle.inference.create_predictor(config)
self.input_handles = [self.predictor.get_input_handle(name) for name in self.predictor.get_input_names()]
self.output_handle = self.predictor.get_output_handle(self.predictor.get_output_names()[0])
def predict(self, data, tokenizer):
"""
Predicts the data labels.
Args:
data (obj:`List(str)`): The batch data whose each element is a raw text.
tokenizer(obj:`PretrainedTokenizer`): This tokenizer inherits from :class:`~paddlenlp.transformers.PretrainedTokenizer`
which contains most of the methods. Users should refer to the superclass for more information regarding methods.
Returns:
results(obj:`dict`): All the predictions labels.
"""
def batchify_fn(
samples,
fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # input
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # segment
),
):
return fn(samples)
all_embeddings = []
examples = []
for idx, text in enumerate(tqdm(data)):
input_ids, segment_ids = convert_example(
text, tokenizer, max_seq_length=self.max_seq_length, pad_to_max_seq_len=True
)
examples.append((input_ids, segment_ids))
if len(examples) >= self.batch_size:
input_ids, segment_ids = batchify_fn(examples)
self.input_handles[0].copy_from_cpu(input_ids)
self.input_handles[1].copy_from_cpu(segment_ids)
self.predictor.run()
logits = self.output_handle.copy_to_cpu()
all_embeddings.append(logits)
examples = []
if len(examples) > 0:
input_ids, segment_ids = batchify_fn(examples)
self.input_handles[0].copy_from_cpu(input_ids)
self.input_handles[1].copy_from_cpu(segment_ids)
self.predictor.run()
logits = self.output_handle.copy_to_cpu()
all_embeddings.append(logits)
all_embeddings = np.concatenate(all_embeddings, axis=0)
np.save("corpus_embedding", all_embeddings)
def read_text(file_path):
id2corpus = {}
with open(file_path, "r", encoding="utf-8") as f:
for idx, line in enumerate(f):
id2corpus[idx] = line.strip()
return id2corpus
if __name__ == "__main__":
predictor = Predictor(
args.model_dir,
args.device,
args.max_seq_length,
args.batch_size,
args.use_tensorrt,
args.precision,
args.cpu_threads,
args.enable_mkldnn,
)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
id2corpus = read_text(args.corpus_file)
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
predictor.predict(corpus_list, tokenizer)
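# A usage sketch (the script name feature_extract.py is an assumption):
#   python feature_extract.py --model_dir ./output \
#       --corpus_file data/corpus.csv \
#       --model_name_or_path rocketqa-zh-base-query-encoder
# The embeddings are saved to corpus_embedding.npy, which the Milvus insert
# script below consumes.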
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import time
import numpy as np
from config import collection_name, embedding_name, partition_tag
from milvus_util import RecallByMilvus, VecToMilvus, text_max_len
from tqdm import tqdm
parser = argparse.ArgumentParser()
parser.add_argument(
"--data_path", default="data/corpus.csv", type=str, required=True, help="The data for vector extraction."
)
parser.add_argument(
"--embedding_path", default="corpus_embedding.npy", type=str, required=True, help="The vector path for data."
)
parser.add_argument("--index", default=0, type=int, help="index of the vector for search")
parser.add_argument("--insert", action="store_true", help="whether to insert data")
parser.add_argument("--search", action="store_true", help="whether to search data")
parser.add_argument("--batch_size", default=100000, type=int, help="number of examples to insert each time")
args = parser.parse_args()
def read_text(file_path):
id2corpus = []
with open(file_path, "r", encoding="utf-8") as f:
for line in f:
question, answer = line.strip().split("\t")
id2corpus.append({"question": question, "answer": answer})
return id2corpus
def milvus_data_insert(data_path, embedding_path, batch_size):
corpus_list = read_text(data_path)
embeddings = np.load(embedding_path)
embedding_ids = [i for i in range(embeddings.shape[0])]
client = VecToMilvus()
client.drop_collection(collection_name)
data_size = len(embedding_ids)
for i in tqdm(range(0, data_size, batch_size)):
cur_end = i + batch_size
if cur_end > data_size:
cur_end = data_size
batch_emb = embeddings[np.arange(i, cur_end)]
entities = [
[j for j in range(i, cur_end, 1)],
[corpus_list[j]["question"][: text_max_len - 1] for j in range(i, cur_end, 1)],
[corpus_list[j]["answer"][: text_max_len - 1] for j in range(i, cur_end, 1)],
batch_emb, # field embeddings, supports numpy.ndarray and list
]
client.insert(
collection_name=collection_name, entities=entities, index_name=embedding_name, partition_tag=partition_tag
)
def milvus_data_recall(embedding_path, index):
embeddings = np.load(embedding_path)
embedding_ids = [i for i in range(embeddings.shape[0])]
recall_client = RecallByMilvus()
if index >= len(embedding_ids):
print("Index should not be larger than embedding size")
return
embeddings = embeddings[np.arange(index, index + 1)]
time_start = time.time()
result = recall_client.search(
embeddings, embedding_name, collection_name, partition_names=[partition_tag], output_fields=["pk", "question", "answer"]
)
time_end = time.time()
sum_t = time_end - time_start
print("time cost", sum_t, "s")
for hits in result:
for hit in hits:
print(f"hit: {hit}, text field: {hit.entity.get('text')}")
if __name__ == "__main__":
if args.insert:
milvus_data_insert(args.data_path, args.embedding_path, args.batch_size)
if args.search:
milvus_data_recall(args.embedding_path, args.index)
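# A usage sketch (the script name milvus_ann_search.py is an assumption):
#   python milvus_ann_search.py --data_path data/corpus.csv \
#       --embedding_path corpus_embedding.npy --batch_size 100000 --insert
#   python milvus_ann_search.py --data_path data/corpus.csv \
#       --embedding_path corpus_embedding.npy --index 18 --search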
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from config import (
MILVUS_HOST,
MILVUS_PORT,
data_dim,
index_config,
search_params,
top_k,
)
from pymilvus import (
Collection,
CollectionSchema,
DataType,
FieldSchema,
connections,
utility,
)
fmt = "\n=== {:30} ===\n"
text_max_len = 1000
fields = [
FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=False, max_length=100),
FieldSchema(name="question", dtype=DataType.VARCHAR, max_length=text_max_len),
FieldSchema(name="answer", dtype=DataType.VARCHAR, max_length=text_max_len),
FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=data_dim),
]
schema = CollectionSchema(fields, "Neural Search Index")
class VecToMilvus:
def __init__(self):
print(fmt.format("start connecting to Milvus"))
connections.connect("default", host=MILVUS_HOST, port=MILVUS_PORT)
self.collection = None
def has_collection(self, collection_name):
try:
has = utility.has_collection(collection_name)
print(f"Does collection {collection_name} exist in Milvus: {has}")
return has
except Exception as e:
print("Milvus has_table error:", e)
def create_collection(self, collection_name):
try:
print(fmt.format("Create collection {}".format(collection_name)))
self.collection = Collection(collection_name, schema, consistency_level="Strong")
except Exception as e:
print("Milvus create collection error:", e)
def drop_collection(self, collection_name):
try:
utility.drop_collection(collection_name)
except Exception as e:
print("Milvus delete collection error:", e)
def create_index(self, index_name):
try:
print(fmt.format("Start Creating index"))
self.collection.create_index(index_name, index_config)
print(fmt.format("Start loading"))
self.collection.load()
except Exception as e:
print("Milvus create index error:", e)
def has_partition(self, partition_tag):
try:
result = self.collection.has_partition(partition_tag)
return result
except Exception as e:
print("Milvus has partition error: ", e)
def create_partition(self, partition_tag):
try:
self.collection.create_partition(partition_tag)
print("create partition {} successfully".format(partition_tag))
except Exception as e:
print("Milvus create partition error: ", e)
def insert(self, entities, collection_name, index_name, partition_tag=None):
try:
if not self.has_collection(collection_name):
self.create_collection(collection_name)
self.create_index(index_name)
else:
self.collection = Collection(collection_name)
if (partition_tag is not None) and (not self.has_partition(partition_tag)):
self.create_partition(partition_tag)
self.collection.insert(entities, partition_name=partition_tag)
print(f"Number of entities in Milvus: {self.collection.num_entities}") # check the num_entites
except Exception as e:
print("Milvus insert error:", e)
class RecallByMilvus:
def __init__(self):
print(fmt.format("start connecting to Milvus"))
connections.connect("default", host=MILVUS_HOST, port=MILVUS_PORT)
self.collection = None
def get_collection(self, collection_name):
try:
print(fmt.format("Connect collection {}".format(collection_name)))
self.collection = Collection(collection_name)
except Exception as e:
print("Milvus create collection error:", e)
def search(self, vectors, embedding_name, collection_name, partition_names=[], output_fields=[]):
try:
self.get_collection(collection_name)
result = self.collection.search(
vectors,
embedding_name,
search_params,
limit=top_k,
partition_names=partition_names,
output_fields=output_fields,
)
return result
except Exception as e:
print("Milvus recall error: ", e)
if __name__ == "__main__":
print(fmt.format("Start inserting entities"))
rng = np.random.default_rng(seed=19530)
num_entities = 3000
entities = [
# provide the pk field because `auto_id` is set to False
[i for i in range(num_entities)],
["第{}个问题".format(i) for i in range(num_entities)], # field question, only supports list
["第{}个答案".format(i) for i in range(num_entities)], # field answer, only supports list
rng.random((num_entities, data_dim)), # field embeddings, supports numpy.ndarray and list
]
print(entities[-1].shape)
collection_name = "test1"
partition_tag = "partition_1"
embedding_name = "embeddings"
client = VecToMilvus()
client.insert(
collection_name=collection_name, entities=entities, index_name=embedding_name, partition_tag=partition_tag
)
print(fmt.format("Start searching entities"))
vectors_to_search = entities[-1][-2:]
recall_client = RecallByMilvus()
result = recall_client.search(
vectors_to_search,
embedding_name,
collection_name,
partition_names=[partition_tag],
output_fields=["pk", "text"],
)
for hits in result:
for hit in hits:
print(f"hit: {hit}, random field: {hit.entity.get('text')}")
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import paddlenlp
class SimCSE(nn.Layer):
def __init__(self, pretrained_model, dropout=None, margin=0.0, scale=20, output_emb_size=None):
super().__init__()
self.ptm = pretrained_model
self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)
# if output_emb_size is greater than 0, then add Linear layer to reduce embedding_size,
# we recommend set output_emb_size = 256 considering the trade-off between
# recall performance and efficiency
self.output_emb_size = output_emb_size if output_emb_size is not None else 0
if self.output_emb_size > 0:
weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=0.02))
self.emb_reduce_linear = paddle.nn.Linear(768, self.output_emb_size, weight_attr=weight_attr)
self.margin = margin
# Scale cosine similarity to ease training convergence
self.scale = scale
self.classifier = nn.Linear(self.output_emb_size if self.output_emb_size > 0 else 768, 2)
self.rdrop_loss = paddlenlp.losses.RDropLoss()
@paddle.jit.to_static(
input_spec=[
paddle.static.InputSpec(shape=[None, None], dtype="int64"),
paddle.static.InputSpec(shape=[None, None], dtype="int64"),
]
)
def get_pooled_embedding(
self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, with_pooler=True
):
# Note: cls_embedding is the pooled embedding with tanh activation
sequence_output, cls_embedding = self.ptm(input_ids, token_type_ids, position_ids, attention_mask)
if with_pooler is False:
cls_embedding = sequence_output[:, 0, :]
if self.output_emb_size > 0:
cls_embedding = self.emb_reduce_linear(cls_embedding)
cls_embedding = self.dropout(cls_embedding)
cls_embedding = F.normalize(cls_embedding, p=2, axis=-1)
return cls_embedding
def get_semantic_embedding(self, data_loader):
self.eval()
with paddle.no_grad():
for batch_data in data_loader:
input_ids, token_type_ids = batch_data
text_embeddings = self.get_pooled_embedding(input_ids, token_type_ids=token_type_ids)
yield text_embeddings
def cosine_sim(
self,
query_input_ids,
title_input_ids,
query_token_type_ids=None,
query_position_ids=None,
query_attention_mask=None,
title_token_type_ids=None,
title_position_ids=None,
title_attention_mask=None,
with_pooler=True,
):
query_cls_embedding = self.get_pooled_embedding(
query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask, with_pooler=with_pooler
)
title_cls_embedding = self.get_pooled_embedding(
title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask, with_pooler=with_pooler
)
cosine_sim = paddle.sum(query_cls_embedding * title_cls_embedding, axis=-1)
return cosine_sim
def forward(
self,
query_input_ids,
title_input_ids,
query_token_type_ids=None,
query_position_ids=None,
query_attention_mask=None,
title_token_type_ids=None,
title_position_ids=None,
title_attention_mask=None,
):
query_cls_embedding = self.get_pooled_embedding(
query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask
)
title_cls_embedding = self.get_pooled_embedding(
title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask
)
logits1 = self.classifier(query_cls_embedding)
logits2 = self.classifier(title_cls_embedding)
kl_loss = self.rdrop_loss(logits1, logits2)
cosine_sim = paddle.matmul(query_cls_embedding, title_cls_embedding, transpose_y=True)
# Subtract margin from the cosine similarity of all positive pairs (the diagonal)
margin_diag = paddle.full(
shape=[query_cls_embedding.shape[0]], fill_value=self.margin, dtype=paddle.get_default_dtype()
)
cosine_sim = cosine_sim - paddle.diag(margin_diag)
# Scale cosine similarity to ease training convergence
cosine_sim *= self.scale
labels = paddle.arange(0, query_cls_embedding.shape[0], dtype="int64")
labels = paddle.reshape(labels, shape=[-1, 1])
loss = F.cross_entropy(input=cosine_sim, label=labels)
return loss, kl_loss
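# A scoring sketch (assumption: `tokenizer` and `model` are built as in the
# recall script below, and the ids are padded int64 tensors):
#   sim = model.cosine_sim(
#       query_input_ids, title_input_ids,
#       query_token_type_ids=query_token_type_ids,
#       title_token_type_ids=title_token_type_ids,
#   )
# cosine_sim returns one similarity per pair, while forward computes the
# in-batch negative loss (plus the R-Drop KL loss) used for training.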
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from functools import partial
import paddle
from ann_util import build_index
from data import convert_example_test, create_dataloader, gen_id2corpus, gen_text_file
from model import SimCSE
from paddlenlp.data import Pad, Tuple
from paddlenlp.datasets import MapDataset
from paddlenlp.transformers import AutoModel, AutoTokenizer
from paddlenlp.utils.log import logger
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--corpus_file", type=str, required=True, help="The full path of input file")
parser.add_argument("--similar_text_pair_file", type=str, required=True, help="The full path of similar text pair file")
parser.add_argument("--recall_result_dir", type=str, default='recall_result', help="The full path of recall result file to save")
parser.add_argument("--recall_result_file", type=str, default='recall_result_file', help="The file name of recall result")
parser.add_argument("--params_path", type=str, required=True, help="The path to model parameters to be loaded.")
parser.add_argument("--max_seq_length", default=64, type=int, help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.")
parser.add_argument("--output_emb_size", default=None, type=int, help="output_embedding_size")
parser.add_argument("--recall_num", default=10, type=int, help="Recall number for each query from Ann index.")
parser.add_argument('--model_name_or_path', default="rocketqa-zh-base-query-encoder", help="The pretrained model used for training")
parser.add_argument("--hnsw_m", default=100, type=int, help="Recall number for each query from Ann index.")
parser.add_argument("--hnsw_ef", default=100, type=int, help="Recall number for each query from Ann index.")
parser.add_argument("--hnsw_max_elements", default=1000000, type=int, help="Recall number for each query from Ann index.")
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
args = parser.parse_args()
# yapf: enable
if __name__ == "__main__":
paddle.set_device(args.device)
rank = paddle.distributed.get_rank()
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
trans_func = partial(convert_example_test, tokenizer=tokenizer, max_seq_length=args.max_seq_length)
def batchify_fn(
samples,
fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # text_input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # text_segment
),
):
return [data for data in fn(samples)]
pretrained_model = AutoModel.from_pretrained(args.model_name_or_path)
model = SimCSE(pretrained_model, output_emb_size=args.output_emb_size)
model = paddle.DataParallel(model)
# Load pretrained semantic model
if args.params_path and os.path.isfile(args.params_path):
state_dict = paddle.load(args.params_path)
model.set_dict(state_dict)
logger.info("Loaded parameters from %s" % args.params_path)
else:
raise ValueError("Please set --params_path with correct pretrained model file")
id2corpus = gen_id2corpus(args.corpus_file)
# convert_example function's input must be dict
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
corpus_ds = MapDataset(corpus_list)
corpus_data_loader = create_dataloader(
corpus_ds, mode="predict", batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func
)
# Need better way to get inner model of DataParallel
inner_model = model._layers
final_index = build_index(args, corpus_data_loader, inner_model)
text_list, text2similar_text = gen_text_file(args.similar_text_pair_file)
query_ds = MapDataset(text_list)
query_data_loader = create_dataloader(
query_ds, mode="predict", batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func
)
query_embedding = inner_model.get_semantic_embedding(query_data_loader)
if not os.path.exists(args.recall_result_dir):
os.mkdir(args.recall_result_dir)
recall_result_file = os.path.join(args.recall_result_dir, args.recall_result_file)
with open(recall_result_file, "w", encoding="utf-8") as f:
for batch_index, batch_query_embedding in enumerate(query_embedding):
recalled_idx, cosine_sims = final_index.knn_query(batch_query_embedding.numpy(), args.recall_num)
batch_size = len(cosine_sims)
for row_index in range(batch_size):
text_index = args.batch_size * batch_index + row_index
for idx, doc_idx in enumerate(recalled_idx[row_index]):
f.write(
"{}\t{}\t{}\n".format(
text_list[text_index]["text"], id2corpus[doc_idx], 1.0 - cosine_sims[row_index][idx]
)
)
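# A run sketch (the script name recall.py is an assumption; the data and
# checkpoint paths follow the run scripts below):
#   python -u recall.py \
#       --corpus_file data/corpus.csv \
#       --similar_text_pair_file data/test_pair.csv \
#       --params_path checkpoints/model_100/model_state.pdparams \
#       --output_emb_size 256 --recall_num 10
# Each query then gets recall_num lines of "query \t recalled_doc \t similarity"
# in the recall result file, which evaluate.py consumes.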
pymilvus>=2.1.0
pandas==0.25.1
paddlenlp>=2.3.7
paddlepaddle-gpu>=2.2.3
hnswlib>=0.5.2
numpy>=1.17.2
visualdl>=2.2.2
pybind11
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import numpy as np
import pandas as pd
from config import collection_name, embedding_name, partition_tag
from milvus_util import RecallByMilvus
from paddle_serving_server.pipeline import PipelineClient
def recall_result(list_data):
client = PipelineClient()
client.connect(["127.0.0.1:8080"])
feed = {}
for i, item in enumerate(list_data):
feed[str(i)] = item
start_time = time.time()
ret = client.predict(feed_dict=feed)
end_time = time.time()
print("Extract feature time to cost :{} seconds".format(end_time - start_time))
result = np.array(eval(ret.value[0]))
return result
def search_in_milvus(embeddings, query_text):
recall_client = RecallByMilvus()
start_time = time.time()
results = recall_client.search(
embeddings,
embedding_name,
collection_name,
partition_names=[partition_tag],
output_fields=["pk", "question", "answer"],
)
end_time = time.time()
print("Search milvus time cost is {} seconds ".format(end_time - start_time))
list_data = []
for line in results:
for item in line:
distance = item.distance
question = item.entity.get("question")
answer = item.entity.get("answer")
print(question, answer, distance)
list_data.append([query_text, question, answer, distance])
df = pd.DataFrame(list_data, columns=["query_text", "question", "answer", "distance"])
df.to_csv("faq_result.csv", index=False)
if __name__ == "__main__":
list_data = ["买了社保,是不是就不用买商业保险了?"]
result = recall_result(list_data)
search_in_milvus(result, list_data[0])
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
python -u evaluate.py \
--similar_text_pair "data/test_pair.csv" \
--recall_result_file "./recall_result_dir/recall_result.txt" \
--recall_num 10
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
python export_model.py --params_path checkpoints/model_100/model_state.pdparams \
--output_path=./output \
--model_name_or_path rocketqa-zh-base-query-encoder