"docs/source/en/model_doc/layoutlmv2.mdx" did not exist on "b5e2b183af5e40e33a4dc7659e697d137259d56e"
Commit 10f294ff authored by yuguo-Jack's avatar yuguo-Jack
Browse files

llama_paddle

parent 7c64e6ec
Pipeline #678 failed with stages
in 0 seconds
#!/bin/bash
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
export CUDA_VISIBLE_DEVICES=0
QUESTION=$1
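# Usage (as invoked by script/run_cross_encoder_test.sh below):
#   bash run_test.sh $QUESTION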
if [ ! -d output ]; then
mkdir output
fi
if [ ! -d log ]; then
mkdir log
fi
python3 change_to_rerank.py ${QUESTION}
python3 -u ./src/train_ce.py \
--use_cuda true \
--verbose true \
--do_train false \
--do_val false \
--do_test true \
--batch_size 128 \
--init_checkpoint "./checkpoints/ranker" \
--test_set "./data/demo.tsv" \
--test_save "data/demo.score" \
--max_seq_len 384 \
--for_cn true \
--vocab_path "config/ernie_base_1.0_CN/vocab.txt" \
--ernie_config_path "config/ernie_base_1.0_CN/ernie_config.json" \
1>>log/train.log 2>&1
#!/bin/bash
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
export CUDA_VISIBLE_DEVICES=0
if [ $# != 4 ]; then
echo "USAGE: sh run_train.sh \$TRAIN_SET \$MODEL_PATH \$epoch \$nodes_count"
exit 1
fi
TRAIN_SET=$1
MODEL_PATH=$2
epoch=$3
node=$4
CHECKPOINT_PATH=output
if [ ! -d output ]; then
mkdir output
fi
if [ ! -d log ]; then
mkdir log
fi
lr=1e-5
batch_size=32
train_examples=$(cat $TRAIN_SET | wc -l)
save_steps=$((train_examples / batch_size / node))
data_size=$((save_steps * batch_size * node))
new_save_steps=$((save_steps * epoch / 2))
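# Worked example (illustrative numbers): with 64000 training lines,
# batch_size=32 and node=1, save_steps = 64000/32/1 = 2000 steps per epoch,
# data_size = 2000*32*1 = 64000 examples, and with epoch=3 the checkpoint
# interval new_save_steps = 2000*3/2 = 3000 steps.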
python3 -m paddle.distributed.launch \
--log_dir log \
./src/train_ce.py \
--use_cuda true \
--verbose true \
--do_train true \
--do_val false \
--do_test false \
--use_mix_precision false \
--train_data_size ${data_size} \
--batch_size ${batch_size} \
--init_pretraining_params ${MODEL_PATH} \
--train_set ${TRAIN_SET} \
--save_steps ${new_save_steps} \
--validation_steps ${new_save_steps} \
--checkpoints ${CHECKPOINT_PATH} \
--weight_decay 0.01 \
--warmup_proportion 0.0 \
--epoch $epoch \
--max_seq_len 384 \
--for_cn true \
--vocab_path config/ernie_base_1.0_CN/vocab.txt \
--ernie_config_path config/ernie_base_1.0_CN/ernie_config.json \
--learning_rate ${lr} \
--skip_steps 10 \
--num_iteration_per_drop_scope 1 \
--num_labels 2 \
--random_seed 1
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mask, padding and batching."""
import numpy as np
def pad_batch_data(
insts,
pad_idx=0,
return_pos=False,
return_input_mask=False,
return_max_len=False,
return_num_token=False,
return_seq_lens=False,
):
"""
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and attention bias.
"""
return_list = []
max_len = max(len(inst) for inst in insts)
# Any token included in dict can be used to pad, since the paddings' loss
# will be masked out by weights and have no effect on parameter gradients.
inst_data = np.array([inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]
# position data
if return_pos:
inst_pos = np.array([list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst)) for inst in insts])
return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]
if return_input_mask:
# This is used to avoid attention on paddings.
input_mask_data = np.array([[1] * len(inst) + [0] * (max_len - len(inst)) for inst in insts])
input_mask_data = np.expand_dims(input_mask_data, axis=-1)
return_list += [input_mask_data.astype("float32")]
if return_max_len:
return_list += [max_len]
if return_num_token:
num_token = 0
for inst in insts:
num_token += len(inst)
return_list += [num_token]
if return_seq_lens:
seq_lens = np.array([len(inst) for inst in insts])
return_list += [seq_lens.astype("int64").reshape([-1, 1])]
return return_list if len(return_list) > 1 else return_list[0]
if __name__ == "__main__":
pass
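# Hedged usage sketch (added for illustration, not part of the original file):
def _demo_pad_batch_data():
    insts = [[5, 6, 7], [8, 9]]  # two token-id sequences of unequal length
    padded, pos, mask = pad_batch_data(insts, pad_idx=0, return_pos=True, return_input_mask=True)
    assert padded.shape == (2, 3, 1)  # ids padded to the batch max length
    assert pos.shape == (2, 3, 1)     # position ids, padded with pad_idx
    assert mask.shape == (2, 3, 1)    # 1.0 for real tokens, 0.0 for padding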
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model for classifier."""
import logging
import time
import numpy as np
import paddle.fluid as fluid
from model.ernie import ErnieModel
from scipy.stats import pearsonr, spearmanr
log = logging.getLogger(__name__)
def create_model(args, pyreader_name, ernie_config, is_prediction=False, task_name=""):
pyreader = fluid.layers.py_reader(
capacity=50,
shapes=[
[-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1],
[-1, 1],
[-1, 1],
],
dtypes=["int64", "int64", "int64", "int64", "float32", "int64", "int64"],
lod_levels=[0, 0, 0, 0, 0, 0, 0],
name=task_name + "_" + pyreader_name,
use_double_buffer=True,
)
(src_ids, sent_ids, pos_ids, task_ids, input_mask, labels, qids) = fluid.layers.read_file(pyreader)
def _model(is_noise=False):
ernie = ErnieModel(
src_ids=src_ids,
position_ids=pos_ids,
sentence_ids=sent_ids,
task_ids=task_ids,
input_mask=input_mask,
config=ernie_config,
is_noise=is_noise,
)
cls_feats = ernie.get_pooled_output()
if not is_noise:
cls_feats = fluid.layers.dropout(x=cls_feats, dropout_prob=0.1, dropout_implementation="upscale_in_train")
logits = fluid.layers.fc(
input=cls_feats,
size=args.num_labels,
param_attr=fluid.ParamAttr(
name=task_name + "_cls_out_w", initializer=fluid.initializer.TruncatedNormal(scale=0.02)
),
bias_attr=fluid.ParamAttr(name=task_name + "_cls_out_b", initializer=fluid.initializer.Constant(0.0)),
)
"""
if is_prediction:
probs = fluid.layers.softmax(logits)
feed_targets_name = [
src_ids.name, sent_ids.name, pos_ids.name, input_mask.name
]
if ernie_version == "2.0":
feed_targets_name += [task_ids.name]
return pyreader, probs, feed_targets_name
"""
num_seqs = fluid.layers.create_tensor(dtype="int64")
# softmax cross-entropy loss (probabilities are also returned for metrics)
ce_loss, probs = fluid.layers.softmax_with_cross_entropy(logits=logits, label=labels, return_softmax=True)
loss = fluid.layers.mean(x=ce_loss)
accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs)
graph_vars = {
"loss": loss,
"probs": probs,
"accuracy": accuracy,
"labels": labels,
"num_seqs": num_seqs,
"qids": qids,
}
return graph_vars
if not is_prediction:
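# Adversarial training (FGM-style, an interpretation of the code below): run a
# clean forward pass, take the gradient of the loss w.r.t. the word-embedding
# table, add a small L2-normalized perturbation gp (epsilon guards against a
# zero norm), rebuild the graph on the perturbed embeddings, then subtract gp
# to restore the original embedding table.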
graph_vars = _model(is_noise=True)
old_loss = graph_vars["loss"]
token_emb = fluid.default_main_program().global_block().var("word_embedding")
token_emb.stop_gradient = False
token_gradient = fluid.gradients(old_loss, token_emb)[0]
token_gradient.stop_gradient = False
epsilon = 1e-8
norm = fluid.layers.sqrt(fluid.layers.reduce_sum(fluid.layers.square(token_gradient)) + epsilon)
gp = (0.01 * token_gradient) / norm
gp.stop_gradient = True
fluid.layers.assign(token_emb + gp, token_emb)
graph_vars = _model()
fluid.layers.assign(token_emb - gp, token_emb)
else:
graph_vars = _model()
return pyreader, graph_vars
def evaluate_mrr(preds):
last_qid = None
total_mrr = 0.0
qnum = 0.0
rank = 0.0
correct = False
for qid, score, label in preds:
if qid != last_qid:
rank = 0.0
qnum += 1
correct = False
last_qid = qid
rank += 1
if not correct and label != 0:
total_mrr += 1.0 / rank
correct = True
return total_mrr / qnum
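# Hedged usage sketch (added for illustration, not in the original file):
# evaluate_mrr expects (qid, score, label) triples grouped by qid and sorted
# by descending score, as produced in the "acc_and_f1_and_mrr" branch below.
def _demo_evaluate_mrr():
    # q1: first relevant item at rank 2 -> 1/2; q2: relevant at rank 1 -> 1/1
    preds = [("q1", 0.9, 0), ("q1", 0.8, 1), ("q2", 0.7, 1)]
    assert abs(evaluate_mrr(preds) - 0.75) < 1e-9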
def evaluate(
exe, test_program, test_pyreader, graph_vars, eval_phase, use_multi_gpu_test=False, metric="simple_accuracy"
):
train_fetch_list = [graph_vars["loss"].name, graph_vars["accuracy"].name, graph_vars["num_seqs"].name]
if eval_phase == "train":
if "learning_rate" in graph_vars:
train_fetch_list.append(graph_vars["learning_rate"].name)
outputs = exe.run(fetch_list=train_fetch_list, program=test_program)
ret = {"loss": np.mean(outputs[0]), "accuracy": np.mean(outputs[1])}
if "learning_rate" in graph_vars:
ret["learning_rate"] = float(outputs[3][0])
return ret
test_pyreader.start()
total_cost = 0.0
total_acc = 0.0
total_num_seqs = 0.0
total_label_pos_num = 0.0
total_pred_pos_num = 0.0
total_correct_num = 0.0
qids, labels, scores, preds = [], [], [], []
time_begin = time.time()
fetch_list = [
graph_vars["loss"].name,
graph_vars["accuracy"].name,
graph_vars["probs"].name,
graph_vars["labels"].name,
graph_vars["num_seqs"].name,
graph_vars["qids"].name,
]
while True:
try:
if use_multi_gpu_test:
np_loss, np_acc, np_probs, np_labels, np_num_seqs, np_qids = exe.run(fetch_list=fetch_list)
else:
np_loss, np_acc, np_probs, np_labels, np_num_seqs, np_qids = exe.run(
program=test_program, fetch_list=fetch_list
)
total_cost += np.sum(np_loss * np_num_seqs)
total_acc += np.sum(np_acc * np_num_seqs)
total_num_seqs += np.sum(np_num_seqs)
labels.extend(np_labels.reshape((-1)).tolist())
if np_qids is None:
np_qids = np.array([])
qids.extend(np_qids.reshape(-1).tolist())
scores.extend(np_probs[:, 1].reshape(-1).tolist())
np_preds = np.argmax(np_probs, axis=1).astype(np.float32)
preds.extend(np_preds)
total_label_pos_num += np.sum(np_labels)
total_pred_pos_num += np.sum(np_preds)
total_correct_num += np.sum(np.dot(np_preds, np_labels))
except fluid.core.EOFException:
test_pyreader.reset()
break
time_end = time.time()
cost = total_cost / total_num_seqs
elapsed_time = time_end - time_begin
evaluate_info = ""
if metric == "acc_and_f1":
ret = acc_and_f1(preds, labels)
evaluate_info = "[%s evaluation] ave loss: %f, ave_acc: %f, f1: %f, data_num: %d, elapsed time: %f s" % (
eval_phase,
cost,
ret["acc"],
ret["f1"],
total_num_seqs,
elapsed_time,
)
elif metric == "matthews_corrcoef":
ret = matthews_corrcoef(preds, labels)
evaluate_info = "[%s evaluation] ave loss: %f, matthews_corrcoef: %f, data_num: %d, elapsed time: %f s" % (
eval_phase,
cost,
ret,
total_num_seqs,
elapsed_time,
)
elif metric == "pearson_and_spearman":
ret = pearson_and_spearman(scores, labels)
evaluate_info = (
"[%s evaluation] ave loss: %f, pearson:%f, spearman:%f, corr:%f, data_num: %d, elapsed time: %f s"
% (eval_phase, cost, ret["pearson"], ret["spearman"], ret["corr"], total_num_seqs, elapsed_time)
)
elif metric == "simple_accuracy":
ret = simple_accuracy(preds, labels)
evaluate_info = "[%s evaluation] ave loss: %f, acc:%f, data_num: %d, elapsed time: %f s" % (
eval_phase,
cost,
ret,
total_num_seqs,
elapsed_time,
)
elif metric == "acc_and_f1_and_mrr":
ret_a = acc_and_f1(preds, labels)
preds = sorted(zip(qids, scores, labels), key=lambda elem: (elem[0], -elem[1]))
ret_b = evaluate_mrr(preds)
evaluate_info = "[%s evaluation] ave loss: %f, acc: %f, f1: %f, mrr: %f, data_num: %d, elapsed time: %f s" % (
eval_phase,
cost,
ret_a["acc"],
ret_a["f1"],
ret_b,
total_num_seqs,
elapsed_time,
)
else:
raise ValueError("unsupported metric {}".format(metric))
return evaluate_info
def matthews_corrcoef(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
tp = np.sum((labels == 1) & (preds == 1))
tn = np.sum((labels == 0) & (preds == 0))
fp = np.sum((labels == 0) & (preds == 1))
fn = np.sum((labels == 1) & (preds == 0))
mcc = ((tp * tn) - (fp * fn)) / np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
return mcc
def f1_score(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
tp = np.sum((labels == 1) & (preds == 1))
fp = np.sum((labels == 0) & (preds == 1))
fn = np.sum((labels == 1) & (preds == 0))
p = tp / (tp + fp)
r = tp / (tp + fn)
f1 = (2 * p * r) / (p + r + 1e-8)
return f1
def pearson_and_spearman(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
pearson_corr = pearsonr(preds, labels)[0]
spearman_corr = spearmanr(preds, labels)[0]
return {
"pearson": pearson_corr,
"spearmanr": spearman_corr,
"corr": (pearson_corr + spearman_corr) / 2,
}
def acc_and_f1(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
acc = simple_accuracy(preds, labels)
f1 = f1_score(preds, labels)
return {
"acc": acc,
"f1": f1,
"acc_and_f1": (acc + f1) / 2,
}
def simple_accuracy(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
return (preds == labels).mean()
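# Hedged usage sketch (illustrative only): the metric helpers above on a toy
# batch; tp=2, fp=1, fn=0 gives precision 2/3 and recall 1, so f1 ~= 0.8.
def _demo_metrics():
    preds = [1, 0, 1, 1]
    labels = [1, 0, 0, 1]
    assert simple_accuracy(preds, labels) == 0.75
    ret = acc_and_f1(preds, labels)
    assert abs(ret["f1"] - 0.8) < 1e-6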
def predict(exe, test_program, test_pyreader, graph_vars, dev_count=1):
test_pyreader.start()
qids, probs = [], []
preds = []
fetch_list = [graph_vars["probs"].name, graph_vars["qids"].name]
while True:
try:
if dev_count == 1:
np_probs, np_qids = exe.run(program=test_program, fetch_list=fetch_list)
else:
np_probs, np_qids = exe.run(fetch_list=fetch_list)
if np_qids is None:
np_qids = np.array([])
qids.extend(np_qids.reshape(-1).tolist())
np_preds = np.argmax(np_probs, axis=1).astype(np.float32)
preds.extend(np_preds)
probs.append(np_probs)
except fluid.core.EOFException:
test_pyreader.reset()
break
probs = np.concatenate(probs, axis=0).reshape([len(preds), -1])
return qids, preds, probs
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from src.utils.args import ArgumentGroup
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("ernie_config_path", str, None, "Path to the json file for ernie model config.")
model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
model_g.add_arg("init_pretraining_params", str, None, "Init pre-training params which preforms fine-tuning from. If the arg 'init_checkpoint' has been set, this argument wouldn't be valid.")
model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
model_g.add_arg("is_classify", bool, True, "is_classify")
model_g.add_arg("is_regression", bool, False, "is_regression")
model_g.add_arg("task_id", int, 0, "task id")
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
train_g.add_arg("lr_scheduler", str, "linear_warmup_decay", "scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
train_g.add_arg("warmup_proportion", float, 0.1, "Proportion of training steps to perform linear learning rate warmup for.")
train_g.add_arg("save_steps", int, 10000, "The steps interval to save checkpoints.")
train_g.add_arg("validation_steps", int, 1000, "The steps interval to evaluate model performance.")
train_g.add_arg("use_recompute", bool, False, "Whether to use recompute optimizer for training.")
train_g.add_arg("use_mix_precision", bool, False, "Whether to use mix-precision optimizer for training.")
train_g.add_arg("use_cross_batch", bool, False, "Whether to use cross-batch for training.")
train_g.add_arg("use_lamb", bool, False, "Whether to use LambOptimizer for training.")
train_g.add_arg("use_dynamic_loss_scaling", bool, True, "Whether to use dynamic loss scaling.")
train_g.add_arg("test_save", str, "./checkpoints/test_result", "test_save")
train_g.add_arg("metric", str, "simple_accuracy", "metric")
train_g.add_arg("incr_every_n_steps", int, 100, "Increases loss scaling every n consecutive.")
train_g.add_arg("decr_every_n_nan_or_inf", int, 2, "Decreases loss scaling every n accumulated steps with nan or inf gradients.")
train_g.add_arg("incr_ratio", float, 2.0, "The multiplier to use when increasing the loss scaling.")
train_g.add_arg("decr_ratio", float, 0.8, "The less-than-one-multiplier to use when decreasing.")
log_g = ArgumentGroup(parser, "logging", "logging related.")
log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")
data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
data_g.add_arg("tokenizer", str, "FullTokenizer", "ATTENTION: the INPUT must be splited by Word with blank while using SentencepieceTokenizer or WordsegTokenizer")
data_g.add_arg("train_set", str, None, "Path to training data.")
data_g.add_arg("test_set", str, None, "Path to test data.")
data_g.add_arg("dev_set", str, None, "Path to validation data.")
data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
data_g.add_arg("q_max_seq_len", int, 32, "Number of words of the longest seqence.")
data_g.add_arg("p_max_seq_len", int, 256, "Number of words of the longest seqence.")
data_g.add_arg("train_data_size", int, 0, "Number of training data's total examples. Set for distribute.")
data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. see also --in_tokens.")
data_g.add_arg("predict_batch_size", int, None, "Total examples' number in batch for predict. see also --in_tokens.")
data_g.add_arg("in_tokens", bool, False, "If set, the batch size will be the maximum number of tokens in one batch. Otherwise, it will be the maximum number of examples in one batch.")
data_g.add_arg("do_lower_case", bool, True, "Whether to lower case the input text. Should be True for uncased models and False for cased models.")
data_g.add_arg("random_seed", int, None, "Random seed.")
data_g.add_arg("label_map_config", str, None, "label_map_path.")
data_g.add_arg("num_labels", int, 2, "label number")
data_g.add_arg("diagnostic", str, None, "GLUE Diagnostic Dataset")
data_g.add_arg("diagnostic_save", str, None, "GLUE Diagnostic save f")
data_g.add_arg("max_query_length", int, 64, "Max query length.")
data_g.add_arg("max_answer_length", int, 100, "Max answer length.")
data_g.add_arg("doc_stride", int, 128, "When splitting up a long document into chunks, how much stride to take between chunks.")
data_g.add_arg("n_best_size", int, 20, "The total number of n-best predictions to generate in the nbest_predictions.json output file.")
data_g.add_arg("chunk_scheme", type=str, default="IOB", choices=["IO", "IOB", "IOE", "IOBES"], help="chunk scheme")
run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
run_type_g.add_arg("is_distributed", bool, False, "If set, then start distributed training.")
run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (in experiment).")
run_type_g.add_arg("num_iteration_per_drop_scope", int, 10, "Iteration intervals to drop scope.")
run_type_g.add_arg("do_train", bool, True, "Whether to perform training.")
run_type_g.add_arg("do_val", bool, True, "Whether to perform evaluation on dev data set.")
run_type_g.add_arg("do_test", bool, True, "Whether to perform evaluation on test data set.")
run_type_g.add_arg("output_item", int, 3, "Test output format.")
run_type_g.add_arg("output_file_name", str, None, "Test output file name")
run_type_g.add_arg("test_data_cnt", int, 1110000 , "total cnt of testset")
run_type_g.add_arg("use_multi_gpu_test", bool, False, "Whether to perform evaluation using multiple gpu cards")
run_type_g.add_arg("metrics", bool, True, "Whether to perform evaluation on test data set.")
run_type_g.add_arg("shuffle", bool, True, "")
run_type_g.add_arg("for_cn", bool, False, "model train for cn or for other langs.")
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import faiss
import numpy as np
def read_embed(file_name, dim=768, bs=3000):
if file_name.endswith("npy"):
i = 0
emb_np = np.load(file_name)
while i < len(emb_np):
vec_list = emb_np[i : i + bs]
i += bs
yield vec_list
else:
vec_list = []
with open(file_name) as inp:
for line in inp:
data = line.strip()
vector = [float(item) for item in data.split(" ")]
assert len(vector) == dim
vec_list.append(vector)
if len(vec_list) == bs:
yield vec_list
vec_list = []
if vec_list:
yield vec_list
def load_qid(file_name):
qid_list = []
with open(file_name) as inp:
for line in inp:
line = line.strip()
qid = line.split("\t")[0]
qid_list.append(qid)
return qid_list
def search(index, emb_file, qid_list, outfile, top_k):
q_idx = 0
with open(outfile, "w") as out:
for batch_vec in read_embed(emb_file):
q_emb_matrix = np.array(batch_vec)
res_dist, res_p_id = index.search(q_emb_matrix.astype("float32"), top_k)
for i in range(len(q_emb_matrix)):
qid = qid_list[q_idx]
for j in range(top_k):
pid = res_p_id[i][j]
score = res_dist[i][j]
out.write("%s\t%s\t%s\t%s\n" % (qid, pid, j + 1, score))
q_idx += 1
def main():
part = sys.argv[1]
topk = int(sys.argv[2])
q_text_file = sys.argv[3]
outfile = "output/res.top%s-part%s" % (topk, part)
qid_list = load_qid(q_text_file)
engine = faiss.read_index("output/para.index.part%s" % part)
emb_file = "output/query.emb.npy"
search(engine, emb_file, qid_list, outfile, topk)
if __name__ == "__main__":
main()
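# Hedged sketch (illustrative, not part of the pipeline): the index file read
# above could be built from a passage-embedding matrix roughly like this; an
# exact inner-product index is assumed, the real index type may differ.
def _demo_build_index(dim=768):
    p_emb = np.random.rand(1000, dim).astype("float32")  # toy passage embeddings
    index = faiss.IndexFlatIP(dim)  # exact inner-product search over the shard
    index.add(p_emb)
    dist, pid = index.search(p_emb[:2], 5)  # top-5 passage ids per query
    assert pid.shape == (2, 5)
    return index  # a real pipeline would persist it with faiss.write_index(index, path)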
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
shift = int(sys.argv[1])
top = int(sys.argv[2])
total_part = int(sys.argv[3])
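# This script merges per-part top-k retrieval results: each part indexed a
# shard of the passage collection, so a local passage id from part i is
# offset by shift * i to recover the global id before re-sorting by score.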
f_list = []
for part in range(total_part):
f0 = open("output/res.top%s-part%s" % (top, part))
f_list.append(f0)
line_list = []
for part in range(total_part):
line = f_list[part].readline()
line_list.append(line)
out = open("output/dev.res.top%s" % top, "w")
last_q = ""
ans_list = {}
while line_list[-1]:
cur_list = []
for line in line_list:
sub = line.strip().split("\t")
cur_list.append(sub)
if last_q == "":
last_q = cur_list[0][0]
if cur_list[0][0] != last_q:
rank = sorted(ans_list.items(), key=lambda a: a[1], reverse=True)
for i in range(top):
out.write("%s\t%s\t%s\t%s\n" % (last_q, rank[i][0], i + 1, rank[i][1]))
ans_list = {}
for i, sub in enumerate(cur_list):
ans_list[int(sub[1]) + shift * i] = float(sub[-1])
last_q = cur_list[0][0]
line_list = []
for f0 in f_list:
line = f0.readline()
line_list.append(line)
rank = sorted(ans_list.items(), key=lambda a: a[1], reverse=True)
for i in range(top):
out.write("%s\t%s\t%s\t%s\n" % (last_q, rank[i][0], i + 1, rank[i][1]))
out.close()
print("output/dev.res.top%s" % top)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Ernie model."""
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import logging
from io import open
import paddle
import paddle.fluid as fluid
import six
from model.transformer_encoder import encoder, pre_process_layer
log = logging.getLogger(__name__)
class ErnieConfig(object):
def __init__(self, config_path):
self._config_dict = self._parse(config_path)
def _parse(self, config_path):
try:
with open(config_path, "r", encoding="utf8") as json_file:
config_dict = json.load(json_file)
except Exception:
raise IOError("Error in parsing Ernie model config file '%s'" % config_path)
else:
return config_dict
def __getitem__(self, key):
return self._config_dict.get(key, None)
def print_config(self):
for arg, value in sorted(six.iteritems(self._config_dict)):
log.info("%s: %s" % (arg, value))
log.info("------------------------------------------------")
class ErnieModel(object):
def __init__(
self,
src_ids,
position_ids,
sentence_ids,
task_ids,
input_mask,
config,
weight_sharing=True,
model_name="",
is_noise=False,
):
self._emb_size = config["hidden_size"]
self._n_layer = config["num_hidden_layers"]
self._n_head = config["num_attention_heads"]
self._voc_size = config["vocab_size"]
self._max_position_seq_len = config["max_position_embeddings"]
if config["sent_type_vocab_size"]:
self._sent_types = config["sent_type_vocab_size"]
else:
self._sent_types = config["type_vocab_size"]
self._use_task_id = config["use_task_id"]
if self._use_task_id:
self._task_types = config["task_type_vocab_size"]
self._hidden_act = config["hidden_act"]
self._prepostprocess_dropout = config["hidden_dropout_prob"]
self._attention_dropout = config["attention_probs_dropout_prob"]
if is_noise:
self._prepostprocess_dropout = 0
self._attention_dropout = 0
self._weight_sharing = weight_sharing
self.checkpoints = []
self._word_emb_name = "word_embedding"
self._pos_emb_name = "pos_embedding"
self._sent_emb_name = "sent_embedding"
self._task_emb_name = "task_embedding"
self._emb_dtype = "float32"
# Initialize all weights by truncated normal initializer, and all biases
# will be initialized by constant zero by default.
self._param_initializer = fluid.initializer.TruncatedNormal(scale=config["initializer_range"])
self._build_model(model_name, src_ids, position_ids, sentence_ids, task_ids, input_mask)
def _build_model(self, model_name, src_ids, position_ids, sentence_ids, task_ids, input_mask):
# padding id in vocabulary must be set to 0
emb_out = fluid.layers.embedding(
input=src_ids,
size=[self._voc_size, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(name=model_name + self._word_emb_name, initializer=self._param_initializer),
is_sparse=False,
)
position_emb_out = fluid.layers.embedding(
input=position_ids,
size=[self._max_position_seq_len, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(name=model_name + self._pos_emb_name, initializer=self._param_initializer),
)
sent_emb_out = fluid.layers.embedding(
sentence_ids,
size=[self._sent_types, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(name=model_name + self._sent_emb_name, initializer=self._param_initializer),
)
emb_out = emb_out + position_emb_out
emb_out = emb_out + sent_emb_out
if self._use_task_id:
task_emb_out = fluid.layers.embedding(
task_ids,
size=[self._task_types, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(name=model_name + self._task_emb_name, initializer=self._param_initializer),
)
emb_out = emb_out + task_emb_out
emb_out = pre_process_layer(emb_out, "nd", self._prepostprocess_dropout, name=model_name + "pre_encoder")
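# Build the additive attention bias: the outer product of the [batch, len, 1]
# input mask with itself is 1 where both positions are real tokens; the scale
# op computes 10000 * (x - 1), mapping real pairs to 0 and padded pairs to
# -10000, which softmax treats as effectively -inf.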
self_attn_mask = paddle.matmul(x=input_mask, y=input_mask, transpose_y=True)
self_attn_mask = fluid.layers.scale(x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
n_head_self_attn_mask = fluid.layers.stack(x=[self_attn_mask] * self._n_head, axis=1)
n_head_self_attn_mask.stop_gradient = True
self._enc_out, self.checkpoints = encoder(
enc_input=emb_out,
attn_bias=n_head_self_attn_mask,
n_layer=self._n_layer,
n_head=self._n_head,
d_key=self._emb_size // self._n_head,
d_value=self._emb_size // self._n_head,
d_model=self._emb_size,
d_inner_hid=self._emb_size * 4,
prepostprocess_dropout=self._prepostprocess_dropout,
attention_dropout=self._attention_dropout,
relu_dropout=0,
hidden_act=self._hidden_act,
preprocess_cmd="",
postprocess_cmd="dan",
param_initializer=self._param_initializer,
model_name=model_name,
name=model_name + "encoder",
)
def get_sequence_output(self):
return self._enc_out
def get_cls_output(self):
"""Get the first feature of each sequence for classification"""
cls_output = fluid.layers.slice(input=self._enc_out, axes=[1], starts=[0], ends=[1])
cls_output = fluid.layers.squeeze(cls_output, axes=[1])
return cls_output
def get_pooled_output(self):
"""Get the first feature of each sequence for classification"""
next_sent_feat = fluid.layers.slice(input=self._enc_out, axes=[1], starts=[0], ends=[1])
next_sent_feat = fluid.layers.fc(
input=next_sent_feat,
size=self._emb_size,
act="tanh",
param_attr=fluid.ParamAttr(name="pooled_fc.w_0", initializer=self._param_initializer),
bias_attr="pooled_fc.b_0",
)
return next_sent_feat
def get_lm_output(self, mask_label, mask_pos):
"""Get the loss & accuracy for pretraining"""
mask_pos = fluid.layers.cast(x=mask_pos, dtype="int32")
# extract the first token feature in each sentence
self.next_sent_feat = self.get_pooled_output()
reshaped_emb_out = fluid.layers.reshape(x=self._enc_out, shape=[-1, self._emb_size])
# extract masked tokens' feature
mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)
# transform: fc
mask_trans_feat = fluid.layers.fc(
input=mask_feat,
size=self._emb_size,
act=self._hidden_act,
param_attr=fluid.ParamAttr(name="mask_lm_trans_fc.w_0", initializer=self._param_initializer),
bias_attr=fluid.ParamAttr(name="mask_lm_trans_fc.b_0"),
)
# transform: layer norm
mask_trans_feat = fluid.layers.layer_norm(
mask_trans_feat,
begin_norm_axis=len(mask_trans_feat.shape) - 1,
param_attr=fluid.ParamAttr(
name="mask_lm_trans_layer_norm_scale", initializer=fluid.initializer.Constant(1.0)
),
bias_attr=fluid.ParamAttr(
name="mask_lm_trans_layer_norm_bias", initializer=fluid.initializer.Constant(1.0)
),
)
# transform: layer norm
# mask_trans_feat = pre_process_layer(
# mask_trans_feat, 'n', name='mask_lm_trans')
mask_lm_out_bias_attr = fluid.ParamAttr(
name="mask_lm_out_fc.b_0", initializer=fluid.initializer.Constant(value=0.0)
)
if self._weight_sharing:
fc_out = paddle.matmul(
x=mask_trans_feat,
y=fluid.default_main_program().global_block().var(self._word_emb_name),
transpose_y=True,
)
fc_out += fluid.layers.create_parameter(
shape=[self._voc_size], dtype=self._emb_dtype, attr=mask_lm_out_bias_attr, is_bias=True
)
else:
fc_out = fluid.layers.fc(
input=mask_trans_feat,
size=self._voc_size,
param_attr=fluid.ParamAttr(name="mask_lm_out_fc.w_0", initializer=self._param_initializer),
bias_attr=mask_lm_out_bias_attr,
)
mask_lm_loss = fluid.layers.softmax_with_cross_entropy(logits=fc_out, label=mask_label)
mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)
return mean_mask_lm_loss
def get_task_output(self, task, task_labels):
task_fc_out = fluid.layers.fc(
input=self.next_sent_feat,
size=task["num_labels"],
param_attr=fluid.ParamAttr(name=task["task_name"] + "_fc.w_0", initializer=self._param_initializer),
bias_attr=task["task_name"] + "_fc.b_0",
)
task_loss, task_softmax = fluid.layers.softmax_with_cross_entropy(
logits=task_fc_out, label=task_labels, return_softmax=True
)
task_acc = fluid.layers.accuracy(input=task_softmax, label=task_labels)
mean_task_loss = fluid.layers.mean(task_loss)
return mean_task_loss, task_acc
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Transformer encoder."""
from __future__ import absolute_import, division, print_function
from functools import partial
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
def multi_head_attention(
queries,
keys,
values,
attn_bias,
d_key,
d_value,
d_model,
n_head=1,
dropout_rate=0.0,
cache=None,
param_initializer=None,
name="multi_head_att",
):
"""
Multi-Head Attention. Note that attn_bias is added to the logit before
computing softmax activation to mask certain selected positions so that
they will not be considered in the attention weights.
"""
keys = queries if keys is None else keys
values = keys if values is None else values
if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
raise ValueError("Inputs: queries, keys and values should all be 3-D tensors.")
def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
"""
Add linear projection to queries, keys, and values.
"""
q = layers.fc(
input=queries,
size=d_key * n_head,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(name=name + "_query_fc.w_0", initializer=param_initializer),
bias_attr=name + "_query_fc.b_0",
)
k = layers.fc(
input=keys,
size=d_key * n_head,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(name=name + "_key_fc.w_0", initializer=param_initializer),
bias_attr=name + "_key_fc.b_0",
)
v = layers.fc(
input=values,
size=d_value * n_head,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(name=name + "_value_fc.w_0", initializer=param_initializer),
bias_attr=name + "_value_fc.b_0",
)
return q, k, v
def __split_heads(x, n_head):
"""
Reshape the last dimension of input tensor x so that it becomes two
dimensions and then transpose. Specifically, input a tensor with shape
[bs, max_sequence_length, n_head * hidden_dim] then output a tensor
with shape [bs, n_head, max_sequence_length, hidden_dim].
"""
hidden_size = x.shape[-1]
# The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
reshaped = layers.reshape(x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)
# permute the dimensions into:
# [batch_size, n_head, max_sequence_len, hidden_size_per_head]
return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
def __combine_heads(x):
"""
Transpose and then reshape the last two dimensions of input tensor x
so that it becomes one dimension, which is reverse to __split_heads.
"""
if len(x.shape) == 3:
return x
if len(x.shape) != 4:
raise ValueError("Input(x) should be a 4-D Tensor.")
trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
# The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
return layers.reshape(x=trans_x, shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], inplace=True)
def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
"""
Scaled Dot-Product Attention
"""
scaled_q = layers.scale(x=q, scale=d_key**-0.5)
product = paddle.matmul(x=scaled_q, y=k, transpose_y=True)
if attn_bias:
product += attn_bias
weights = layers.softmax(product)
if dropout_rate:
weights = layers.dropout(
weights, dropout_prob=dropout_rate, dropout_implementation="upscale_in_train", is_test=False
)
out = paddle.matmul(weights, v)
return out
q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
if cache is not None: # use cache and concat time steps
# Since the inplace reshape in __split_heads changes the shape of k and
# v, which is the cache input for next time step, reshape the cache
# input from the previous time step first.
k = cache["k"] = layers.concat([layers.reshape(cache["k"], shape=[0, 0, d_model]), k], axis=1)
v = cache["v"] = layers.concat([layers.reshape(cache["v"], shape=[0, 0, d_model]), v], axis=1)
q = __split_heads(q, n_head)
k = __split_heads(k, n_head)
v = __split_heads(v, n_head)
ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate)
out = __combine_heads(ctx_multiheads)
# Project back to the model size.
proj_out = layers.fc(
input=out,
size=d_model,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(name=name + "_output_fc.w_0", initializer=param_initializer),
bias_attr=name + "_output_fc.b_0",
)
return proj_out
def positionwise_feed_forward(x, d_inner_hid, d_hid, dropout_rate, hidden_act, param_initializer=None, name="ffn"):
"""
Position-wise Feed-Forward Networks.
This module consists of two linear transformations with a ReLU activation
in between, which is applied to each position separately and identically.
"""
hidden = layers.fc(
input=x,
size=d_inner_hid,
num_flatten_dims=2,
act=hidden_act,
param_attr=fluid.ParamAttr(name=name + "_fc_0.w_0", initializer=param_initializer),
bias_attr=name + "_fc_0.b_0",
)
if dropout_rate:
hidden = layers.dropout(
hidden, dropout_prob=dropout_rate, dropout_implementation="upscale_in_train", is_test=False
)
out = layers.fc(
input=hidden,
size=d_hid,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(name=name + "_fc_1.w_0", initializer=param_initializer),
bias_attr=name + "_fc_1.b_0",
)
return out
def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.0, name=""):
"""
Add residual connection, layer normalization and dropout to the out tensor
optionally according to the value of process_cmd.
This will be used before or after multi-head attention and position-wise
feed-forward networks.
"""
for cmd in process_cmd:
if cmd == "a": # add residual connection
out = out + prev_out if prev_out is not None else out
elif cmd == "n": # add layer normalization
out_dtype = out.dtype
if out_dtype == fluid.core.VarDesc.VarType.FP16:
out = layers.cast(x=out, dtype="float32")
out = layers.layer_norm(
out,
begin_norm_axis=len(out.shape) - 1,
param_attr=fluid.ParamAttr(
name=name + "_layer_norm_scale", initializer=fluid.initializer.Constant(1.0)
),
bias_attr=fluid.ParamAttr(name=name + "_layer_norm_bias", initializer=fluid.initializer.Constant(0.0)),
)
if out_dtype == fluid.core.VarDesc.VarType.FP16:
out = layers.cast(x=out, dtype="float16")
elif cmd == "d": # add dropout
if dropout_rate:
out = layers.dropout(
out, dropout_prob=dropout_rate, dropout_implementation="upscale_in_train", is_test=False
)
return out
pre_process_layer = partial(pre_post_process_layer, None)
post_process_layer = pre_post_process_layer
def encoder_layer(
enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd="n",
postprocess_cmd="da",
param_initializer=None,
name="",
):
"""The encoder layers that can be stacked to form a deep encoder.
This module consists of a multi-head (self) attention followed by
position-wise feed-forward networks; both components are wrapped by
post_process_layer to add the residual connection, layer normalization
and dropout.
"""
attn_output = multi_head_attention(
pre_process_layer(enc_input, preprocess_cmd, prepostprocess_dropout, name=name + "_pre_att"),
None,
None,
attn_bias,
d_key,
d_value,
d_model,
n_head,
attention_dropout,
param_initializer=param_initializer,
name=name + "_multi_head_att",
)
attn_output = post_process_layer(
enc_input, attn_output, postprocess_cmd, prepostprocess_dropout, name=name + "_post_att"
)
ffd_output = positionwise_feed_forward(
pre_process_layer(attn_output, preprocess_cmd, prepostprocess_dropout, name=name + "_pre_ffn"),
d_inner_hid,
d_model,
relu_dropout,
hidden_act,
param_initializer=param_initializer,
name=name + "_ffn",
)
return (
post_process_layer(attn_output, ffd_output, postprocess_cmd, prepostprocess_dropout, name=name + "_post_ffn"),
ffd_output,
)
def encoder(
enc_input,
attn_bias,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd="n",
postprocess_cmd="da",
param_initializer=None,
model_name="",
name="",
):
"""
The encoder is composed of a stack of identical layers returned by calling
encoder_layer.
"""
checkpoints = []
for i in range(n_layer):
enc_output, cp = encoder_layer(
enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd,
postprocess_cmd,
param_initializer=param_initializer,
name=name + "_layer_" + str(i),
)
checkpoints.append(cp)
enc_input = enc_output
enc_output = pre_process_layer(
enc_output, preprocess_cmd, prepostprocess_dropout, name=model_name + "post_encoder"
)
return enc_output, checkpoints
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimization and learning rate scheduling."""
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.collective import fleet
def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
"""Applies linear warmup of learning rate from 0 and decay to 0."""
with fluid.default_main_program()._lr_schedule_guard():
lr = fluid.layers.tensor.create_global_var(
shape=[1], value=0.0, dtype="float32", persistable=True, name="scheduled_learning_rate"
)
global_step = fluid.layers.learning_rate_scheduler._decay_step_counter()
with fluid.layers.control_flow.Switch() as switch:
with switch.case(global_step < warmup_steps):
warmup_lr = learning_rate * (global_step / warmup_steps)
fluid.layers.tensor.assign(warmup_lr, lr)
with switch.default():
decayed_lr = fluid.layers.learning_rate_scheduler.polynomial_decay(
learning_rate=learning_rate,
decay_steps=num_train_steps,
end_learning_rate=0.0,
power=1.0,
cycle=False,
)
fluid.layers.tensor.assign(decayed_lr, lr)
return lr
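# Worked example (illustrative numbers): with learning_rate=1e-5 and
# warmup_steps=100, the lr at step 50 is 50/100 * 1e-5 = 5e-6; after step 100
# the decayed branch follows learning_rate * (1 - step/num_train_steps),
# i.e. a linear (power=1.0) decay reaching 0 at num_train_steps.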
def optimization(
loss,
warmup_steps,
num_train_steps,
learning_rate,
train_program,
startup_prog,
weight_decay,
scheduler="linear_warmup_decay",
use_dynamic_loss_scaling=False,
incr_every_n_steps=1000,
decr_every_n_nan_or_inf=2,
incr_ratio=2.0,
decr_ratio=0.8,
dist_strategy=None,
use_lamb=False,
):
if warmup_steps > 0:
if scheduler == "noam_decay":
scheduled_lr = fluid.layers.learning_rate_scheduler.noam_decay(
1 / (warmup_steps * (learning_rate**2)), warmup_steps
)
elif scheduler == "linear_warmup_decay":
scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps, num_train_steps)
else:
raise ValueError("Unknown learning rate scheduler, should be " "'noam_decay' or 'linear_warmup_decay'")
if use_lamb:
optimizer = fluid.optimizer.LambOptimizer(learning_rate=scheduled_lr)
else:
optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
else:
scheduled_lr = fluid.layers.create_global_var(
name=fluid.unique_name.generate("learning_rate"),
shape=[1],
value=learning_rate,
dtype="float32",
persistable=True,
)
if use_lamb:
optimizer = fluid.optimizer.LambOptimizer(learning_rate=scheduled_lr)
else:
optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
optimizer._learning_rate_map[fluid.default_main_program()] = scheduled_lr
fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
def exclude_from_weight_decay(name):
if name.find("layer_norm") > -1:
return True
bias_suffix = ["_bias", "_b", ".b_0"]
for suffix in bias_suffix:
if name.endswith(suffix):
return True
return False
param_list = dict()
for param in train_program.global_block().all_parameters():
param_list[param.name] = param * 1.0
param_list[param.name].stop_gradient = True
if dist_strategy is not None:
# use fleet api
optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
_, param_grads = optimizer.minimize(loss)
if weight_decay > 0:
for param, grad in param_grads:
if exclude_from_weight_decay(param.name):
continue
with param.block.program._optimized_guard([param, grad]), fluid.framework.name_scope("weight_decay"):
updated_param = param - param_list[param.name] * weight_decay * scheduled_lr
fluid.layers.assign(output=param, input=updated_param)
return scheduled_lr
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Arguments for configuration."""
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import os
import sys
import paddle.fluid as fluid
import six
from paddlenlp.trainer.argparser import strtobool
log = logging.getLogger(__name__)
def prepare_logger(logger, debug=False, save_to_file=None):
formatter = logging.Formatter(fmt="[%(levelname)s] %(asctime)s [%(filename)12s:%(lineno)5d]:\t%(message)s")
console_hdl = logging.StreamHandler()
console_hdl.setFormatter(formatter)
logger.addHandler(console_hdl)
if save_to_file is not None and not os.path.exists(save_to_file):
file_hdl = logging.FileHandler(save_to_file)
file_hdl.setFormatter(formatter)
logger.addHandler(file_hdl)
logger.setLevel(logging.DEBUG)
logger.propagate = False
class ArgumentGroup(object):
def __init__(self, parser, title, des):
self._group = parser.add_argument_group(title=title, description=des)
def add_arg(self, name, type, default, help, positional_arg=False, **kwargs):
prefix = "" if positional_arg else "--"
type = strtobool if type == bool else type
self._group.add_argument(
prefix + name, default=default, type=type, help=help + " Default: %(default)s.", **kwargs
)
def print_arguments(args):
log.info("----------- Configuration Arguments -----------")
for arg, value in sorted(six.iteritems(vars(args))):
log.info("%s: %s" % (arg, value))
log.info("------------------------------------------------")
def check_cuda(
use_cuda,
err="\nYou can not set use_cuda = True in the model because you are using paddlepaddle-cpu.\n \
Please: 1. Install paddlepaddle-gpu to run your models on GPU or 2. Set use_cuda = False to run models on CPU.\n",
):
try:
if use_cuda is True and fluid.is_compiled_with_cuda() is False:
log.error(err)
sys.exit(1)
except Exception:
pass
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import paddle.fluid as fluid
log = logging.getLogger(__name__)
def init_checkpoint(exe, init_checkpoint_path, main_program):
assert os.path.exists(init_checkpoint_path), "[%s] cannot be found." % init_checkpoint_path
def existed_persistables(var):
if not fluid.io.is_persistable(var):
return False
if not os.path.exists(os.path.join(init_checkpoint_path, var.name)):
print("Var not exists: [%s]\t%s" % (var.name, os.path.join(init_checkpoint_path, var.name)))
# else:
# print ("Var exists: [%s]" % (var.name))
return os.path.exists(os.path.join(init_checkpoint_path, var.name))
fluid.io.load_vars(exe, init_checkpoint_path, main_program=main_program, predicate=existed_persistables)
log.info("Load model from {}".format(init_checkpoint_path))
def init_pretraining_params(exe, pretraining_params_path, main_program):
assert os.path.exists(pretraining_params_path), "[%s] cannot be found." % pretraining_params_path
def existed_params(var):
if not isinstance(var, fluid.framework.Parameter):
return False
if not os.path.exists(os.path.join(pretraining_params_path, var.name)):
print("Var not exists: [%s]\t%s" % (var.name, os.path.join(pretraining_params_path, var.name)))
# else:
# print ("Var exists: [%s]" % (var.name))
return os.path.exists(os.path.join(pretraining_params_path, var.name))
fluid.io.load_vars(exe, pretraining_params_path, main_program=main_program, predicate=existed_params)
log.info("Load pretraining parameters from {}.".format(pretraining_params_path))
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
export CUDA_VISIBLE_DEVICES=0
QUESTION=$1
# Example QUESTION (Chinese): "NFC咋开门" ("How do I unlock the door with NFC?")
if [ $# != 1 ]; then
echo "USAGE: sh script/run_cross_encoder_test.sh \$QUESTION"
exit 1
fi
# compute scores for QUESTION against the OCR parsing results with the Rerank module
cd Rerank
bash run_test.sh ${QUESTION}
cd ..
# extract the answer for QUESTION from the top-1 ranked result
cd Extraction
bash run_test.sh ${QUESTION}
cd ..
[ERNIE-Layout](../../../model_zoo/ernie-layout)