"docs/source/en/model_doc/layoutlmv2.mdx" did not exist on "b5e2b183af5e40e33a4dc7659e697d137259d56e"
Commit 10f294ff authored by yuguo-Jack's avatar yuguo-Jack
Browse files

llama_paddle

parent 7c64e6ec
Pipeline #678 failed with stages
in 0 seconds
#!/bin/bash
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
export CUDA_VISIBLE_DEVICES=0
QUESTION=$1
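# Usage (as invoked by script/run_cross_encoder_test.sh below):
#   bash run_test.sh $QUESTION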
if [ ! -d output ]; then
mkdir output
fi
if [ ! -d log ]; then
mkdir log
fi
python3 change_to_rerank.py ${QUESTION}
python3 -u ./src/train_ce.py \
--use_cuda true \
--verbose true \
--do_train false \
--do_val false \
--do_test true \
--batch_size 128 \
--init_checkpoint "./checkpoints/ranker" \
--test_set "./data/demo.tsv" \
--test_save "data/demo.score" \
--max_seq_len 384 \
--for_cn true \
--vocab_path "config/ernie_base_1.0_CN/vocab.txt" \
--ernie_config_path "config/ernie_base_1.0_CN/ernie_config.json" \
1>>log/train.log 2>&1
#!/bin/bash
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
export CUDA_VISIBLE_DEVICES=0
if [ $# != 4 ]; then
echo "USAGE: sh run_train.sh \$TRAIN_SET \$MODEL_PATH \$epoch \$nodes_count"
exit 1
fi
TRAIN_SET=$1
MODEL_PATH=$2
epoch=$3
node=$4
CHECKPOINT_PATH=output
if [ ! -d output ]; then
mkdir output
fi
if [ ! -d log ]; then
mkdir log
fi
lr=1e-5
batch_size=32
train_examples=$(cat $TRAIN_SET | wc -l)
save_steps=$((train_examples / batch_size / node))
data_size=$((save_steps * batch_size * node))
new_save_steps=$((save_steps * epoch / 2))
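# Worked example (illustrative numbers): with 64000 training lines,
# batch_size=32 and node=1, save_steps = 64000/32/1 = 2000 steps per epoch,
# data_size = 2000*32*1 = 64000 examples, and with epoch=3 the checkpoint
# interval new_save_steps = 2000*3/2 = 3000 steps.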
python3 -m paddle.distributed.launch \
--log_dir log \
./src/train_ce.py \
--use_cuda true \
--verbose true \
--do_train true \
--do_val false \
--do_test false \
--use_mix_precision false \
--train_data_size ${data_size} \
--batch_size ${batch_size} \
--init_pretraining_params ${MODEL_PATH} \
--train_set ${TRAIN_SET} \
--save_steps ${new_save_steps} \
--validation_steps ${new_save_steps} \
--checkpoints ${CHECKPOINT_PATH} \
--weight_decay 0.01 \
--warmup_proportion 0.0 \
--epoch $epoch \
--max_seq_len 384 \
--for_cn true \
--vocab_path config/ernie_base_1.0_CN/vocab.txt \
--ernie_config_path config/ernie_base_1.0_CN/ernie_config.json \
--learning_rate ${lr} \
--skip_steps 10 \
--num_iteration_per_drop_scope 1 \
--num_labels 2 \
--random_seed 1
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mask, padding and batching."""
import numpy as np
def pad_batch_data(
insts,
pad_idx=0,
return_pos=False,
return_input_mask=False,
return_max_len=False,
return_num_token=False,
return_seq_lens=False,
):
"""
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and attention bias.
"""
return_list = []
max_len = max(len(inst) for inst in insts)
# Any token included in dict can be used to pad, since the paddings' loss
# will be masked out by weights and have no effect on parameter gradients.
inst_data = np.array([inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]
# position data
if return_pos:
inst_pos = np.array([list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst)) for inst in insts])
return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]
if return_input_mask:
# This is used to avoid attention on paddings.
input_mask_data = np.array([[1] * len(inst) + [0] * (max_len - len(inst)) for inst in insts])
input_mask_data = np.expand_dims(input_mask_data, axis=-1)
return_list += [input_mask_data.astype("float32")]
if return_max_len:
return_list += [max_len]
if return_num_token:
num_token = 0
for inst in insts:
num_token += len(inst)
return_list += [num_token]
if return_seq_lens:
seq_lens = np.array([len(inst) for inst in insts])
return_list += [seq_lens.astype("int64").reshape([-1, 1])]
return return_list if len(return_list) > 1 else return_list[0]
if __name__ == "__main__":
pass
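# Hedged usage sketch (added for illustration, not part of the original file):
def _demo_pad_batch_data():
    insts = [[5, 6, 7], [8, 9]]  # two token-id sequences of unequal length
    padded, pos, mask = pad_batch_data(insts, pad_idx=0, return_pos=True, return_input_mask=True)
    assert padded.shape == (2, 3, 1)  # ids padded to the batch max length
    assert pos.shape == (2, 3, 1)     # position ids, padded with pad_idx
    assert mask.shape == (2, 3, 1)    # 1.0 for real tokens, 0.0 for padding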
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model for classifier."""
import logging
import time
import numpy as np
import paddle.fluid as fluid
from model.ernie import ErnieModel
from scipy.stats import pearsonr, spearmanr
log = logging.getLogger(__name__)
def create_model(args, pyreader_name, ernie_config, is_prediction=False, task_name=""):
pyreader = fluid.layers.py_reader(
capacity=50,
shapes=[
[-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1],
[-1, args.max_seq_len, 1],
[-1, 1],
[-1, 1],
],
dtypes=["int64", "int64", "int64", "int64", "float32", "int64", "int64"],
lod_levels=[0, 0, 0, 0, 0, 0, 0],
name=task_name + "_" + pyreader_name,
use_double_buffer=True,
)
(src_ids, sent_ids, pos_ids, task_ids, input_mask, labels, qids) = fluid.layers.read_file(pyreader)
def _model(is_noise=False):
ernie = ErnieModel(
src_ids=src_ids,
position_ids=pos_ids,
sentence_ids=sent_ids,
task_ids=task_ids,
input_mask=input_mask,
config=ernie_config,
is_noise=is_noise,
)
cls_feats = ernie.get_pooled_output()
if not is_noise:
cls_feats = fluid.layers.dropout(x=cls_feats, dropout_prob=0.1, dropout_implementation="upscale_in_train")
logits = fluid.layers.fc(
input=cls_feats,
size=args.num_labels,
param_attr=fluid.ParamAttr(
name=task_name + "_cls_out_w", initializer=fluid.initializer.TruncatedNormal(scale=0.02)
),
bias_attr=fluid.ParamAttr(name=task_name + "_cls_out_b", initializer=fluid.initializer.Constant(0.0)),
)
"""
if is_prediction:
probs = fluid.layers.softmax(logits)
feed_targets_name = [
src_ids.name, sent_ids.name, pos_ids.name, input_mask.name
]
if ernie_version == "2.0":
feed_targets_name += [task_ids.name]
return pyreader, probs, feed_targets_name
"""
num_seqs = fluid.layers.create_tensor(dtype="int64")
# softmax cross-entropy loss (probabilities are also returned for metrics)
ce_loss, probs = fluid.layers.softmax_with_cross_entropy(logits=logits, label=labels, return_softmax=True)
loss = fluid.layers.mean(x=ce_loss)
accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs)
graph_vars = {
"loss": loss,
"probs": probs,
"accuracy": accuracy,
"labels": labels,
"num_seqs": num_seqs,
"qids": qids,
}
return graph_vars
if not is_prediction:
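# Adversarial training (FGM-style, an interpretation of the code below): run a
# clean forward pass, take the gradient of the loss w.r.t. the word-embedding
# table, add a small L2-normalized perturbation gp (epsilon guards against a
# zero norm), rebuild the graph on the perturbed embeddings, then subtract gp
# to restore the original embedding table.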
graph_vars = _model(is_noise=True)
old_loss = graph_vars["loss"]
token_emb = fluid.default_main_program().global_block().var("word_embedding")
token_emb.stop_gradient = False
token_gradient = fluid.gradients(old_loss, token_emb)[0]
token_gradient.stop_gradient = False
epsilon = 1e-8
norm = fluid.layers.sqrt(fluid.layers.reduce_sum(fluid.layers.square(token_gradient)) + epsilon)
gp = (0.01 * token_gradient) / norm
gp.stop_gradient = True
fluid.layers.assign(token_emb + gp, token_emb)
graph_vars = _model()
fluid.layers.assign(token_emb - gp, token_emb)
else:
graph_vars = _model()
return pyreader, graph_vars
def evaluate_mrr(preds):
last_qid = None
total_mrr = 0.0
qnum = 0.0
rank = 0.0
correct = False
for qid, score, label in preds:
if qid != last_qid:
rank = 0.0
qnum += 1
correct = False
last_qid = qid
rank += 1
if not correct and label != 0:
total_mrr += 1.0 / rank
correct = True
return total_mrr / qnum
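# Hedged usage sketch (added for illustration, not in the original file):
# evaluate_mrr expects (qid, score, label) triples grouped by qid and sorted
# by descending score, as produced in the "acc_and_f1_and_mrr" branch below.
def _demo_evaluate_mrr():
    # q1: first relevant item at rank 2 -> 1/2; q2: relevant at rank 1 -> 1/1
    preds = [("q1", 0.9, 0), ("q1", 0.8, 1), ("q2", 0.7, 1)]
    assert abs(evaluate_mrr(preds) - 0.75) < 1e-9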
def evaluate(
exe, test_program, test_pyreader, graph_vars, eval_phase, use_multi_gpu_test=False, metric="simple_accuracy"
):
train_fetch_list = [graph_vars["loss"].name, graph_vars["accuracy"].name, graph_vars["num_seqs"].name]
if eval_phase == "train":
if "learning_rate" in graph_vars:
train_fetch_list.append(graph_vars["learning_rate"].name)
outputs = exe.run(fetch_list=train_fetch_list, program=test_program)
ret = {"loss": np.mean(outputs[0]), "accuracy": np.mean(outputs[1])}
if "learning_rate" in graph_vars:
ret["learning_rate"] = float(outputs[3][0])
return ret
test_pyreader.start()
total_cost = 0.0
total_acc = 0.0
total_num_seqs = 0.0
total_label_pos_num = 0.0
total_pred_pos_num = 0.0
total_correct_num = 0.0
qids, labels, scores, preds = [], [], [], []
time_begin = time.time()
fetch_list = [
graph_vars["loss"].name,
graph_vars["accuracy"].name,
graph_vars["probs"].name,
graph_vars["labels"].name,
graph_vars["num_seqs"].name,
graph_vars["qids"].name,
]
while True:
try:
if use_multi_gpu_test:
np_loss, np_acc, np_probs, np_labels, np_num_seqs, np_qids = exe.run(fetch_list=fetch_list)
else:
np_loss, np_acc, np_probs, np_labels, np_num_seqs, np_qids = exe.run(
program=test_program, fetch_list=fetch_list
)
total_cost += np.sum(np_loss * np_num_seqs)
total_acc += np.sum(np_acc * np_num_seqs)
total_num_seqs += np.sum(np_num_seqs)
labels.extend(np_labels.reshape((-1)).tolist())
if np_qids is None:
np_qids = np.array([])
qids.extend(np_qids.reshape(-1).tolist())
scores.extend(np_probs[:, 1].reshape(-1).tolist())
np_preds = np.argmax(np_probs, axis=1).astype(np.float32)
preds.extend(np_preds)
total_label_pos_num += np.sum(np_labels)
total_pred_pos_num += np.sum(np_preds)
total_correct_num += np.sum(np.dot(np_preds, np_labels))
except fluid.core.EOFException:
test_pyreader.reset()
break
time_end = time.time()
cost = total_cost / total_num_seqs
elapsed_time = time_end - time_begin
evaluate_info = ""
if metric == "acc_and_f1":
ret = acc_and_f1(preds, labels)
evaluate_info = "[%s evaluation] ave loss: %f, ave_acc: %f, f1: %f, data_num: %d, elapsed time: %f s" % (
eval_phase,
cost,
ret["acc"],
ret["f1"],
total_num_seqs,
elapsed_time,
)
elif metric == "matthews_corrcoef":
ret = matthews_corrcoef(preds, labels)
evaluate_info = "[%s evaluation] ave loss: %f, matthews_corrcoef: %f, data_num: %d, elapsed time: %f s" % (
eval_phase,
cost,
ret,
total_num_seqs,
elapsed_time,
)
elif metric == "pearson_and_spearman":
ret = pearson_and_spearman(scores, labels)
evaluate_info = (
"[%s evaluation] ave loss: %f, pearson:%f, spearman:%f, corr:%f, data_num: %d, elapsed time: %f s"
% (eval_phase, cost, ret["pearson"], ret["spearman"], ret["corr"], total_num_seqs, elapsed_time)
)
elif metric == "simple_accuracy":
ret = simple_accuracy(preds, labels)
evaluate_info = "[%s evaluation] ave loss: %f, acc:%f, data_num: %d, elapsed time: %f s" % (
eval_phase,
cost,
ret,
total_num_seqs,
elapsed_time,
)
elif metric == "acc_and_f1_and_mrr":
ret_a = acc_and_f1(preds, labels)
preds = sorted(zip(qids, scores, labels), key=lambda elem: (elem[0], -elem[1]))
ret_b = evaluate_mrr(preds)
evaluate_info = "[%s evaluation] ave loss: %f, acc: %f, f1: %f, mrr: %f, data_num: %d, elapsed time: %f s" % (
eval_phase,
cost,
ret_a["acc"],
ret_a["f1"],
ret_b,
total_num_seqs,
elapsed_time,
)
else:
raise ValueError("unsupported metric {}".format(metric))
return evaluate_info
def matthews_corrcoef(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
tp = np.sum((labels == 1) & (preds == 1))
tn = np.sum((labels == 0) & (preds == 0))
fp = np.sum((labels == 0) & (preds == 1))
fn = np.sum((labels == 1) & (preds == 0))
mcc = ((tp * tn) - (fp * fn)) / np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
return mcc
def f1_score(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
tp = np.sum((labels == 1) & (preds == 1))
fp = np.sum((labels == 0) & (preds == 1))
fn = np.sum((labels == 1) & (preds == 0))
p = tp / (tp + fp)
r = tp / (tp + fn)
f1 = (2 * p * r) / (p + r + 1e-8)
return f1
def pearson_and_spearman(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
pearson_corr = pearsonr(preds, labels)[0]
spearman_corr = spearmanr(preds, labels)[0]
return {
"pearson": pearson_corr,
"spearmanr": spearman_corr,
"corr": (pearson_corr + spearman_corr) / 2,
}
def acc_and_f1(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
acc = simple_accuracy(preds, labels)
f1 = f1_score(preds, labels)
return {
"acc": acc,
"f1": f1,
"acc_and_f1": (acc + f1) / 2,
}
def simple_accuracy(preds, labels):
preds = np.array(preds)
labels = np.array(labels)
return (preds == labels).mean()
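# Hedged usage sketch (illustrative only): the metric helpers above on a toy
# batch; tp=2, fp=1, fn=0 gives precision 2/3 and recall 1, so f1 ~= 0.8.
def _demo_metrics():
    preds = [1, 0, 1, 1]
    labels = [1, 0, 0, 1]
    assert simple_accuracy(preds, labels) == 0.75
    ret = acc_and_f1(preds, labels)
    assert abs(ret["f1"] - 0.8) < 1e-6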
def predict(exe, test_program, test_pyreader, graph_vars, dev_count=1):
test_pyreader.start()
qids, probs = [], []
preds = []
fetch_list = [graph_vars["probs"].name, graph_vars["qids"].name]
while True:
try:
if dev_count == 1:
np_probs, np_qids = exe.run(program=test_program, fetch_list=fetch_list)
else:
np_probs, np_qids = exe.run(fetch_list=fetch_list)
if np_qids is None:
np_qids = np.array([])
qids.extend(np_qids.reshape(-1).tolist())
np_preds = np.argmax(np_probs, axis=1).astype(np.float32)
preds.extend(np_preds)
probs.append(np_probs)
except fluid.core.EOFException:
test_pyreader.reset()
break
probs = np.concatenate(probs, axis=0).reshape([len(preds), -1])
return qids, preds, probs
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from src.utils.args import ArgumentGroup
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("ernie_config_path", str, None, "Path to the json file for ernie model config.")
model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
model_g.add_arg("init_pretraining_params", str, None, "Init pre-training params which preforms fine-tuning from. If the arg 'init_checkpoint' has been set, this argument wouldn't be valid.")
model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
model_g.add_arg("is_classify", bool, True, "is_classify")
model_g.add_arg("is_regression", bool, False, "is_regression")
model_g.add_arg("task_id", int, 0, "task id")
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
train_g.add_arg("lr_scheduler", str, "linear_warmup_decay", "scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
train_g.add_arg("warmup_proportion", float, 0.1, "Proportion of training steps to perform linear learning rate warmup for.")
train_g.add_arg("save_steps", int, 10000, "The steps interval to save checkpoints.")
train_g.add_arg("validation_steps", int, 1000, "The steps interval to evaluate model performance.")
train_g.add_arg("use_recompute", bool, False, "Whether to use recompute optimizer for training.")
train_g.add_arg("use_mix_precision", bool, False, "Whether to use mix-precision optimizer for training.")
train_g.add_arg("use_cross_batch", bool, False, "Whether to use cross-batch for training.")
train_g.add_arg("use_lamb", bool, False, "Whether to use LambOptimizer for training.")
train_g.add_arg("use_dynamic_loss_scaling", bool, True, "Whether to use dynamic loss scaling.")
train_g.add_arg("test_save", str, "./checkpoints/test_result", "test_save")
train_g.add_arg("metric", str, "simple_accuracy", "metric")
train_g.add_arg("incr_every_n_steps", int, 100, "Increases loss scaling every n consecutive.")
train_g.add_arg("decr_every_n_nan_or_inf", int, 2, "Decreases loss scaling every n accumulated steps with nan or inf gradients.")
train_g.add_arg("incr_ratio", float, 2.0, "The multiplier to use when increasing the loss scaling.")
train_g.add_arg("decr_ratio", float, 0.8, "The less-than-one-multiplier to use when decreasing.")
log_g = ArgumentGroup(parser, "logging", "logging related.")
log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")
data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
data_g.add_arg("tokenizer", str, "FullTokenizer", "ATTENTION: the INPUT must be splited by Word with blank while using SentencepieceTokenizer or WordsegTokenizer")
data_g.add_arg("train_set", str, None, "Path to training data.")
data_g.add_arg("test_set", str, None, "Path to test data.")
data_g.add_arg("dev_set", str, None, "Path to validation data.")
data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
data_g.add_arg("q_max_seq_len", int, 32, "Number of words of the longest seqence.")
data_g.add_arg("p_max_seq_len", int, 256, "Number of words of the longest seqence.")
data_g.add_arg("train_data_size", int, 0, "Number of training data's total examples. Set for distribute.")
data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. see also --in_tokens.")
data_g.add_arg("predict_batch_size", int, None, "Total examples' number in batch for predict. see also --in_tokens.")
data_g.add_arg("in_tokens", bool, False, "If set, the batch size will be the maximum number of tokens in one batch. Otherwise, it will be the maximum number of examples in one batch.")
data_g.add_arg("do_lower_case", bool, True, "Whether to lower case the input text. Should be True for uncased models and False for cased models.")
data_g.add_arg("random_seed", int, None, "Random seed.")
data_g.add_arg("label_map_config", str, None, "label_map_path.")
data_g.add_arg("num_labels", int, 2, "label number")
data_g.add_arg("diagnostic", str, None, "GLUE Diagnostic Dataset")
data_g.add_arg("diagnostic_save", str, None, "GLUE Diagnostic save f")
data_g.add_arg("max_query_length", int, 64, "Max query length.")
data_g.add_arg("max_answer_length", int, 100, "Max answer length.")
data_g.add_arg("doc_stride", int, 128, "When splitting up a long document into chunks, how much stride to take between chunks.")
data_g.add_arg("n_best_size", int, 20, "The total number of n-best predictions to generate in the nbest_predictions.json output file.")
data_g.add_arg("chunk_scheme", type=str, default="IOB", choices=["IO", "IOB", "IOE", "IOBES"], help="chunk scheme")
run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
run_type_g.add_arg("is_distributed", bool, False, "If set, then start distributed training.")
run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (in experiment).")
run_type_g.add_arg("num_iteration_per_drop_scope", int, 10, "Iteration intervals to drop scope.")
run_type_g.add_arg("do_train", bool, True, "Whether to perform training.")
run_type_g.add_arg("do_val", bool, True, "Whether to perform evaluation on dev data set.")
run_type_g.add_arg("do_test", bool, True, "Whether to perform evaluation on test data set.")
run_type_g.add_arg("output_item", int, 3, "Test output format.")
run_type_g.add_arg("output_file_name", str, None, "Test output file name")
run_type_g.add_arg("test_data_cnt", int, 1110000 , "total cnt of testset")
run_type_g.add_arg("use_multi_gpu_test", bool, False, "Whether to perform evaluation using multiple gpu cards")
run_type_g.add_arg("metrics", bool, True, "Whether to perform evaluation on test data set.")
run_type_g.add_arg("shuffle", bool, True, "")
run_type_g.add_arg("for_cn", bool, False, "model train for cn or for other langs.")
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import faiss
import numpy as np
def read_embed(file_name, dim=768, bs=3000):
if file_name.endswith("npy"):
i = 0
emb_np = np.load(file_name)
while i < len(emb_np):
vec_list = emb_np[i : i + bs]
i += bs
yield vec_list
else:
vec_list = []
with open(file_name) as inp:
for line in inp:
data = line.strip()
vector = [float(item) for item in data.split(" ")]
assert len(vector) == dim
vec_list.append(vector)
if len(vec_list) == bs:
yield vec_list
vec_list = []
if vec_list:
yield vec_list
def load_qid(file_name):
qid_list = []
with open(file_name) as inp:
for line in inp:
line = line.strip()
qid = line.split("\t")[0]
qid_list.append(qid)
return qid_list
def search(index, emb_file, qid_list, outfile, top_k):
q_idx = 0
with open(outfile, "w") as out:
for batch_vec in read_embed(emb_file):
q_emb_matrix = np.array(batch_vec)
res_dist, res_p_id = index.search(q_emb_matrix.astype("float32"), top_k)
for i in range(len(q_emb_matrix)):
qid = qid_list[q_idx]
for j in range(top_k):
pid = res_p_id[i][j]
score = res_dist[i][j]
out.write("%s\t%s\t%s\t%s\n" % (qid, pid, j + 1, score))
q_idx += 1
def main():
part = sys.argv[1]
topk = int(sys.argv[2])
q_text_file = sys.argv[3]
outfile = "output/res.top%s-part%s" % (topk, part)
qid_list = load_qid(q_text_file)
engine = faiss.read_index("output/para.index.part%s" % part)
emb_file = "output/query.emb.npy"
search(engine, emb_file, qid_list, outfile, topk)
if __name__ == "__main__":
main()
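# Hedged sketch (illustrative, not part of the pipeline): the index file read
# above could be built from a passage-embedding matrix roughly like this; an
# exact inner-product index is assumed, the real index type may differ.
def _demo_build_index(dim=768):
    p_emb = np.random.rand(1000, dim).astype("float32")  # toy passage embeddings
    index = faiss.IndexFlatIP(dim)  # exact inner-product search over the shard
    index.add(p_emb)
    dist, pid = index.search(p_emb[:2], 5)  # top-5 passage ids per query
    assert pid.shape == (2, 5)
    return index  # a real pipeline would persist it with faiss.write_index(index, path)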
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
shift = int(sys.argv[1])
top = int(sys.argv[2])
total_part = int(sys.argv[3])
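# This script merges per-part top-k retrieval results: each part indexed a
# shard of the passage collection, so a local passage id from part i is
# offset by shift * i to recover the global id before re-sorting by score.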
f_list = []
for part in range(total_part):
f0 = open("output/res.top%s-part%s" % (top, part))
f_list.append(f0)
line_list = []
for part in range(total_part):
line = f_list[part].readline()
line_list.append(line)
out = open("output/dev.res.top%s" % top, "w")
last_q = ""
ans_list = {}
while line_list[-1]:
cur_list = []
for line in line_list:
sub = line.strip().split("\t")
cur_list.append(sub)
if last_q == "":
last_q = cur_list[0][0]
if cur_list[0][0] != last_q:
rank = sorted(ans_list.items(), key=lambda a: a[1], reverse=True)
for i in range(top):
out.write("%s\t%s\t%s\t%s\n" % (last_q, rank[i][0], i + 1, rank[i][1]))
ans_list = {}
for i, sub in enumerate(cur_list):
ans_list[int(sub[1]) + shift * i] = float(sub[-1])
last_q = cur_list[0][0]
line_list = []
for f0 in f_list:
line = f0.readline()
line_list.append(line)
rank = sorted(ans_list.items(), key=lambda a: a[1], reverse=True)
for i in range(top):
out.write("%s\t%s\t%s\t%s\n" % (last_q, rank[i][0], i + 1, rank[i][1]))
out.close()
print("output/dev.res.top%s" % top)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Ernie model."""
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import logging
from io import open
import paddle
import paddle.fluid as fluid
import six
from model.transformer_encoder import encoder, pre_process_layer
log = logging.getLogger(__name__)
class ErnieConfig(object):
def __init__(self, config_path):
self._config_dict = self._parse(config_path)
def _parse(self, config_path):
try:
with open(config_path, "r", encoding="utf8") as json_file:
config_dict = json.load(json_file)
except Exception:
raise IOError("Error in parsing Ernie model config file '%s'" % config_path)
else:
return config_dict
def __getitem__(self, key):
return self._config_dict.get(key, None)
def print_config(self):
for arg, value in sorted(six.iteritems(self._config_dict)):
log.info("%s: %s" % (arg, value))
log.info("------------------------------------------------")
class ErnieModel(object):
def __init__(
self,
src_ids,
position_ids,
sentence_ids,
task_ids,
input_mask,
config,
weight_sharing=True,
model_name="",
is_noise=False,
):
self._emb_size = config["hidden_size"]
self._n_layer = config["num_hidden_layers"]
self._n_head = config["num_attention_heads"]
self._voc_size = config["vocab_size"]
self._max_position_seq_len = config["max_position_embeddings"]
if config["sent_type_vocab_size"]:
self._sent_types = config["sent_type_vocab_size"]
else:
self._sent_types = config["type_vocab_size"]
self._use_task_id = config["use_task_id"]
if self._use_task_id:
self._task_types = config["task_type_vocab_size"]
self._hidden_act = config["hidden_act"]
self._prepostprocess_dropout = config["hidden_dropout_prob"]
self._attention_dropout = config["attention_probs_dropout_prob"]
if is_noise:
self._prepostprocess_dropout = 0
self._attention_dropout = 0
self._weight_sharing = weight_sharing
self.checkpoints = []
self._word_emb_name = "word_embedding"
self._pos_emb_name = "pos_embedding"
self._sent_emb_name = "sent_embedding"
self._task_emb_name = "task_embedding"
self._emb_dtype = "float32"
# Initialize all weights by truncated normal initializer, and all biases
# will be initialized by constant zero by default.
self._param_initializer = fluid.initializer.TruncatedNormal(scale=config["initializer_range"])
self._build_model(model_name, src_ids, position_ids, sentence_ids, task_ids, input_mask)
def _build_model(self, model_name, src_ids, position_ids, sentence_ids, task_ids, input_mask):
# padding id in vocabulary must be set to 0
emb_out = fluid.layers.embedding(
input=src_ids,
size=[self._voc_size, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(name=model_name + self._word_emb_name, initializer=self._param_initializer),
is_sparse=False,
)
position_emb_out = fluid.layers.embedding(
input=position_ids,
size=[self._max_position_seq_len, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(name=model_name + self._pos_emb_name, initializer=self._param_initializer),
)
sent_emb_out = fluid.layers.embedding(
sentence_ids,
size=[self._sent_types, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(name=model_name + self._sent_emb_name, initializer=self._param_initializer),
)
emb_out = emb_out + position_emb_out
emb_out = emb_out + sent_emb_out
if self._use_task_id:
task_emb_out = fluid.layers.embedding(
task_ids,
size=[self._task_types, self._emb_size],
dtype=self._emb_dtype,
param_attr=fluid.ParamAttr(name=model_name + self._task_emb_name, initializer=self._param_initializer),
)
emb_out = emb_out + task_emb_out
emb_out = pre_process_layer(emb_out, "nd", self._prepostprocess_dropout, name=model_name + "pre_encoder")
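# Build the additive attention bias: the outer product of the [batch, len, 1]
# input mask with itself is 1 where both positions are real tokens; the scale
# op computes 10000 * (x - 1), mapping real pairs to 0 and padded pairs to
# -10000, which softmax treats as effectively -inf.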
self_attn_mask = paddle.matmul(x=input_mask, y=input_mask, transpose_y=True)
self_attn_mask = fluid.layers.scale(x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
n_head_self_attn_mask = fluid.layers.stack(x=[self_attn_mask] * self._n_head, axis=1)
n_head_self_attn_mask.stop_gradient = True
self._enc_out, self.checkpoints = encoder(
enc_input=emb_out,
attn_bias=n_head_self_attn_mask,
n_layer=self._n_layer,
n_head=self._n_head,
d_key=self._emb_size // self._n_head,
d_value=self._emb_size // self._n_head,
d_model=self._emb_size,
d_inner_hid=self._emb_size * 4,
prepostprocess_dropout=self._prepostprocess_dropout,
attention_dropout=self._attention_dropout,
relu_dropout=0,
hidden_act=self._hidden_act,
preprocess_cmd="",
postprocess_cmd="dan",
param_initializer=self._param_initializer,
model_name=model_name,
name=model_name + "encoder",
)
def get_sequence_output(self):
return self._enc_out
def get_cls_output(self):
"""Get the first feature of each sequence for classification"""
cls_output = fluid.layers.slice(input=self._enc_out, axes=[1], starts=[0], ends=[1])
cls_output = fluid.layers.squeeze(cls_output, axes=[1])
return cls_output
def get_pooled_output(self):
"""Get the first feature of each sequence for classification"""
next_sent_feat = fluid.layers.slice(input=self._enc_out, axes=[1], starts=[0], ends=[1])
next_sent_feat = fluid.layers.fc(
input=next_sent_feat,
size=self._emb_size,
act="tanh",
param_attr=fluid.ParamAttr(name="pooled_fc.w_0", initializer=self._param_initializer),
bias_attr="pooled_fc.b_0",
)
return next_sent_feat
def get_lm_output(self, mask_label, mask_pos):
"""Get the loss & accuracy for pretraining"""
mask_pos = fluid.layers.cast(x=mask_pos, dtype="int32")
# extract the first token feature in each sentence
self.next_sent_feat = self.get_pooled_output()
reshaped_emb_out = fluid.layers.reshape(x=self._enc_out, shape=[-1, self._emb_size])
# extract masked tokens' feature
mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)
# transform: fc
mask_trans_feat = fluid.layers.fc(
input=mask_feat,
size=self._emb_size,
act=self._hidden_act,
param_attr=fluid.ParamAttr(name="mask_lm_trans_fc.w_0", initializer=self._param_initializer),
bias_attr=fluid.ParamAttr(name="mask_lm_trans_fc.b_0"),
)
# transform: layer norm
mask_trans_feat = fluid.layers.layer_norm(
mask_trans_feat,
begin_norm_axis=len(mask_trans_feat.shape) - 1,
param_attr=fluid.ParamAttr(
name="mask_lm_trans_layer_norm_scale", initializer=fluid.initializer.Constant(1.0)
),
bias_attr=fluid.ParamAttr(
name="mask_lm_trans_layer_norm_bias", initializer=fluid.initializer.Constant(1.0)
),
)
# transform: layer norm
# mask_trans_feat = pre_process_layer(
# mask_trans_feat, 'n', name='mask_lm_trans')
mask_lm_out_bias_attr = fluid.ParamAttr(
name="mask_lm_out_fc.b_0", initializer=fluid.initializer.Constant(value=0.0)
)
if self._weight_sharing:
fc_out = paddle.matmul(
x=mask_trans_feat,
y=fluid.default_main_program().global_block().var(self._word_emb_name),
transpose_y=True,
)
fc_out += fluid.layers.create_parameter(
shape=[self._voc_size], dtype=self._emb_dtype, attr=mask_lm_out_bias_attr, is_bias=True
)
else:
fc_out = fluid.layers.fc(
input=mask_trans_feat,
size=self._voc_size,
param_attr=fluid.ParamAttr(name="mask_lm_out_fc.w_0", initializer=self._param_initializer),
bias_attr=mask_lm_out_bias_attr,
)
mask_lm_loss = fluid.layers.softmax_with_cross_entropy(logits=fc_out, label=mask_label)
mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)
return mean_mask_lm_loss
def get_task_output(self, task, task_labels):
task_fc_out = fluid.layers.fc(
input=self.next_sent_feat,
size=task["num_labels"],
param_attr=fluid.ParamAttr(name=task["task_name"] + "_fc.w_0", initializer=self._param_initializer),
bias_attr=task["task_name"] + "_fc.b_0",
)
task_loss, task_softmax = fluid.layers.softmax_with_cross_entropy(
logits=task_fc_out, label=task_labels, return_softmax=True
)
task_acc = fluid.layers.accuracy(input=task_softmax, label=task_labels)
mean_task_loss = fluid.layers.mean(task_loss)
return mean_task_loss, task_acc
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Transformer encoder."""
from __future__ import absolute_import, division, print_function
from functools import partial
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
def multi_head_attention(
queries,
keys,
values,
attn_bias,
d_key,
d_value,
d_model,
n_head=1,
dropout_rate=0.0,
cache=None,
param_initializer=None,
name="multi_head_att",
):
"""
Multi-Head Attention. Note that attn_bias is added to the logit before
computing softmax activation to mask certain selected positions so that
they will not be considered in the attention weights.
"""
keys = queries if keys is None else keys
values = keys if values is None else values
if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
raise ValueError("Inputs: queries, keys and values should all be 3-D tensors.")
def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
"""
Add linear projection to queries, keys, and values.
"""
q = layers.fc(
input=queries,
size=d_key * n_head,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(name=name + "_query_fc.w_0", initializer=param_initializer),
bias_attr=name + "_query_fc.b_0",
)
k = layers.fc(
input=keys,
size=d_key * n_head,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(name=name + "_key_fc.w_0", initializer=param_initializer),
bias_attr=name + "_key_fc.b_0",
)
v = layers.fc(
input=values,
size=d_value * n_head,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(name=name + "_value_fc.w_0", initializer=param_initializer),
bias_attr=name + "_value_fc.b_0",
)
return q, k, v
def __split_heads(x, n_head):
"""
Reshape the last dimension of input tensor x so that it becomes two
dimensions and then transpose. Specifically, input a tensor with shape
[bs, max_sequence_length, n_head * hidden_dim] then output a tensor
with shape [bs, n_head, max_sequence_length, hidden_dim].
"""
hidden_size = x.shape[-1]
# The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
reshaped = layers.reshape(x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True)
# permute the dimensions into:
# [batch_size, n_head, max_sequence_len, hidden_size_per_head]
return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
def __combine_heads(x):
"""
Transpose and then reshape the last two dimensions of input tensor x
so that it becomes one dimension, which is reverse to __split_heads.
"""
if len(x.shape) == 3:
return x
if len(x.shape) != 4:
raise ValueError("Input(x) should be a 4-D Tensor.")
trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
# The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
return layers.reshape(x=trans_x, shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], inplace=True)
def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
"""
Scaled Dot-Product Attention
"""
scaled_q = layers.scale(x=q, scale=d_key**-0.5)
product = paddle.matmul(x=scaled_q, y=k, transpose_y=True)
if attn_bias:
product += attn_bias
weights = layers.softmax(product)
if dropout_rate:
weights = layers.dropout(
weights, dropout_prob=dropout_rate, dropout_implementation="upscale_in_train", is_test=False
)
out = paddle.matmul(weights, v)
return out
q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
if cache is not None: # use cache and concat time steps
# Since the inplace reshape in __split_heads changes the shape of k and
# v, which is the cache input for next time step, reshape the cache
# input from the previous time step first.
k = cache["k"] = layers.concat([layers.reshape(cache["k"], shape=[0, 0, d_model]), k], axis=1)
v = cache["v"] = layers.concat([layers.reshape(cache["v"], shape=[0, 0, d_model]), v], axis=1)
q = __split_heads(q, n_head)
k = __split_heads(k, n_head)
v = __split_heads(v, n_head)
ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate)
out = __combine_heads(ctx_multiheads)
# Project back to the model size.
proj_out = layers.fc(
input=out,
size=d_model,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(name=name + "_output_fc.w_0", initializer=param_initializer),
bias_attr=name + "_output_fc.b_0",
)
return proj_out
def positionwise_feed_forward(x, d_inner_hid, d_hid, dropout_rate, hidden_act, param_initializer=None, name="ffn"):
"""
Position-wise Feed-Forward Networks.
This module consists of two linear transformations with a ReLU activation
in between, which is applied to each position separately and identically.
"""
hidden = layers.fc(
input=x,
size=d_inner_hid,
num_flatten_dims=2,
act=hidden_act,
param_attr=fluid.ParamAttr(name=name + "_fc_0.w_0", initializer=param_initializer),
bias_attr=name + "_fc_0.b_0",
)
if dropout_rate:
hidden = layers.dropout(
hidden, dropout_prob=dropout_rate, dropout_implementation="upscale_in_train", is_test=False
)
out = layers.fc(
input=hidden,
size=d_hid,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(name=name + "_fc_1.w_0", initializer=param_initializer),
bias_attr=name + "_fc_1.b_0",
)
return out
def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.0, name=""):
"""
Add residual connection, layer normalization and dropout to the out tensor
optionally according to the value of process_cmd.
This will be used before or after multi-head attention and position-wise
feed-forward networks.
"""
for cmd in process_cmd:
if cmd == "a": # add residual connection
out = out + prev_out if prev_out is not None else out
elif cmd == "n": # add layer normalization
out_dtype = out.dtype
if out_dtype == fluid.core.VarDesc.VarType.FP16:
out = layers.cast(x=out, dtype="float32")
out = layers.layer_norm(
out,
begin_norm_axis=len(out.shape) - 1,
param_attr=fluid.ParamAttr(
name=name + "_layer_norm_scale", initializer=fluid.initializer.Constant(1.0)
),
bias_attr=fluid.ParamAttr(name=name + "_layer_norm_bias", initializer=fluid.initializer.Constant(0.0)),
)
if out_dtype == fluid.core.VarDesc.VarType.FP16:
out = layers.cast(x=out, dtype="float16")
elif cmd == "d": # add dropout
if dropout_rate:
out = layers.dropout(
out, dropout_prob=dropout_rate, dropout_implementation="upscale_in_train", is_test=False
)
return out
pre_process_layer = partial(pre_post_process_layer, None)
post_process_layer = pre_post_process_layer
def encoder_layer(
enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd="n",
postprocess_cmd="da",
param_initializer=None,
name="",
):
"""The encoder layers that can be stacked to form a deep encoder.
This module consists of a multi-head (self) attention followed by
position-wise feed-forward networks; both components are wrapped by
post_process_layer to add the residual connection, layer normalization
and dropout.
"""
attn_output = multi_head_attention(
pre_process_layer(enc_input, preprocess_cmd, prepostprocess_dropout, name=name + "_pre_att"),
None,
None,
attn_bias,
d_key,
d_value,
d_model,
n_head,
attention_dropout,
param_initializer=param_initializer,
name=name + "_multi_head_att",
)
attn_output = post_process_layer(
enc_input, attn_output, postprocess_cmd, prepostprocess_dropout, name=name + "_post_att"
)
ffd_output = positionwise_feed_forward(
pre_process_layer(attn_output, preprocess_cmd, prepostprocess_dropout, name=name + "_pre_ffn"),
d_inner_hid,
d_model,
relu_dropout,
hidden_act,
param_initializer=param_initializer,
name=name + "_ffn",
)
return (
post_process_layer(attn_output, ffd_output, postprocess_cmd, prepostprocess_dropout, name=name + "_post_ffn"),
ffd_output,
)
def encoder(
enc_input,
attn_bias,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd="n",
postprocess_cmd="da",
param_initializer=None,
model_name="",
name="",
):
"""
The encoder is composed of a stack of identical layers returned by calling
encoder_layer.
"""
checkpoints = []
for i in range(n_layer):
enc_output, cp = encoder_layer(
enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
hidden_act,
preprocess_cmd,
postprocess_cmd,
param_initializer=param_initializer,
name=name + "_layer_" + str(i),
)
checkpoints.append(cp)
enc_input = enc_output
enc_output = pre_process_layer(
enc_output, preprocess_cmd, prepostprocess_dropout, name=model_name + "post_encoder"
)
return enc_output, checkpoints
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimization and learning rate scheduling."""
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.collective import fleet
def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
"""Applies linear warmup of learning rate from 0 and decay to 0."""
with fluid.default_main_program()._lr_schedule_guard():
lr = fluid.layers.tensor.create_global_var(
shape=[1], value=0.0, dtype="float32", persistable=True, name="scheduled_learning_rate"
)
global_step = fluid.layers.learning_rate_scheduler._decay_step_counter()
with fluid.layers.control_flow.Switch() as switch:
with switch.case(global_step < warmup_steps):
warmup_lr = learning_rate * (global_step / warmup_steps)
fluid.layers.tensor.assign(warmup_lr, lr)
with switch.default():
decayed_lr = fluid.layers.learning_rate_scheduler.polynomial_decay(
learning_rate=learning_rate,
decay_steps=num_train_steps,
end_learning_rate=0.0,
power=1.0,
cycle=False,
)
fluid.layers.tensor.assign(decayed_lr, lr)
return lr
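# Worked example (illustrative numbers): with learning_rate=1e-5 and
# warmup_steps=100, the lr at step 50 is 50/100 * 1e-5 = 5e-6; after step 100
# the decayed branch follows learning_rate * (1 - step/num_train_steps),
# i.e. a linear (power=1.0) decay reaching 0 at num_train_steps.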
def optimization(
loss,
warmup_steps,
num_train_steps,
learning_rate,
train_program,
startup_prog,
weight_decay,
scheduler="linear_warmup_decay",
use_dynamic_loss_scaling=False,
incr_every_n_steps=1000,
decr_every_n_nan_or_inf=2,
incr_ratio=2.0,
decr_ratio=0.8,
dist_strategy=None,
use_lamb=False,
):
if warmup_steps > 0:
if scheduler == "noam_decay":
scheduled_lr = fluid.layers.learning_rate_scheduler.noam_decay(
1 / (warmup_steps * (learning_rate**2)), warmup_steps
)
elif scheduler == "linear_warmup_decay":
scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps, num_train_steps)
else:
raise ValueError("Unknown learning rate scheduler, should be " "'noam_decay' or 'linear_warmup_decay'")
if use_lamb:
optimizer = fluid.optimizer.LambOptimizer(learning_rate=scheduled_lr)
else:
optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
else:
scheduled_lr = fluid.layers.create_global_var(
name=fluid.unique_name.generate("learning_rate"),
shape=[1],
value=learning_rate,
dtype="float32",
persistable=True,
)
if use_lamb:
optimizer = fluid.optimizer.LambOptimizer(learning_rate=scheduled_lr)
else:
optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
optimizer._learning_rate_map[fluid.default_main_program()] = scheduled_lr
fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
def exclude_from_weight_decay(name):
if name.find("layer_norm") > -1:
return True
bias_suffix = ["_bias", "_b", ".b_0"]
for suffix in bias_suffix:
if name.endswith(suffix):
return True
return False
param_list = dict()
for param in train_program.global_block().all_parameters():
param_list[param.name] = param * 1.0
param_list[param.name].stop_gradient = True
if dist_strategy is not None:
# use fleet api
optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
_, param_grads = optimizer.minimize(loss)
if weight_decay > 0:
for param, grad in param_grads:
if exclude_from_weight_decay(param.name):
continue
with param.block.program._optimized_guard([param, grad]), fluid.framework.name_scope("weight_decay"):
updated_param = param - param_list[param.name] * weight_decay * scheduled_lr
fluid.layers.assign(output=param, input=updated_param)
return scheduled_lr
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Arguments for configuration."""
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import os
import sys
import paddle.fluid as fluid
import six
from paddlenlp.trainer.argparser import strtobool
log = logging.getLogger(__name__)
def prepare_logger(logger, debug=False, save_to_file=None):
formatter = logging.Formatter(fmt="[%(levelname)s] %(asctime)s [%(filename)12s:%(lineno)5d]:\t%(message)s")
console_hdl = logging.StreamHandler()
console_hdl.setFormatter(formatter)
logger.addHandler(console_hdl)
if save_to_file is not None and not os.path.exists(save_to_file):
file_hdl = logging.FileHandler(save_to_file)
file_hdl.setFormatter(formatter)
logger.addHandler(file_hdl)
logger.setLevel(logging.DEBUG)
logger.propagate = False
class ArgumentGroup(object):
def __init__(self, parser, title, des):
self._group = parser.add_argument_group(title=title, description=des)
def add_arg(self, name, type, default, help, positional_arg=False, **kwargs):
prefix = "" if positional_arg else "--"
type = strtobool if type == bool else type
self._group.add_argument(
prefix + name, default=default, type=type, help=help + " Default: %(default)s.", **kwargs
)
def print_arguments(args):
log.info("----------- Configuration Arguments -----------")
for arg, value in sorted(six.iteritems(vars(args))):
log.info("%s: %s" % (arg, value))
log.info("------------------------------------------------")
def check_cuda(
use_cuda,
err="\nYou can not set use_cuda = True in the model because you are using paddlepaddle-cpu.\n \
Please: 1. Install paddlepaddle-gpu to run your models on GPU or 2. Set use_cuda = False to run models on CPU.\n",
):
try:
if use_cuda is True and fluid.is_compiled_with_cuda() is False:
log.error(err)
sys.exit(1)
except Exception:
pass
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import paddle.fluid as fluid
log = logging.getLogger(__name__)
def init_checkpoint(exe, init_checkpoint_path, main_program):
assert os.path.exists(init_checkpoint_path), "[%s] cannot be found." % init_checkpoint_path
def existed_persistables(var):
if not fluid.io.is_persistable(var):
return False
if not os.path.exists(os.path.join(init_checkpoint_path, var.name)):
print("Var not exists: [%s]\t%s" % (var.name, os.path.join(init_checkpoint_path, var.name)))
# else:
# print ("Var exists: [%s]" % (var.name))
return os.path.exists(os.path.join(init_checkpoint_path, var.name))
fluid.io.load_vars(exe, init_checkpoint_path, main_program=main_program, predicate=existed_persistables)
log.info("Load model from {}".format(init_checkpoint_path))
def init_pretraining_params(exe, pretraining_params_path, main_program):
assert os.path.exists(pretraining_params_path), "[%s] cannot be found." % pretraining_params_path
def existed_params(var):
if not isinstance(var, fluid.framework.Parameter):
return False
if not os.path.exists(os.path.join(pretraining_params_path, var.name)):
print("Var not exists: [%s]\t%s" % (var.name, os.path.join(pretraining_params_path, var.name)))
# else:
# print ("Var exists: [%s]" % (var.name))
return os.path.exists(os.path.join(pretraining_params_path, var.name))
fluid.io.load_vars(exe, pretraining_params_path, main_program=main_program, predicate=existed_params)
log.info("Load pretraining parameters from {}.".format(pretraining_params_path))
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
export CUDA_VISIBLE_DEVICES=0
QUESTION=$1
# Example QUESTION (Chinese): "NFC咋开门" ("How do I unlock the door with NFC?")
if [ $# != 1 ]; then
echo "USAGE: sh script/run_cross_encoder_test.sh \$QUESTION"
exit 1
fi
# compute scores for QUESTION against the OCR parsing results with the Rerank module
cd Rerank
bash run_test.sh ${QUESTION}
cd ..
# extract the answer for QUESTION from the top-1 ranked result
cd Extraction
bash run_test.sh ${QUESTION}
cd ..
[ERNIE-Layout](../../../model_zoo/ernie-layout)