Commit e6b93898 authored by liucong

Remove some code and documentation

parent 35ad452d
Pipeline #295 failed with stages in 0 seconds
import numpy as np
import json
import os.path
import tokenizers
import collections
from run_onnx_squad import read_squad_examples, write_predictions, convert_examples_to_features
import migraphx

RawResult = collections.namedtuple("RawResult",
                                   ["unique_id", "start_logits", "end_logits"])

# Data pre-processing
input_file = './model/inputs_data.json'
# Use read_squad_examples from run_onnx_squad to read the input file and split the context into individual words
eval_examples = read_squad_examples(input_file)

max_seq_length = 256    # maximum length of the input sequence
doc_stride = 128        # sliding-window stride
max_query_length = 64   # maximum length of the question
batch_size = 1          # batch size
n_best_size = 20        # number of candidate predictions to keep
max_answer_length = 30  # maximum length of the answer

# Tokenizer
vocab_file = os.path.join('./model/uncased_L-12_H-768_A-12', 'vocab.txt')
tokenizer = tokenizers.BertWordPieceTokenizer(vocab_file)

# Use convert_examples_to_features from run_onnx_squad to build the model inputs
input_ids, input_mask, segment_ids, extra_data = convert_examples_to_features(
    eval_examples, tokenizer, max_seq_length, doc_stride, max_query_length)

# Compile the model for the GPU
print("INFO: Parsing and compiling the model...")
model = migraphx.parse_onnx("./model/bertsquad-10.onnx")
model.compile(migraphx.get_target("gpu"), device_id=0)
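
# Optional sanity check: print the compiled program's parameter names and
# shapes so the feed-dict keys used in model.run() below can be verified.
# (Assumes the get_parameter_shapes() helper exposed by the MIGraphX Python
# bindings; drop this line if the installed build does not provide it.)
print("INFO: parameter shapes:", model.get_parameter_shapes())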

n = len(input_ids)
bs = batch_size
all_results = []

for idx in range(0, n):
    item = eval_examples[idx]
    # Inference
    result = model.run({
        "unique_ids_raw_output___9:0":
        np.array([item.qas_id], dtype=np.int64),  # example id (qas_id)
        "input_ids:0":
        input_ids[idx:idx + bs],    # token ids: the text converted to numeric ids
        "input_mask:0":
        input_mask[idx:idx + bs],   # attention mask
        "segment_ids:0":
        segment_ids[idx:idx + bs]   # segment ids: distinguish question tokens from context tokens
    })
    in_batch = result[1].get_shape().lens()[0]
    start_logits = [float(x) for x in result[1].tolist()]  # logits for the answer start position
    end_logits = [float(x) for x in result[0].tolist()]    # logits for the answer end position
    for i in range(0, in_batch):
        unique_id = len(all_results)
        all_results.append(
            RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits))

# Data post-processing: write out the predictions
output_dir = 'predictions'
os.makedirs(output_dir, exist_ok=True)
output_prediction_file = os.path.join(output_dir, "predictions.json")
output_nbest_file = os.path.join(output_dir, "nbest_predictions.json")
write_predictions(eval_examples, extra_data, all_results, n_best_size,
                  max_answer_length, True, output_prediction_file,
                  output_nbest_file)

with open(output_prediction_file) as json_file:
    test_data = json.load(json_file)
    print(json.dumps(test_data, indent=2))
{
  "data": [
    {
      "paragraphs": [
        {
          "context": "ROCm is the first open-source exascale-class platform for accelerated computing that’s also programming-language independent. It brings a philosophy of choice, minimalism and modular software development to GPU computing. You are free to choose or even develop tools and a language run time for your application. ROCm is built for scale, it supports multi-GPU computing and has a rich system run time with the critical features that large-scale application, compiler and language-run-time development requires. Since the ROCm ecosystem is comprised of open technologies: frameworks (Tensorflow / PyTorch), libraries (MIOpen / Blas / RCCL), programming model (HIP), inter-connect (OCD) and up streamed Linux® Kernel support – the platform is continually optimized for performance and extensibility.",
          "qas": [
            {
              "question": "What is ROCm?",
              "id": "1"
            },
            {
              "question": "Which frameworks does ROCm support?",
              "id": "2"
            },
            {
              "question": "What is ROCm built for?",
              "id": "3"
            }
          ]
        }
      ],
      "title": "AMD ROCm"
    }
  ]
}
\ No newline at end of file
onnxruntime
tokenizers
numpy
\ No newline at end of file
import argparse
import collections
import json
import math
import os
import sys

import numpy as np
import onnxruntime as onnxrt
import six
from tokenizers import BertWordPieceTokenizer
from tokenizers import pre_tokenizers

RawResult = collections.namedtuple("RawResult",
                                   ["unique_id", "start_logits", "end_logits"])

Feature = collections.namedtuple("Feature", [
    "unique_id", "tokens", "example_index", "token_to_orig_map",
    "token_is_max_context"
])

class SquadExample(object):
    def __init__(self,
                 qas_id,
                 question_text,
                 doc_tokens,
                 orig_answer_text=None,
                 start_position=None,
                 end_position=None):
        self.qas_id = qas_id
        self.question_text = question_text
        self.doc_tokens = doc_tokens
        self.orig_answer_text = orig_answer_text
        self.start_position = start_position
        self.end_position = end_position

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        s = []
        s.append("qas_id: %s" % (self.qas_id))
        s.append("question_text: %s" % (self.question_text))
        s.append("doc_tokens: [%s]" % (" ".join(self.doc_tokens)))
        if self.start_position:
            s.append("start_position: %d" % (self.start_position))
        if self.end_position:
            s.append("end_position: %d" % (self.end_position))
        return ", ".join(s)

def check_is_max_context(doc_spans, cur_span_index, position):
    best_score = None
    best_span_index = None
    for (span_index, doc_span) in enumerate(doc_spans):
        end = doc_span.start + doc_span.length - 1
        if position < doc_span.start:
            continue
        if position > end:
            continue
        num_left_context = position - doc_span.start
        num_right_context = end - position
        score = min(num_left_context,
                    num_right_context) + 0.01 * doc_span.length
        if best_score is None or score > best_score:
            best_score = score
            best_span_index = span_index
    return cur_span_index == best_span_index
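
# Worked example (hypothetical spans): with doc_spans == [DocSpan(start=0, length=4),
# DocSpan(start=2, length=4)], token position 3 has 3 tokens of left context and 0 of
# right context in the first span, but 1 left and 2 right in the second, so the second
# span scores higher: check_is_max_context(doc_spans, 1, 3) is True while
# check_is_max_context(doc_spans, 0, 3) is False.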

def convert_examples_to_features(examples, tokenizer, max_seq_length,
                                 doc_stride, max_query_length):
    res_input_ids = []
    res_input_mask = []
    res_segment_ids = []
    extra = []
    unique_id = 0
    for (example_index, example) in enumerate(examples):
        # Tokenize the question text
        query_tokens = tokenizer.encode(example.question_text)
        if len(query_tokens) > max_query_length:
            query_tokens = query_tokens[0:max_query_length]
        # Tokenize the context text
        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = []
        for (i, token) in enumerate(example.doc_tokens):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.encode(token, add_special_tokens=False)
            for sub_token in sub_tokens.tokens:
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)
        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
        # When the context is longer than the allowed maximum, split it with a sliding window.
        _DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
        doc_spans = []
        start_offset = 0
        while start_offset < len(all_doc_tokens):
            length = len(all_doc_tokens) - start_offset
            if length > max_tokens_for_doc:
                length = max_tokens_for_doc
            doc_spans.append(_DocSpan(start=start_offset, length=length))
            if start_offset + length == len(all_doc_tokens):
                # The window has reached the end of the context, so stop sliding.
                break
            start_offset += min(length, doc_stride)
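        # Example with hypothetical sizes: if len(all_doc_tokens) == 300,
        # max_tokens_for_doc == 200 and doc_stride == 128, the loop above
        # produces DocSpan(start=0, length=200) and DocSpan(start=128, length=172),
        # i.e. overlapping windows that advance by doc_stride tokens.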
        # Concatenate the question and the context for every document span
        for (doc_span_index, doc_span) in enumerate(doc_spans):
            tokens = []
            token_to_orig_map = {}
            token_is_max_context = {}
            segment_ids = []
            tokens.append("[CLS]")
            segment_ids.append(0)
            for token in query_tokens.tokens:
                tokens.append(token)
                segment_ids.append(0)
            tokens.append("[SEP]")
            segment_ids.append(0)
            for i in range(doc_span.length):
                split_token_index = doc_span.start + i
                token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
                is_max_context = check_is_max_context(doc_spans,
                                                      doc_span_index,
                                                      split_token_index)
                token_is_max_context[len(tokens)] = is_max_context
                tokens.append(all_doc_tokens[split_token_index])
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)
            input_ids = []
            for token in tokens:
                input_ids.append(tokenizer.token_to_id(token))
            # The mask is 1 for real tokens and 0 for padding tokens.
            input_mask = [1] * len(input_ids)
            # Zero-pad the sequence up to max_seq_length.
            while len(input_ids) < max_seq_length:
                input_ids.append(0)
                input_mask.append(0)
                segment_ids.append(0)
            res_input_ids.append(np.array(input_ids, dtype=np.int64))
            res_input_mask.append(np.array(input_mask, dtype=np.int64))
            res_segment_ids.append(np.array(segment_ids, dtype=np.int64))
            feature = Feature(unique_id=unique_id,
                              tokens=tokens,
                              example_index=example_index,
                              token_to_orig_map=token_to_orig_map,
                              token_is_max_context=token_is_max_context)
            extra.append(feature)
            unique_id += 1
    return np.array(res_input_ids), np.array(res_input_mask), np.array(
        res_segment_ids), extra

# Read a SQuAD-style JSON file into a list of SquadExample objects
def read_squad_examples(input_file):
    with open(input_file, "r") as f:
        input_data = json.load(f)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for idx, entry in enumerate(input_data):
        # Split each context into whitespace-separated words stored in doc_tokens
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)
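            # Example: for paragraph_text == "ROCm is open", doc_tokens ends up as
            # ["ROCm", "is", "open"] and char_to_word_offset maps each character to
            # the index of the most recent word (spaces map to the preceding word):
            # [0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2].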
            # Read each question and its id
            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                # Store the context and question together in a SquadExample
                example = SquadExample(qas_id=qas_id,
                                       question_text=question_text,
                                       doc_tokens=doc_tokens,
                                       orig_answer_text=orig_answer_text,
                                       start_position=start_position,
                                       end_position=end_position)
                examples.append(example)
    return examples

def write_predictions(all_examples, all_features, all_results, n_best_size,
                      max_answer_length, do_lower_case, output_prediction_file,
                      output_nbest_file):
    example_index_to_features = collections.defaultdict(list)
    for feature in all_features:
        example_index_to_features[feature.example_index].append(feature)

    unique_id_to_result = {}
    for result in all_results:
        unique_id_to_result[result.unique_id] = result

    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction", [
            "feature_index", "start_index", "end_index", "start_logit",
            "end_logit"
        ])

    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    for (example_index, example) in enumerate(all_examples):
        features = example_index_to_features[example_index]
        prelim_predictions = []
        for (feature_index, feature) in enumerate(features):
            if not feature.unique_id in unique_id_to_result:
                print("feature not in unique_Id", feature.unique_id)
                continue
            result = unique_id_to_result[feature.unique_id]
            # Take the indices of the top n_best_size start and end logits
            start_indexes = get_best_indexes(result.start_logits, n_best_size)
            end_indexes = get_best_indexes(result.end_logits, n_best_size)
            # Filter out invalid start/end index combinations
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if start_index >= len(feature.tokens):
                        continue
                    if end_index >= len(feature.tokens):
                        continue
                    if start_index not in feature.token_to_orig_map:
                        continue
                    if end_index not in feature.token_to_orig_map:
                        continue
                    if not feature.token_is_max_context.get(
                            start_index, False):
                        continue
                    if end_index < start_index:
                        continue
                    length = end_index - start_index + 1
                    if length > max_answer_length:
                        continue
                    prelim_predictions.append(
                        _PrelimPrediction(
                            feature_index=feature_index,
                            start_index=start_index,
                            end_index=end_index,
                            start_logit=result.start_logits[start_index],
                            end_logit=result.end_logits[end_index]))

        # Sort so the candidate with the largest start_logit + end_logit comes first
        prelim_predictions = sorted(prelim_predictions,
                                    key=lambda x:
                                    (x.start_logit + x.end_logit),
                                    reverse=True)
        _NbestPrediction = collections.namedtuple(
            "NbestPrediction", ["text", "start_logit", "end_logit"])
        seen_predictions = {}
        nbest = []
        for pred in prelim_predictions:
            # Keep only the top n_best_size candidates
            if len(nbest) >= n_best_size:
                break
            feature = features[pred.feature_index]
            tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
            orig_doc_start = feature.token_to_orig_map[pred.start_index]
            orig_doc_end = feature.token_to_orig_map[pred.end_index]
            orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
            tok_text = " ".join(tok_tokens)
            # De-tokenize WordPiece pieces and strip surrounding whitespace
            tok_text = tok_text.replace(" ##", "")
            tok_text = tok_text.replace("##", "")
            tok_text = tok_text.strip()
            tok_text = " ".join(tok_text.split())
            orig_text = " ".join(orig_tokens)
            nbest.append(
                _NbestPrediction(text=orig_text,
                                 start_logit=pred.start_logit,
                                 end_logit=pred.end_logit))
        if not nbest:
            nbest.append(
                _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
        assert len(nbest) >= 1

        total_scores = []
        for entry in nbest:
            total_scores.append(entry.start_logit + entry.end_logit)
        probs = compute_softmax(total_scores)

        nbest_json = []
        for (i, entry) in enumerate(nbest):
            output = collections.OrderedDict()
            output["text"] = entry.text  # the predicted answer text
            output["probability"] = probs[i]  # softmax probability of this candidate
            output["start_logit"] = float(entry.start_logit)  # logit of the start position
            output["end_logit"] = float(entry.end_logit)  # logit of the end position
            nbest_json.append(output)

        # The highest-scoring candidate is the final answer for this question
        all_predictions[example.qas_id] = nbest_json[0]["text"]
        all_nbest_json[example.qas_id] = nbest_json

    with open(output_prediction_file, "w") as writer:
        writer.write(json.dumps(all_predictions, indent=4) + "\n")
    with open(output_nbest_file, "w") as writer:
        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")

# Sort the logits and return the indices of the top n_best_size values
def get_best_indexes(logits, n_best_size):
    index_and_score = sorted(enumerate(logits),
                             key=lambda x: x[1],
                             reverse=True)
    best_indexes = []
    for i in range(len(index_and_score)):
        if i >= n_best_size:
            break
        best_indexes.append(index_and_score[i][0])
    return best_indexes

# Compute the softmax of a list of scores
def compute_softmax(scores):
    if not scores:
        return []
    max_score = None
    for score in scores:
        if max_score is None or score > max_score:
            max_score = score

    exp_scores = []
    total_sum = 0.0
    for score in scores:
        x = math.exp(score - max_score)
        exp_scores.append(x)
        total_sum += x

    probs = []
    for score in exp_scores:
        probs.append(score / total_sum)
    return probs
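
# Quick sanity check for the two helpers above (hypothetical values, not part
# of the original script): get_best_indexes should return the positions of the
# largest logits, and compute_softmax should return probabilities summing to 1.
if __name__ == "__main__":
    sample_logits = [0.1, 2.3, -1.0, 1.7, 0.4]
    print(get_best_indexes(sample_logits, n_best_size=2))  # -> [1, 3]
    probs = compute_softmax(sample_logits)
    print(probs, sum(probs))  # five probabilities that sum to 1.0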