Commit e6b93898 authored by liucong

Remove some code and documentation

parent 35ad452d
Pipeline #295 failed with stages in 0 seconds
import numpy as np
import json
import os.path
import tokenizers
import collections
from run_onnx_squad import read_squad_examples, write_predictions, convert_examples_to_features
import migraphx

RawResult = collections.namedtuple("RawResult",
                                   ["unique_id", "start_logits", "end_logits"])

# Data pre-processing
input_file = './model/inputs_data.json'
# Use read_squad_examples from run_onnx_squad to read the input file and split the context into individual words
eval_examples = read_squad_examples(input_file)

max_seq_length = 256    # maximum length of the input sequence
doc_stride = 128        # sliding-window stride
max_query_length = 64   # maximum length of the question
batch_size = 1          # batch size
n_best_size = 20        # number of candidate predictions to keep
max_answer_length = 30  # maximum length of the answer

# Tokenizer
vocab_file = os.path.join('./model/uncased_L-12_H-768_A-12', 'vocab.txt')
tokenizer = tokenizers.BertWordPieceTokenizer(vocab_file)

# Use convert_examples_to_features from run_onnx_squad to build the model inputs
input_ids, input_mask, segment_ids, extra_data = convert_examples_to_features(
    eval_examples, tokenizer, max_seq_length, doc_stride, max_query_length)

# Compile the model for the GPU
print("INFO: Parsing and compiling the model...")
model = migraphx.parse_onnx("./model/bertsquad-10.onnx")
model.compile(migraphx.get_target("gpu"), device_id=0)
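
# Optional sanity check: print the compiled program's parameter names and
# shapes so the feed-dict keys used in model.run() below can be verified.
# (Assumes the get_parameter_shapes() helper exposed by the MIGraphX Python
# bindings; drop this line if the installed build does not provide it.)
print("INFO: parameter shapes:", model.get_parameter_shapes())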

n = len(input_ids)
bs = batch_size
all_results = []

for idx in range(0, n):
    item = eval_examples[idx]
    # Inference
    result = model.run({
        "unique_ids_raw_output___9:0":
        np.array([item.qas_id], dtype=np.int64),  # example id (qas_id)
        "input_ids:0":
        input_ids[idx:idx + bs],    # token ids: the text converted to numeric ids
        "input_mask:0":
        input_mask[idx:idx + bs],   # attention mask
        "segment_ids:0":
        segment_ids[idx:idx + bs]   # segment ids: distinguish question tokens from context tokens
    })
    in_batch = result[1].get_shape().lens()[0]
    start_logits = [float(x) for x in result[1].tolist()]  # logits for the answer start position
    end_logits = [float(x) for x in result[0].tolist()]    # logits for the answer end position
    for i in range(0, in_batch):
        unique_id = len(all_results)
        all_results.append(
            RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits))

# Data post-processing: write out the predictions
output_dir = 'predictions'
os.makedirs(output_dir, exist_ok=True)
output_prediction_file = os.path.join(output_dir, "predictions.json")
output_nbest_file = os.path.join(output_dir, "nbest_predictions.json")
write_predictions(eval_examples, extra_data, all_results, n_best_size,
                  max_answer_length, True, output_prediction_file,
                  output_nbest_file)

with open(output_prediction_file) as json_file:
    test_data = json.load(json_file)
    print(json.dumps(test_data, indent=2))
{
  "data": [
    {
      "paragraphs": [
        {
          "context": "ROCm is the first open-source exascale-class platform for accelerated computing that’s also programming-language independent. It brings a philosophy of choice, minimalism and modular software development to GPU computing. You are free to choose or even develop tools and a language run time for your application. ROCm is built for scale, it supports multi-GPU computing and has a rich system run time with the critical features that large-scale application, compiler and language-run-time development requires. Since the ROCm ecosystem is comprised of open technologies: frameworks (Tensorflow / PyTorch), libraries (MIOpen / Blas / RCCL), programming model (HIP), inter-connect (OCD) and up streamed Linux® Kernel support – the platform is continually optimized for performance and extensibility.",
          "qas": [
            {
              "question": "What is ROCm?",
              "id": "1"
            },
            {
              "question": "Which frameworks does ROCm support?",
              "id": "2"
            },
            {
              "question": "What is ROCm built for?",
              "id": "3"
            }
          ]
        }
      ],
      "title": "AMD ROCm"
    }
  ]
}
\ No newline at end of file
onnxruntime
tokenizers
numpy
\ No newline at end of file
import argparse
import collections
import json
import math
import os
import sys

import numpy as np
import onnxruntime as onnxrt
import six
from tokenizers import BertWordPieceTokenizer
from tokenizers import pre_tokenizers

RawResult = collections.namedtuple("RawResult",
                                   ["unique_id", "start_logits", "end_logits"])

Feature = collections.namedtuple("Feature", [
    "unique_id", "tokens", "example_index", "token_to_orig_map",
    "token_is_max_context"
])

class SquadExample(object):
    def __init__(self,
                 qas_id,
                 question_text,
                 doc_tokens,
                 orig_answer_text=None,
                 start_position=None,
                 end_position=None):
        self.qas_id = qas_id
        self.question_text = question_text
        self.doc_tokens = doc_tokens
        self.orig_answer_text = orig_answer_text
        self.start_position = start_position
        self.end_position = end_position

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        s = []
        s.append("qas_id: %s" % (self.qas_id))
        s.append("question_text: %s" % (self.question_text))
        s.append("doc_tokens: [%s]" % (" ".join(self.doc_tokens)))
        if self.start_position:
            s.append("start_position: %d" % (self.start_position))
        if self.end_position:
            s.append("end_position: %d" % (self.end_position))
        return ", ".join(s)

def check_is_max_context(doc_spans, cur_span_index, position):
    best_score = None
    best_span_index = None
    for (span_index, doc_span) in enumerate(doc_spans):
        end = doc_span.start + doc_span.length - 1
        if position < doc_span.start:
            continue
        if position > end:
            continue
        num_left_context = position - doc_span.start
        num_right_context = end - position
        score = min(num_left_context,
                    num_right_context) + 0.01 * doc_span.length
        if best_score is None or score > best_score:
            best_score = score
            best_span_index = span_index
    return cur_span_index == best_span_index
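
# Worked example (hypothetical spans): with doc_spans == [DocSpan(start=0, length=4),
# DocSpan(start=2, length=4)], token position 3 has 3 tokens of left context and 0 of
# right context in the first span, but 1 left and 2 right in the second, so the second
# span scores higher: check_is_max_context(doc_spans, 1, 3) is True while
# check_is_max_context(doc_spans, 0, 3) is False.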

def convert_examples_to_features(examples, tokenizer, max_seq_length,
                                 doc_stride, max_query_length):
    res_input_ids = []
    res_input_mask = []
    res_segment_ids = []
    extra = []
    unique_id = 0
    for (example_index, example) in enumerate(examples):
        # Tokenize the question text
        query_tokens = tokenizer.encode(example.question_text)
        if len(query_tokens) > max_query_length:
            query_tokens = query_tokens[0:max_query_length]
        # Tokenize the context text
        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = []
        for (i, token) in enumerate(example.doc_tokens):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.encode(token, add_special_tokens=False)
            for sub_token in sub_tokens.tokens:
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)
        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
        # When the context is longer than the allowed maximum, split it with a sliding window.
        _DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
        doc_spans = []
        start_offset = 0
        while start_offset < len(all_doc_tokens):
            length = len(all_doc_tokens) - start_offset
            if length > max_tokens_for_doc:
                length = max_tokens_for_doc
            doc_spans.append(_DocSpan(start=start_offset, length=length))
            if start_offset + length == len(all_doc_tokens):
                # The window has reached the end of the context, so stop sliding.
                break
            start_offset += min(length, doc_stride)
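        # Example with hypothetical sizes: if len(all_doc_tokens) == 300,
        # max_tokens_for_doc == 200 and doc_stride == 128, the loop above
        # produces DocSpan(start=0, length=200) and DocSpan(start=128, length=172),
        # i.e. overlapping windows that advance by doc_stride tokens.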
        # Concatenate the question and the context for every document span
        for (doc_span_index, doc_span) in enumerate(doc_spans):
            tokens = []
            token_to_orig_map = {}
            token_is_max_context = {}
            segment_ids = []
            tokens.append("[CLS]")
            segment_ids.append(0)
            for token in query_tokens.tokens:
                tokens.append(token)
                segment_ids.append(0)
            tokens.append("[SEP]")
            segment_ids.append(0)
            for i in range(doc_span.length):
                split_token_index = doc_span.start + i
                token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
                is_max_context = check_is_max_context(doc_spans,
                                                      doc_span_index,
                                                      split_token_index)
                token_is_max_context[len(tokens)] = is_max_context
                tokens.append(all_doc_tokens[split_token_index])
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)
            input_ids = []
            for token in tokens:
                input_ids.append(tokenizer.token_to_id(token))
            # The mask is 1 for real tokens and 0 for padding tokens.
            input_mask = [1] * len(input_ids)
            # Zero-pad the sequence up to max_seq_length.
            while len(input_ids) < max_seq_length:
                input_ids.append(0)
                input_mask.append(0)
                segment_ids.append(0)
            res_input_ids.append(np.array(input_ids, dtype=np.int64))
            res_input_mask.append(np.array(input_mask, dtype=np.int64))
            res_segment_ids.append(np.array(segment_ids, dtype=np.int64))
            feature = Feature(unique_id=unique_id,
                              tokens=tokens,
                              example_index=example_index,
                              token_to_orig_map=token_to_orig_map,
                              token_is_max_context=token_is_max_context)
            extra.append(feature)
            unique_id += 1
    return np.array(res_input_ids), np.array(res_input_mask), np.array(
        res_segment_ids), extra

# Read a SQuAD-style JSON file into a list of SquadExample objects
def read_squad_examples(input_file):
    with open(input_file, "r") as f:
        input_data = json.load(f)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for idx, entry in enumerate(input_data):
        # Split each context into whitespace-separated words stored in doc_tokens
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)
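            # Example: for paragraph_text == "ROCm is open", doc_tokens ends up as
            # ["ROCm", "is", "open"] and char_to_word_offset maps each character to
            # the index of the most recent word (spaces map to the preceding word):
            # [0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2].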
            # Read each question and its id
            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                # Store the context and question together in a SquadExample
                example = SquadExample(qas_id=qas_id,
                                       question_text=question_text,
                                       doc_tokens=doc_tokens,
                                       orig_answer_text=orig_answer_text,
                                       start_position=start_position,
                                       end_position=end_position)
                examples.append(example)
    return examples

def write_predictions(all_examples, all_features, all_results, n_best_size,
                      max_answer_length, do_lower_case, output_prediction_file,
                      output_nbest_file):
    example_index_to_features = collections.defaultdict(list)
    for feature in all_features:
        example_index_to_features[feature.example_index].append(feature)

    unique_id_to_result = {}
    for result in all_results:
        unique_id_to_result[result.unique_id] = result

    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction", [
            "feature_index", "start_index", "end_index", "start_logit",
            "end_logit"
        ])

    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    for (example_index, example) in enumerate(all_examples):
        features = example_index_to_features[example_index]
        prelim_predictions = []
        for (feature_index, feature) in enumerate(features):
            if not feature.unique_id in unique_id_to_result:
                print("feature not in unique_Id", feature.unique_id)
                continue
            result = unique_id_to_result[feature.unique_id]
            # Take the indices of the top n_best_size start and end logits
            start_indexes = get_best_indexes(result.start_logits, n_best_size)
            end_indexes = get_best_indexes(result.end_logits, n_best_size)
            # Filter out invalid start/end index combinations
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if start_index >= len(feature.tokens):
                        continue
                    if end_index >= len(feature.tokens):
                        continue
                    if start_index not in feature.token_to_orig_map:
                        continue
                    if end_index not in feature.token_to_orig_map:
                        continue
                    if not feature.token_is_max_context.get(
                            start_index, False):
                        continue
                    if end_index < start_index:
                        continue
                    length = end_index - start_index + 1
                    if length > max_answer_length:
                        continue
                    prelim_predictions.append(
                        _PrelimPrediction(
                            feature_index=feature_index,
                            start_index=start_index,
                            end_index=end_index,
                            start_logit=result.start_logits[start_index],
                            end_logit=result.end_logits[end_index]))

        # Sort so the candidate with the largest start_logit + end_logit comes first
        prelim_predictions = sorted(prelim_predictions,
                                    key=lambda x:
                                    (x.start_logit + x.end_logit),
                                    reverse=True)
        _NbestPrediction = collections.namedtuple(
            "NbestPrediction", ["text", "start_logit", "end_logit"])
        seen_predictions = {}
        nbest = []
        for pred in prelim_predictions:
            # Keep only the top n_best_size candidates
            if len(nbest) >= n_best_size:
                break
            feature = features[pred.feature_index]
            tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
            orig_doc_start = feature.token_to_orig_map[pred.start_index]
            orig_doc_end = feature.token_to_orig_map[pred.end_index]
            orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
            tok_text = " ".join(tok_tokens)
            # De-tokenize WordPiece pieces and strip surrounding whitespace
            tok_text = tok_text.replace(" ##", "")
            tok_text = tok_text.replace("##", "")
            tok_text = tok_text.strip()
            tok_text = " ".join(tok_text.split())
            orig_text = " ".join(orig_tokens)
            nbest.append(
                _NbestPrediction(text=orig_text,
                                 start_logit=pred.start_logit,
                                 end_logit=pred.end_logit))
        if not nbest:
            nbest.append(
                _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
        assert len(nbest) >= 1

        total_scores = []
        for entry in nbest:
            total_scores.append(entry.start_logit + entry.end_logit)
        probs = compute_softmax(total_scores)

        nbest_json = []
        for (i, entry) in enumerate(nbest):
            output = collections.OrderedDict()
            output["text"] = entry.text  # the predicted answer text
            output["probability"] = probs[i]  # softmax probability of this candidate
            output["start_logit"] = float(entry.start_logit)  # logit of the start position
            output["end_logit"] = float(entry.end_logit)  # logit of the end position
            nbest_json.append(output)

        # The highest-scoring candidate is the final answer for this question
        all_predictions[example.qas_id] = nbest_json[0]["text"]
        all_nbest_json[example.qas_id] = nbest_json

    with open(output_prediction_file, "w") as writer:
        writer.write(json.dumps(all_predictions, indent=4) + "\n")
    with open(output_nbest_file, "w") as writer:
        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")

# Sort the logits and return the indices of the top n_best_size values
def get_best_indexes(logits, n_best_size):
    index_and_score = sorted(enumerate(logits),
                             key=lambda x: x[1],
                             reverse=True)
    best_indexes = []
    for i in range(len(index_and_score)):
        if i >= n_best_size:
            break
        best_indexes.append(index_and_score[i][0])
    return best_indexes

# Compute the softmax of a list of scores
def compute_softmax(scores):
    if not scores:
        return []
    max_score = None
    for score in scores:
        if max_score is None or score > max_score:
            max_score = score

    exp_scores = []
    total_sum = 0.0
    for score in scores:
        x = math.exp(score - max_score)
        exp_scores.append(x)
        total_sum += x

    probs = []
    for score in exp_scores:
        probs.append(score / total_sum)
    return probs
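
# Quick sanity check for the two helpers above (hypothetical values, not part
# of the original script): get_best_indexes should return the positions of the
# largest logits, and compute_softmax should return probabilities summing to 1.
if __name__ == "__main__":
    sample_logits = [0.1, 2.3, -1.0, 1.7, 0.4]
    print(get_best_indexes(sample_logits, n_best_size=2))  # -> [1, 3]
    probs = compute_softmax(sample_logits)
    print(probs, sum(probs))  # five probabilities that sum to 1.0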