# run_onnx_squad.py

import argparse
import collections
import json
import math
import os
import sys

import numpy as np
import onnxruntime as onnxrt
import six
from tokenizers import BertWordPieceTokenizer
from tokenizers import pre_tokenizers

# Per-feature model output: the feature's unique id together with its
# start-position and end-position logits.
RawResult = collections.namedtuple(
    "RawResult", "unique_id start_logits end_logits")

# Bookkeeping for one model input window: its unique id, the token strings,
# the index of the example it came from, and two maps keyed by token position
# (position -> original word index, position -> "is max context" flag).
Feature = collections.namedtuple(
    "Feature",
    "unique_id tokens example_index token_to_orig_map "
    "token_is_max_context")

class SquadExample(object):
    """One SQuAD question together with the context it is asked about.

    Attributes:
        qas_id: Identifier of the question.
        question_text: The question string.
        doc_tokens: The context as a list of word strings.
        orig_answer_text: Answer text, or None when not provided.
        start_position: Answer start position, or None when not provided.
        end_position: Answer end position, or None when not provided.
    """

    def __init__(self,
                 qas_id,
                 question_text,
                 doc_tokens,
                 orig_answer_text=None,
                 start_position=None,
                 end_position=None):
        self.qas_id = qas_id
        self.question_text = question_text
        self.doc_tokens = doc_tokens
        self.orig_answer_text = orig_answer_text
        self.start_position = start_position
        self.end_position = end_position

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        """Return a comma-separated, human-readable summary of the example."""
        parts = [
            "qas_id: %s" % (self.qas_id,),
            "question_text: %s" % (self.question_text,),
            "doc_tokens: [%s]" % (" ".join(self.doc_tokens),),
        ]
        # Compare against None (not truthiness) so a valid position of 0 is
        # still reported, and test each field on its own so a missing
        # end_position is skipped instead of being formatted with %d.
        if self.start_position is not None:
            parts.append("start_position: %d" % (self.start_position,))
        if self.end_position is not None:
            parts.append("end_position: %d" % (self.end_position,))
        return ", ".join(parts)

def check_is_max_context(doc_spans, cur_span_index, position):
    """Tell whether ``cur_span_index`` is the best span for ``position``.

    Every span that contains ``position`` is scored by the smaller of its
    left and right context sizes plus ``0.01 * span.length``. The span with
    the highest score wins; on a tie the earliest span is kept.

    Args:
        doc_spans: Sequence of objects with ``start`` and ``length``.
        cur_span_index: Index of the span being tested.
        position: Token position in the document.

    Returns:
        True if ``cur_span_index`` is the winning span, otherwise False
        (including when no span contains ``position``).
    """
    winner = None
    winner_score = None
    for idx, span in enumerate(doc_spans):
        last = span.start + span.length - 1
        if not (span.start <= position <= last):
            # The position lies outside this span.
            continue
        context = min(position - span.start, last - position)
        candidate = context + 0.01 * span.length
        if winner_score is None or candidate > winner_score:
            winner, winner_score = idx, candidate

    return winner == cur_span_index

def convert_examples_to_features(examples, tokenizer, max_seq_length,
                                 doc_stride, max_query_length):
    """Convert examples into padded model inputs plus per-window metadata.

    Each example's context is cut into one or more overlapping windows so
    that ``[CLS] question [SEP] window [SEP]`` fits into ``max_seq_length``
    positions. One feature is produced per window.

    Args:
        examples: Iterable of objects exposing ``question_text`` and
            ``doc_tokens`` (a list of word strings).
        tokenizer: Object providing ``encode(text, add_special_tokens=...)``
            whose result has a ``tokens`` list, and ``token_to_id(token)``.
        max_seq_length: Length every returned sequence is zero-padded to.
        doc_stride: Number of tokens the window advances between spans.
        max_query_length: Question tokens beyond this count are dropped.

    Returns:
        A tuple ``(input_ids, input_mask, segment_ids, extra)``. The first
        three are int64 numpy arrays stacked per feature; ``extra`` is the
        list of matching ``Feature`` records.

    Raises:
        ValueError: If a context cannot be windowed because no room is left
            for context tokens or ``doc_stride`` is not positive while the
            context needs more than one window.
    """
    doc_span_type = collections.namedtuple("DocSpan", ["start", "length"])

    res_input_ids = []
    res_input_mask = []
    res_segment_ids = []
    extra = []
    unique_id = 0

    for (example_index, example) in enumerate(examples):

        # Tokenize the question. Special tokens are added explicitly below,
        # so they must not be added here as well (otherwise [CLS]/[SEP] end
        # up duplicated). Work on the plain token list so that truncation by
        # slicing is valid.
        query_tokens = list(
            tokenizer.encode(example.question_text,
                             add_special_tokens=False).tokens)
        if len(query_tokens) > max_query_length:
            query_tokens = query_tokens[0:max_query_length]

        # Tokenize the context word by word, remembering which original word
        # every sub-token came from.
        tok_to_orig_index = []
        all_doc_tokens = []
        for (word_index, word) in enumerate(example.doc_tokens):
            sub_tokens = tokenizer.encode(word, add_special_tokens=False).tokens
            tok_to_orig_index.extend([word_index] * len(sub_tokens))
            all_doc_tokens.extend(sub_tokens)

        # Three positions are reserved for [CLS], [SEP] and the final [SEP].
        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3

        # When the context is longer than one window, slide a window over it.
        doc_spans = []
        start_offset = 0
        while start_offset < len(all_doc_tokens):
            length = min(len(all_doc_tokens) - start_offset,
                         max_tokens_for_doc)
            if length <= 0:
                raise ValueError(
                    "max_seq_length=%r leaves no room for context tokens "
                    "after a %d-token question" %
                    (max_seq_length, len(query_tokens)))
            doc_spans.append(doc_span_type(start=start_offset, length=length))
            if start_offset + length == len(all_doc_tokens):
                # The window already reaches the end of the context.
                break
            if doc_stride <= 0:
                # A non-positive stride would never advance the window.
                raise ValueError(
                    "doc_stride must be positive to window a long context, "
                    "got %r" % (doc_stride,))
            start_offset += min(length, doc_stride)

        # Join the question with every context window.
        for (doc_span_index, doc_span) in enumerate(doc_spans):
            tokens = ["[CLS]"] + query_tokens + ["[SEP]"]
            segment_ids = [0] * len(tokens)
            token_to_orig_map = {}
            token_is_max_context = {}

            for offset in range(doc_span.length):
                split_token_index = doc_span.start + offset
                position = len(tokens)
                token_to_orig_map[position] = tok_to_orig_index[
                    split_token_index]
                token_is_max_context[position] = check_is_max_context(
                    doc_spans, doc_span_index, split_token_index)
                tokens.append(all_doc_tokens[split_token_index])
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)

            input_ids = [tokenizer.token_to_id(token) for token in tokens]

            # Mask is 1 for real tokens and 0 for padding.
            input_mask = [1] * len(input_ids)

            # Zero-pad all three sequences up to max_seq_length.
            padding = [0] * (max_seq_length - len(input_ids))
            input_ids.extend(padding)
            input_mask.extend(padding)
            segment_ids.extend(padding)

            res_input_ids.append(np.array(input_ids, dtype=np.int64))
            res_input_mask.append(np.array(input_mask, dtype=np.int64))
            res_segment_ids.append(np.array(segment_ids, dtype=np.int64))
            extra.append(
                Feature(unique_id=unique_id,
                        tokens=tokens,
                        example_index=example_index,
                        token_to_orig_map=token_to_orig_map,
                        token_is_max_context=token_is_max_context))
            unique_id += 1
    return np.array(res_input_ids), np.array(res_input_mask), np.array(
        res_segment_ids), extra

# Read a SQuAD JSON file into a list of SquadExample objects.
def read_squad_examples(input_file):
    """Load a SQuAD-style JSON file as a list of ``SquadExample`` objects.

    The file's top-level ``"data"`` list is walked entry by entry. Every
    paragraph's ``"context"`` is split into words on whitespace, and one
    example is created per item in the paragraph's ``"qas"`` list. Answer
    fields are left as None.

    Args:
        input_file: Path of the JSON file to read.

    Returns:
        List of ``SquadExample`` objects. Examples from the same paragraph
        share one ``doc_tokens`` list.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # text encoding.
    with open(input_file, "r", encoding="utf-8") as f:
        input_data = json.load(f)["data"]

    def is_whitespace(c):
        # 0x202F is NARROW NO-BREAK SPACE.
        return c in (" ", "\t", "\r", "\n") or ord(c) == 0x202F

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            # Split the context into whitespace-delimited words.
            doc_tokens = []
            prev_is_whitespace = True
            for c in paragraph["context"]:
                if is_whitespace(c):
                    prev_is_whitespace = True
                    continue
                if prev_is_whitespace:
                    # First character after whitespace starts a new word.
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False

            # One example per question asked about this paragraph.
            for qa in paragraph["qas"]:
                examples.append(
                    SquadExample(qas_id=qa["id"],
                                 question_text=qa["question"],
                                 doc_tokens=doc_tokens,
                                 orig_answer_text=None,
                                 start_position=None,
                                 end_position=None))
    return examples

def write_predictions(all_examples, all_features, all_results, n_best_size,
                      max_answer_length, do_lower_case, output_prediction_file,
                      output_nbest_file):
    """Turn raw logits into answer texts and write them as JSON files.

    For every example the best start/end index pairs across its features are
    collected, filtered, ranked by ``start_logit + end_logit`` and mapped
    back to words of the original context. The top entry becomes the
    prediction; the ranked list (with softmax probabilities) is the n-best
    output.

    Args:
        all_examples: Sequence of examples exposing ``qas_id`` and
            ``doc_tokens``.
        all_features: Iterable of features exposing ``unique_id``,
            ``example_index``, ``tokens``, ``token_to_orig_map`` and
            ``token_is_max_context``.
        all_results: Iterable of results exposing ``unique_id``,
            ``start_logits`` and ``end_logits``.
        n_best_size: Number of top start/end indexes considered per feature
            and maximum number of n-best entries per example.
        max_answer_length: Maximum answer length in tokens.
        do_lower_case: Accepted for interface compatibility; not used here.
        output_prediction_file: Path for the ``{qas_id: text}`` JSON file.
        output_nbest_file: Path for the ``{qas_id: [entries]}`` JSON file.
    """
    example_index_to_features = collections.defaultdict(list)
    for feature in all_features:
        example_index_to_features[feature.example_index].append(feature)

    unique_id_to_result = {}
    for result in all_results:
        unique_id_to_result[result.unique_id] = result

    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction", [
            "feature_index", "start_index", "end_index", "start_logit",
            "end_logit"
        ])
    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "NbestPrediction", ["text", "start_logit", "end_logit"])

    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    for (example_index, example) in enumerate(all_examples):
        features = example_index_to_features[example_index]
        prelim_predictions = []
        for (feature_index, feature) in enumerate(features):
            if feature.unique_id not in unique_id_to_result:
                print("feature not in unique_Id", feature.unique_id)
                continue
            result = unique_id_to_result[feature.unique_id]

            # Keep only the n_best_size highest-scoring start/end indexes.
            start_indexes = get_best_indexes(result.start_logits, n_best_size)
            end_indexes = get_best_indexes(result.end_logits, n_best_size)

            num_tokens = len(feature.tokens)
            for start_index in start_indexes:
                # Checks that depend only on the start index are done once
                # per start instead of once per (start, end) pair.
                if start_index >= num_tokens:
                    continue
                if start_index not in feature.token_to_orig_map:
                    continue
                if not feature.token_is_max_context.get(start_index, False):
                    continue
                for end_index in end_indexes:
                    if end_index >= num_tokens:
                        continue
                    if end_index not in feature.token_to_orig_map:
                        continue
                    if end_index < start_index:
                        continue
                    if end_index - start_index + 1 > max_answer_length:
                        continue
                    prelim_predictions.append(
                        _PrelimPrediction(
                            feature_index=feature_index,
                            start_index=start_index,
                            end_index=end_index,
                            start_logit=result.start_logits[start_index],
                            end_logit=result.end_logits[end_index]))

        # Rank candidates: the largest start + end logit sum comes first.
        prelim_predictions = sorted(prelim_predictions,
                                    key=lambda x:
                                    (x.start_logit + x.end_logit),
                                    reverse=True)

        seen_predictions = set()
        nbest = []
        for pred in prelim_predictions:
            # Stop once n_best_size answers have been collected.
            if len(nbest) >= n_best_size:
                break
            feature = features[pred.feature_index]
            orig_doc_start = feature.token_to_orig_map[pred.start_index]
            orig_doc_end = feature.token_to_orig_map[pred.end_index]
            orig_text = " ".join(
                example.doc_tokens[orig_doc_start:(orig_doc_end + 1)])

            # Different token spans can map to the same words; keep only the
            # best-ranked occurrence so duplicates neither fill the n-best
            # list nor split the probability mass.
            if orig_text in seen_predictions:
                continue
            seen_predictions.add(orig_text)

            nbest.append(
                _NbestPrediction(text=orig_text,
                                 start_logit=pred.start_logit,
                                 end_logit=pred.end_logit))

        # Guarantee at least one entry so the outputs are always populated.
        if not nbest:
            nbest.append(
                _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))

        total_scores = [
            entry.start_logit + entry.end_logit for entry in nbest
        ]
        probs = compute_softmax(total_scores)

        nbest_json = []
        for (i, entry) in enumerate(nbest):
            output = collections.OrderedDict()
            output["text"] = entry.text
            output["probability"] = probs[i]
            output["start_logit"] = float(entry.start_logit)
            output["end_logit"] = float(entry.end_logit)
            nbest_json.append(output)

        # The top-ranked entry is the final answer for this question.
        all_predictions[example.qas_id] = nbest_json[0]["text"]
        all_nbest_json[example.qas_id] = nbest_json

    with open(output_prediction_file, "w") as writer:
        writer.write(json.dumps(all_predictions, indent=4) + "\n")

    with open(output_nbest_file, "w") as writer:
        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")

# Sort the logits and return the indexes of the n_best_size highest scores.
def get_best_indexes(logits, n_best_size):
    """Return the indexes of the ``n_best_size`` largest logits.

    Indexes are ordered from highest to lowest score; equal scores keep
    their original relative order. Fewer indexes are returned when there are
    fewer logits than ``n_best_size``.
    """
    ranked = sorted(enumerate(logits),
                    key=lambda pair: pair[1],
                    reverse=True)
    return [
        index for rank, (index, _score) in enumerate(ranked)
        if rank < n_best_size
    ]

# Compute the softmax of a list of scores.
def compute_softmax(scores):
    """Return the softmax probabilities of ``scores`` as a list.

    The maximum score is subtracted before exponentiating so large scores do
    not overflow. An empty input yields an empty list.
    """
    if not scores:
        return []

    peak = max(scores)
    exps = [math.exp(value - peak) for value in scores]

    # Accumulate left to right, starting from 0.0.
    denominator = 0.0
    for term in exps:
        denominator += term

    return [term / denominator for term in exps]