############################ Install dependencies (online) ###############################
#cd ./3rdParty
#pip install rbuild-master.tar.gz
############################ Install dependencies (offline) ##############################
# Install the dependencies
cd ./3rdParty/rbuild_depend
pip install click-6.6-py2.py3-none-any.whl
pip install six-1.15.0-py2.py3-none-any.whl
pip install subprocess32-3.5.4.tar.gz
pip install cget-0.1.9.tar.gz
# Install rbuild
cd ../
pip install rbuild-master.tar.gz
# Set the minimum required CMake version
cmake_minimum_required(VERSION 3.5)
# Set the project name
project(Bert)
# Set the compiler
set(CMAKE_CXX_COMPILER g++)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") # C++17 is required for versions 2.2 and above
set(CMAKE_BUILD_TYPE release)
# Add include directories
set(INCLUDE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/Src/
${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility/
$ENV{DTKROOT}/include/
${CMAKE_CURRENT_SOURCE_DIR}/depend/include/)
include_directories(${INCLUDE_PATH})
# Add library search paths
set(LIBRARY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/depend/lib64/
$ENV{DTKROOT}/lib/)
link_directories(${LIBRARY_PATH})
# Add the dependent libraries
set(LIBRARY onnxruntime)
link_libraries(${LIBRARY})
# Add the source files
set(SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/Src/main.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/Bert.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility/tokenization.cpp
${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility/utf8proc.c
${CMAKE_CURRENT_SOURCE_DIR}/Src/Utility/Filesystem.cpp)
# Add the executable target
add_executable(Bert ${SOURCE_FILES})
# Bert
This example uses the Bert model to show how to run inference for a natural language processing model with the ONNXRuntime C++ API, covering data preparation, preprocessing, model inference, and postprocessing.
## Model overview
Natural Language Processing (NLP) is the set of theories and methods that enable effective communication between humans and computers in natural language, and it is an important direction in computer science and artificial intelligence. This example uses the classic Bert model to perform a question answering task. The model and tokenization files can be downloaded from https://pan.baidu.com/s/1yc30IzM4ocOpTpfFuUMR0w (extraction code: 8f1a); download the bertsquad-10.onnx file and the uncased_L-12_H-768_A-12 tokenization files and save them in the Resource/ folder. The overall model structure is shown in the figure below; it can also be inspected with the netron tool: https://netron.app/.
<img src="./Images/Bert_01.png" style="zoom:100%;" align=middle>
A question answering task takes a context passage and a question as input, and the model predicts the answer from the given text. For example:
```
1. Context: My name is Li Ming
2. Question: What's your name?
3. Answer: Li Ming
```
## Data preparation
In natural language processing, text data must be prepared first. As shown below, a question (question) and a context passage (context) are typically provided; you can prepare your own question and context as input data for model inference.
```json
{
"context": "ROCm is the first open-source exascale-class platform for accelerated computing that’s also programming-language independent. It brings a philosophy of choice, minimalism and modular software development to GPU computing. You are free to choose or even develop tools and a language run time for your application. ROCm is built for scale, it supports multi-GPU computing and has a rich system run time with the critical features that large-scale application, compiler and language-run-time development requires. Since the ROCm ecosystem is comprised of open technologies: frameworks (Tensorflow / PyTorch), libraries (MIOpen / Blas / RCCL), programming model (HIP), inter-connect (OCD) and up streamed Linux® Kernel support – the platform is continually optimized for performance and extensibility.",
"question": "What is ROCm?"
}
```
## Preprocessing
The provided question and context cannot be fed into the model directly; the following preprocessing is required:
1. Sliding window: applied when the question plus context exceeds 256 tokens, otherwise skipped.
2. Data concatenation: the original question and the context are concatenated into a single sequence that serves as the model input.
### Sliding window
For a question answering task, the key point is how to build the input data. The combined length of the context and the question is checked first: if it does not exceed 256 tokens, the data concatenation step is performed directly; otherwise a sliding window pass is run first to build the input sequences, followed by the concatenation step.
The sliding window operation is illustrated in the figure below:
<img src="./Images/Bert_03.png" style="zoom:80%;" align=middle>
As the figure shows, with a window size of 256 the sliding window splits the context into several sub-texts, which are then used in the data concatenation step.
The sliding window is implemented by the following code:
```c++
ErrorCode Bert::Preprocessing(...)
{
    ...
    // When the combined length of the context and the question exceeds the allowed maximum, use the sliding window
    if(tokens_text.size() + tokens_question.size() > max_seq_length - 5)
    {
        int windows_len = max_seq_length - 5 - tokens_question.size();
        std::vector<std::string> tokens_text_window(windows_len);
        std::vector<std::vector<std::string>> tokens_text_windows;
        // Set the start offset; each window of the context is stored in tokens_text_window
        int start_offset = 0;
        int position = 0;
        int n;
        while (start_offset < tokens_text.size())
        {
            n = 0;
            if(start_offset + windows_len > tokens_text.size())
            {
                for(int i = start_offset; i < tokens_text.size(); ++i)
                {
                    tokens_text_window[n] = tokens_text[i];
                    ++n;
                }
            }
            else
            {
                for(int i = start_offset; i < start_offset + windows_len; ++i)
                {
                    tokens_text_window[n] = tokens_text[i];
                    ++n;
                }
            }
            tokens_text_windows.push_back(tokens_text_window);
            start_offset += 256;
            ++position;
        }
    }
    ...
}
```
### Data concatenation
Once the question and the context (or its sub-texts) are available, they are concatenated; the process is shown in the figure below:
<img src="./Images/Bert_02.png" style="zoom:80%;" align=middle>
As the figure shows, the question and the context are concatenated into a single sequence, with the question and the context separated by the [SEP] symbol; the concatenated sequence is then fed into the model for feature extraction. Here "[CLS]" is a classification marker indicating that what follows is the question text, and "[SEP]" is a separator used to split the question from the context.
The concatenation is implemented by the following code:
```c++
ErrorCode Bert::Preprocessing(...)
{
    ...
    for(int i = 0; i < position; ++i)
    {
        // Concatenate the question and the context window
        input_id[0] = tokenizer.convert_token_to_id("[CLS]");
        segment_id[0] = 0;
        input_id[1] = tokenizer.convert_token_to_id("[CLS]");
        segment_id[1] = 0;
        for (int j = 0; j < tokens_question.size(); ++j)
        {
            input_id[j + 2] = tokenizer.convert_token_to_id(tokens_question[j]);
            segment_id[j + 2] = 0;
        }
        input_id[tokens_question.size() + 2] = tokenizer.convert_token_to_id("[SEP]");
        segment_id[tokens_question.size() + 2] = 0;
        input_id[tokens_question.size() + 3] = tokenizer.convert_token_to_id("[SEP]");
        segment_id[tokens_question.size() + 3] = 0;
        for (int j = 0; j < tokens_question.size(); ++j)
        {
            input_id[j + tokens_text_windows[i].size() + 4] = tokenizer.convert_token_to_id(tokens_text_windows[i][j]);
            segment_id[j + tokens_text_windows[i].size() + 4] = 1;
        }
        input_id[tokens_question.size() + tokens_text_windows[i].size() + 4] = tokenizer.convert_token_to_id("[SEP]");
        segment_id[tokens_question.size() + tokens_text_windows[i].size() + 4] = 1;
        int len = tokens_text_windows[i].size() + tokens_question.size() + 5;
        // A mask value of 1 marks a real token, 0 marks padding.
        std::fill(input_mask.begin(), input_mask.begin() + len, 1);
        std::fill(input_mask.begin() + len, input_mask.begin() + max_seq_length, 0);
        std::fill(input_id.begin() + len, input_id.begin() + max_seq_length, 0);
        std::fill(segment_id.begin() + len, segment_id.begin() + max_seq_length, 0);
    }
    ...
}
```
In natural language processing, text cannot be processed directly; it has to be encoded before it is fed into the model. The input of the Bert model therefore consists of input_id, segment_id and input_mask: input_id stores the numeric encoding of the text, segment_id stores the information that distinguishes the question from the context (question positions are marked 0, context positions 1), and input_mask stores the mask (the sequence length is fixed at 256; positions that contain real text are marked 1 and the remaining positions 0), telling the model which positions to attend to.
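As a schematic illustration only (following the figure above; the real numeric ids come from the vocab.txt of uncased_L-12_H-768_A-12, and the padding continues up to position 255), a question and context pair is laid out like this:
```
tokens:      [CLS] what  is  rocm  ?  [SEP] rocm  is  the  first ... [SEP] [PAD] ... [PAD]
input_id:    vocabulary id of each token above; 0 at every [PAD] position
segment_id:  0     0     0   0     0  0     1     1   1    1     ... 1     0     ... 0
input_mask:  1     1     1   1     1  1     1     1   1    1     ... 1     0     ... 0
```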
## Inference
After preprocessing, inference can be run to obtain the results.
```c++
ErrorCode Bert::Inference(...)
{
    ...
    for(int i = 0; i < input_ids.size(); ++i)
    {
        // Create the input data
        int64_t* input_test1 = (int64_t*)position_id[i];
        int64_t* input_test2 = (int64_t*)segment_id[i];
        int64_t* input_test3 = (int64_t*)input_mask[i];
        int64_t* input_test4 = (int64_t*)input_id[i];
        Ort::Value inputTensor1 = Ort::Value::CreateTensor<int64_t>(
            memoryInfo1,
            input_test1,
            inputTensorValues1.size(),
            inputShapes1.data(),
            inputShapes1.size());
        Ort::Value inputTensor2 = Ort::Value::CreateTensor<int64_t>(
            memoryInfo2,
            input_test2,
            inputTensorValues2.size(),
            inputShapes2.data(),
            inputShapes2.size());
        Ort::Value inputTensor3 = Ort::Value::CreateTensor<int64_t>(
            memoryInfo3,
            input_test3,
            inputTensorValues3.size(),
            inputShapes3.data(),
            inputShapes3.size());
        Ort::Value inputTensor4 = Ort::Value::CreateTensor<int64_t>(
            memoryInfo4,
            input_test4,
            inputTensorValues4.size(),
            inputShapes4.data(),
            inputShapes4.size());
        std::vector<Ort::Value> intput_tensors;
        intput_tensors.push_back(std::move(inputTensor1));
        intput_tensors.push_back(std::move(inputTensor2));
        intput_tensors.push_back(std::move(inputTensor3));
        intput_tensors.push_back(std::move(inputTensor4));
        // Run inference
        auto output_tensors = session->Run(Ort::RunOptions{nullptr}, input_node_names.data(), intput_tensors.data(), input_node_names.size(), output_node_names.data(), output_node_names.size());
        // Get the output tensors
        const float* start_data = output_tensors[1].GetTensorMutableData<float>(); // data pointer for the start positions
        const float* end_data = output_tensors[0].GetTensorMutableData<float>(); // data pointer for the end positions
        // Save the inference results
        for(int i = 0; i < 256; ++i)
        {
            start_position.push_back(start_data[i]);
            end_position.push_back(end_data[i]);
        }
    }
    return SUCCESS;
}
```
1. Preprocessing produces three groups of input parameters, input_id, segment_id and input_mask, which hold the text encoding, the segment information (distinguishing the question from the context) and the mask. In addition, position_id is set to 1 by default. Because the question plus context may exceed 256 tokens, there may be several sub-sequences, so a for loop is used: the four input groups of each sub-sequence are stored in a std::vector<Ort::Value>, and session->Run() is called to run the model and obtain the inference result.
2. For every token of the input sequence the model predicts the probability that it is the start and the end of the answer; these values are saved in start_position and end_position respectively for the later postprocessing step.
## Postprocessing
The raw inference result cannot be displayed directly as the answer of the question answering task; as shown in the figure below, further processing is needed to obtain the final prediction.
<img src="./Images/Bert_04.png" style="zoom:80%;" align=middle>
As the figure shows, postprocessing consists of the following steps:
1. Take the predictions: from the inference result, keep the K start positions and K end positions with the highest probability.
2. Filter: discard start and end positions that violate the filter rules.
3. Sort and output: sort the remaining candidates by the sum of the start and end probabilities and take the highest-scoring pair as the final prediction.
Filter rules:
1. The start and end positions must not exceed the length of the input sequence.
2. The start and end positions must lie inside the context.
3. The start position must come before the end position.
The postprocessing code of this example is in the Postprocessing() function of Src/NLP/Bert/Bert.cpp.
# Bert
This example uses the Bert model to show how to run inference for a natural language processing model with the ONNXRuntime Python API, covering data preparation, preprocessing, model inference, and postprocessing.
## Model overview
Natural Language Processing (NLP) is the set of theories and methods that enable effective communication between humans and computers in natural language, and it is an important direction in computer science and artificial intelligence. This example uses the classic Bert model to perform a question answering task. The model and tokenization files can be downloaded from https://pan.baidu.com/s/1yc30IzM4ocOpTpfFuUMR0w (extraction code: 8f1a); save the bertsquad-10.onnx file and the uncased_L-12_H-768_A-12 tokenization files in the Resource/ folder. The overall model structure is shown in the figure below; it can also be inspected with the netron tool: https://netron.app/.
<img src="./Images/Bert_01.png" style="zoom:100%;" align=middle>
A question answering task takes a context passage and a question as input, and the model predicts the answer from the context. For example:
```
1. Context: My name is Li Ming
2. Question: What's your name?
3. Answer: Li Ming
```
## Data preparation
This example stores the text data in a json file. As shown below, it contains questions (question) and a context passage (context); you can prepare your own questions and context as input data for model inference.
```json
{
"data": [
{
"paragraphs": [
{
"context": "ROCm is the first open-source exascale-class platform for accelerated computing that’s also programming-language independent. It brings a philosophy of choice, minimalism and modular software development to GPU computing. You are free to choose or even develop tools and a language run time for your application. ROCm is built for scale, it supports multi-GPU computing and has a rich system run time with the critical features that large-scale application, compiler and language-run-time development requires. Since the ROCm ecosystem is comprised of open technologies: frameworks (Tensorflow / PyTorch), libraries (MIOpen / Blas / RCCL), programming model (HIP), inter-connect (OCD) and up streamed Linux® Kernel support – the platform is continually optimized for performance and extensibility.",
"qas": [
{
"question": "What is ROCm?",
"id": "1"
},
{
"question": "Which frameworks does ROCm support?",
"id": "2"
},
{
"question": "What is ROCm built for?",
"id": "3"
}
]
}
],
"title": "AMD ROCm"
}
]
}
```
## Preprocessing
Before the text data can be fed into the model, the following preprocessing is required:
1. Read the json file and collect the text data into lists.
2. Data restructuring: concatenate the question and the context into a single input sequence.
### Reading the json file
When reading the json file, the context (context) is handled first: a for loop turns the context string into individual word tokens stored in the doc_tokens list. Next, the question text (question) and its corresponding id are taken. Finally, the context, the original question and related fields are stored in a list of SquadExample objects for the later restructuring step.
```python
# Read the SQuAD json file into a list of SquadExample objects
def read_squad_examples(input_file):
    with open(input_file, "r") as f:
        input_data = json.load(f)["data"]
    examples = []
    for idx, entry in enumerate(input_data):
        for paragraph in entry["paragraphs"]:
            # Get the context text and store it in the doc_tokens list
            paragraph_text = paragraph["context"]
            doc_tokens = []
            prev_is_whitespace = True
            # Turn the context string into individual word tokens stored in doc_tokens
            for c in paragraph_text:
                if is_whitespace(c):  # True when c is whitespace, otherwise False
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:  # append the current character as a new token
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c  # append the current character to the last token in doc_tokens
                    prev_is_whitespace = False
            # Get the question text and its id
            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                # Store the context and the original question in a SquadExample
                example = SquadExample(qas_id=qas_id,
                                       question_text=question_text,
                                       doc_tokens=doc_tokens,
                                       orig_answer_text=orig_answer_text,
                                       start_position=start_position,
                                       end_position=end_position)
                examples.append(example)
    return examples
```
### Data restructuring
After the text has been read from the json file, the data is restructured: the question and the context are concatenated into a single sequence that is fed into the model for inference. Restructuring consists of two steps:
1. Sliding window: as shown in the figure below, when the question plus context exceeds 256 tokens, a sliding window is used to build the input sequences.
<img src="./Images/Bert_03.png" style="zoom:80%;" align=middle>
As the figure shows, the question does not take part in the sliding; only the context is processed by the sliding window and cut into several sub-texts, which are used in the subsequent concatenation step.
The implementation is shown below. A while loop checks whether the start position of the current sub-text still lies inside the context; inside the loop the start_offset variable gives the start of the sub-text and the length variable its length, and both are stored in the doc_spans list. With the stored start position and the corresponding length, each sub-text can then be recovered.
```Python
def convert_examples_to_features(examples, tokenizer, max_seq_length, doc_stride, max_query_length):
    ...
    # When the context is longer than the allowed maximum, use the sliding window method.
    doc_spans = []
    start_offset = 0
    while start_offset < len(all_doc_tokens):
        length = len(all_doc_tokens) - start_offset  # length of the remaining context from the current offset
        if length > max_tokens_for_doc:
            length = max_tokens_for_doc
        doc_spans.append(_DocSpan(start=start_offset, length=length))  # store the start index and span length
        if start_offset + length == len(all_doc_tokens):  # the end of the context is reached, stop sliding
            break
        start_offset += min(length, doc_stride)  # advance the start offset for the next window
    ...
```
2. Data concatenation: the question and the context (sub-text) obtained above are concatenated into a single sequence, as illustrated in the figure below:
<img src="../Images/Bert_02.png" style="zoom:80%;" align=middle>
As the figure shows, the sequence is built by concatenating the question and the context: it starts with [CLS] to indicate that the question follows, and [SEP] symbols are used in the middle and at the end. Here "[CLS]" is a classification marker indicating that what follows is the question text, and "[SEP]" is a separator used to split the question from the context.
```Python
def convert_examples_to_features(examples, tokenizer, max_seq_length, doc_stride, max_query_length):
    ...
    # Concatenate the question and the context
    for (doc_span_index, doc_span) in enumerate(doc_spans):
        ...
        tokens.append("[CLS]")  # add the [CLS] marker at the start of the tokens list
        segment_ids.append(0)
        for token in query_tokens.tokens:
            tokens.append(token)  # add the question tokens to the tokens list
            segment_ids.append(0)
        tokens.append("[SEP]")  # add the [SEP] marker to the tokens list
        segment_ids.append(0)
        for i in range(doc_span.length):
            split_token_index = doc_span.start + i
            token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
            tokens.append(all_doc_tokens[split_token_index])  # add the context tokens to the tokens list
            segment_ids.append(1)
        tokens.append("[SEP]")  # add the closing [SEP] marker after the context
        segment_ids.append(1)
        for token in tokens:
            input_ids.append(tokenizer.token_to_id(token))  # convert the concatenated tokens into numeric ids
        ...
```
## Inference
After preprocessing, inference can be run to obtain the results.
```python
for idx in range(0, n):
    item = eval_examples[idx]
    # Run inference
    result = dcu_session.run(None, {
        "unique_ids_raw_output___9:0":
            np.array([item.qas_id], dtype=np.int64),  # position id
        "input_ids:0":
            input_ids[idx:idx + bs],  # token ids, the text converted into numeric data
        "input_mask:0":
            input_mask[idx:idx + bs],  # mask
        "segment_ids:0":
            segment_ids[idx:idx + bs]  # segment ids, distinguishing the context from the question
    })
    in_batch = result[1].shape[0]
    npresule1 = np.array(result[0])
    npresule2 = np.array(result[1])
    start_logits = [float(x) for x in npresule1.flatten()]  # probability values of the answer start positions
    end_logits = [float(x) for x in npresule2.flatten()]  # probability values of the answer end positions
    for i in range(0, in_batch):
        unique_id = len(all_results)
        all_results.append(RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits))
```
1. Preprocessing yields the input data: position id, Token id (the text information), segment id (distinguishing the question from the context) and mask id (the mask, marking the positions the model should actually attend to). They are passed to dcu_session.run({...}) to run inference and obtain the model output. The output contains the predicted probability values for the answer's start and end positions, which are all stored in the all_results list for later postprocessing.
## Postprocessing
The inference result cannot be displayed directly as the answer of the question answering task; as shown in the figure below, further processing is needed to obtain the final prediction.
<img src="./Images/Bert_04.png" style="zoom:80%;" align=middle>
As the figure shows, postprocessing consists of the following steps:
1. Take the predictions: from the inference result, keep the K start positions and K end positions with the highest probability.
2. Filter: discard start and end positions that violate the filter rules.
3. Sort and output: sort the remaining candidates by the sum of the start and end probabilities and take the highest-scoring pair as the final prediction.
Filter rules:
1. The start and end positions must not exceed the length of the input sequence.
2. The start and end positions must lie inside the context.
3. The start position must come before the end position.
The postprocessing code of this example is in the write_predictions() function of Python/NLP/Bert/run_onnx_squad.py; a simplified sketch of the span-selection idea is shown below.
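The following is a rough illustration only, not the actual write_predictions() implementation: the hypothetical pick_answer_span() helper shows steps 1-3 above but omits the token-to-word mapping and the check that the indices fall inside the context.
```python
import numpy as np

def pick_answer_span(start_logits, end_logits, n_best_size=20, max_answer_length=30):
    # 1. Indices of the n_best_size largest start/end logits.
    start_idx = np.argsort(start_logits)[::-1][:n_best_size]
    end_idx = np.argsort(end_logits)[::-1][:n_best_size]
    # 2. Filter: the start must come before the end and the span must not be too long.
    candidates = []
    for s in start_idx:
        for e in end_idx:
            if e < s or e - s + 1 > max_answer_length:
                continue
            candidates.append((s, e, start_logits[s] + end_logits[e]))
    # 3. Sort by the summed logits and return the best (start, end) pair.
    candidates.sort(key=lambda c: c[2], reverse=True)
    return candidates[0][:2] if candidates else None
```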
import numpy as np
import json
import os.path
import tokenizers
import collections
from run_onnx_squad import read_squad_examples, write_predictions, convert_examples_to_features
import onnxruntime as ort
RawResult = collections.namedtuple("RawResult",
["unique_id", "start_logits", "end_logits"])
# Data preprocessing
input_file = '../Resource/inputs_data.json'
# Use read_squad_examples from run_onnx_squad to read the input file and split the text into individual words
eval_examples = read_squad_examples(input_file)
max_seq_length = 256 # maximum length of the input sequence
doc_stride = 256 # sliding window stride
max_query_length = 64 # maximum length of the question
batch_size = 1 # batch size
n_best_size = 20 # number of candidate predictions to keep
max_answer_length = 30 # maximum length of the answer
# Tokenizer
vocab_file = os.path.join('../Resource/uncased_L-12_H-768_A-12', 'vocab.txt')
tokenizer = tokenizers.BertWordPieceTokenizer(vocab_file)
# Use convert_examples_to_features from run_onnx_squad to build the model inputs
input_ids, input_mask, segment_ids, extra_data = convert_examples_to_features(eval_examples, tokenizer, max_seq_length, doc_stride, max_query_length)
# Load the model
print("INFO: Parsing and compiling the model")
sess_options = ort.SessionOptions()
# Set the graph optimization level
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC
# Whether to enable profiling
sess_options.enable_profiling = False
# Create the inference session
dcu_session = ort.InferenceSession("../Resource/bertsquad-10.onnx",sess_options,providers=['ROCMExecutionProvider'],)
input_name=dcu_session.get_inputs()[0].name
n = len(input_ids)
bs = batch_size
all_results = []
for idx in range(0, n):
    item = eval_examples[idx]
    # Run inference
    result = dcu_session.run(None, {
        "unique_ids_raw_output___9:0":
            np.array([item.qas_id], dtype=np.int64),  # position id
        "input_ids:0":
            input_ids[idx:idx + bs],  # token ids, the text converted into numeric data
        "input_mask:0":
            input_mask[idx:idx + bs],  # mask
        "segment_ids:0":
            segment_ids[idx:idx + bs]  # segment ids, distinguishing the context from the question
    })
    in_batch = result[1].shape[0]
    npresule1 = np.array(result[0])
    npresule2 = np.array(result[1])
    start_logits = [float(x) for x in npresule1.flatten()]  # probability values of the answer start positions
    end_logits = [float(x) for x in npresule2.flatten()]  # probability values of the answer end positions
    for i in range(0, in_batch):
        unique_id = len(all_results)
        all_results.append(
            RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits))
# Postprocessing: get the prediction results
output_dir = 'predictions'
os.makedirs(output_dir, exist_ok=True)
output_prediction_file = os.path.join(output_dir, "predictions.json")
output_nbest_file = os.path.join(output_dir, "nbest_predictions.json")
write_predictions(eval_examples, extra_data, all_results, n_best_size,
max_answer_length, True, output_prediction_file,
output_nbest_file)
with open(output_prediction_file) as json_file:
    test_data = json.load(json_file)
    print(json.dumps(test_data, indent=2))
onnxruntime
tokenizers
numpy
import argparse
import collections
import json
import math
import os
import sys
import numpy as np
import onnxruntime as onnxrt
import six
from tokenizers import BertWordPieceTokenizer
from tokenizers import pre_tokenizers
RawResult = collections.namedtuple("RawResult",
["unique_id", "start_logits", "end_logits"])
Feature = collections.namedtuple("Feature", [
"unique_id", "tokens", "example_index", "token_to_orig_map",
"token_is_max_context"
])
class SquadExample(object):
def __init__(self,
qas_id,
question_text,
doc_tokens,
orig_answer_text=None,
start_position=None,
end_position=None):
self.qas_id = qas_id
self.question_text = question_text
self.doc_tokens = doc_tokens
self.orig_answer_text = orig_answer_text
self.start_position = start_position
self.end_position = end_position
def __str__(self):
return self.__repr__()
def __repr__(self):
s = []
s.append("qas_id: %s" % (self.qas_id))
s.append("question_text: %s" % (self.question_text))
s.append("doc_tokens: [%s]" % (" ".join(self.doc_tokens)))
if self.start_position:
s.append("start_position: %d" % (self.start_position))
if self.start_position:
s.append("end_position: %d" % (self.end_position))
return ", ".join(s)
def check_is_max_context(doc_spans, cur_span_index, position):
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span.start + doc_span.length - 1
if position < doc_span.start:
continue
if position > end:
continue
num_left_context = position - doc_span.start
num_right_context = end - position
score = min(num_left_context,
num_right_context) + 0.01 * doc_span.length
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
def convert_examples_to_features(examples, tokenizer, max_seq_length,
doc_stride, max_query_length):
res_input_ids = []
res_input_mask = []
res_segment_ids = []
extra = []
unique_id = 0
for (example_index, example) in enumerate(examples):
# Process the original question text
query_tokens = tokenizer.encode(example.question_text)
if len(query_tokens) > max_query_length:
query_tokens = query_tokens[0:max_query_length]
# Process the context text
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.encode(token, add_special_tokens=False)
for sub_token in sub_tokens.tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
# When the context is longer than the allowed maximum, use the sliding window method.
_DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
doc_spans = []
start_offset = 0
while start_offset < len(all_doc_tokens):
length = len(all_doc_tokens) - start_offset
if length > max_tokens_for_doc:
length = max_tokens_for_doc
doc_spans.append(_DocSpan(start=start_offset, length=length))
if start_offset + length == len(all_doc_tokens): # stop sliding once the end of the context is reached
break
start_offset += min(length, doc_stride)
# Concatenate the original question and the context
for (doc_span_index, doc_span) in enumerate(doc_spans):
tokens = []
token_to_orig_map = {}
token_is_max_context = {}
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in query_tokens.tokens:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
for i in range(doc_span.length):
split_token_index = doc_span.start + i
token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
is_max_context = check_is_max_context(doc_spans,
doc_span_index,
split_token_index)
token_is_max_context[len(tokens)] = is_max_context
tokens.append(all_doc_tokens[split_token_index])
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
input_ids = []
for token in tokens:
input_ids.append(tokenizer.token_to_id(token))
# A mask value of 1 marks a real token, 0 marks padding.
input_mask = [1] * len(input_ids)
# Zero-pad the sequence up to max_seq_length when it is shorter
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
res_input_ids.append(np.array(input_ids, dtype=np.int64))
res_input_mask.append(np.array(input_mask, dtype=np.int64))
res_segment_ids.append(np.array(segment_ids, dtype=np.int64))
feature = Feature(unique_id=unique_id,
tokens=tokens,
example_index=example_index,
token_to_orig_map=token_to_orig_map,
token_is_max_context=token_is_max_context)
extra.append(feature)
unique_id += 1
return np.array(res_input_ids), np.array(res_input_mask), np.array(
res_segment_ids), extra
# Read the SQuAD json file into a list of SquadExample objects
def read_squad_examples(input_file):
with open(input_file, "r") as f:
input_data = json.load(f)["data"]
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
return True
return False
examples = []
for idx, entry in enumerate(input_data):
# Get the context text and store it in the doc_tokens list
for paragraph in entry["paragraphs"]:
paragraph_text = paragraph["context"]
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in paragraph_text:
if is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
doc_tokens.append(c)
else:
doc_tokens[-1] += c
prev_is_whitespace = False
char_to_word_offset.append(len(doc_tokens) - 1)
# Get the original question text and its id
for qa in paragraph["qas"]:
qas_id = qa["id"]
question_text = qa["question"]
start_position = None
end_position = None
orig_answer_text = None
# Store the context and the original question in a SquadExample
example = SquadExample(qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position)
examples.append(example)
return examples
def write_predictions(all_examples, all_features, all_results, n_best_size,
max_answer_length, do_lower_case, output_prediction_file,
output_nbest_file):
example_index_to_features = collections.defaultdict(list)
for feature in all_features:
example_index_to_features[feature.example_index].append(feature)
unique_id_to_result = {}
for result in all_results:
unique_id_to_result[result.unique_id] = result
_PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name
"PrelimPrediction", [
"feature_index", "start_index", "end_index", "start_logit",
"end_logit"
])
all_predictions = collections.OrderedDict()
all_nbest_json = collections.OrderedDict()
for (example_index, example) in enumerate(all_examples):
features = example_index_to_features[example_index]
prelim_predictions = []
for (feature_index, feature) in enumerate(features):
# Take the top n_best_size predicted probability values
if not feature.unique_id in unique_id_to_result:
print("feature not in unique_Id", feature.unique_id)
continue
result = unique_id_to_result[feature.unique_id]
start_indexes = get_best_indexes(result.start_logits, n_best_size)
end_indexes = get_best_indexes(result.end_logits, n_best_size)
# Filter: discard start and end indices that violate the rules
for start_index in start_indexes:
for end_index in end_indexes:
if start_index >= len(feature.tokens):
continue
if end_index >= len(feature.tokens):
continue
if start_index not in feature.token_to_orig_map:
continue
if end_index not in feature.token_to_orig_map:
continue
if not feature.token_is_max_context.get(
start_index, False):
continue
if end_index < start_index:
continue
length = end_index - start_index + 1
if length > max_answer_length:
continue
prelim_predictions.append(
_PrelimPrediction(
feature_index=feature_index,
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[start_index],
end_logit=result.end_logits[end_index]))
# Sort so that the pair with the largest sum of start and end probabilities comes first
prelim_predictions = sorted(prelim_predictions,
key=lambda x:
(x.start_logit + x.end_logit),
reverse=True)
_NbestPrediction = collections.namedtuple(
"NbestPrediction", ["text", "start_logit", "end_logit"])
seen_predictions = {}
nbest = []
for pred in prelim_predictions:
# Keep the n_best_size results with the highest probability
if len(nbest) >= n_best_size:
break
feature = features[pred.feature_index]
tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
orig_doc_start = feature.token_to_orig_map[pred.start_index]
orig_doc_end = feature.token_to_orig_map[pred.end_index]
orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
tok_text = " ".join(tok_tokens)
# De-tokenize split word pieces and strip leading/trailing whitespace
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "")
tok_text = tok_text.strip()
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
nbest.append(
_NbestPrediction(text=orig_text,
start_logit=pred.start_logit,
end_logit=pred.end_logit))
if not nbest:
nbest.append(
_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
assert len(nbest) >= 1
total_scores = []
for entry in nbest:
total_scores.append(entry.start_logit + entry.end_logit)
probs = compute_softmax(total_scores)
nbest_json = []
for (i, entry) in enumerate(nbest):
output = collections.OrderedDict()
output["text"] = entry.text # 获取对应的文本
output["probability"] = probs[i] # 预测概率值
output["start_logit"] = float(entry.start_logit) # 开始位置的概率值
output["end_logit"] = float(entry.end_logit) # 结束位置的概率值
nbest_json.append(output)
all_predictions[example.qas_id] = nbest_json[0]["text"] # take the text with the highest probability as the final answer
all_nbest_json[example.qas_id] = nbest_json
with open(output_prediction_file, "w") as writer:
writer.write(json.dumps(all_predictions, indent=4) + "\n")
with open(output_nbest_file, "w") as writer:
writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
# Sort the logits and take the indices of the top n_best_size values
def get_best_indexes(logits, n_best_size):
index_and_score = sorted(enumerate(logits),
key=lambda x: x[1],
reverse=True)
best_indexes = []
for i in range(len(index_and_score)):
if i >= n_best_size:
break
best_indexes.append(index_and_score[i][0])
return best_indexes
# Compute the softmax
def compute_softmax(scores):
if not scores:
return []
max_score = None
for score in scores:
if max_score is None or score > max_score:
max_score = score
exp_scores = []
total_sum = 0.0
for score in scores:
x = math.exp(score - max_score)
exp_scores.append(x)
total_sum += x
probs = []
for score in exp_scores:
probs.append(score / total_sum)
return probs
# Bidirectional Encoder Representations from Transformers (BERT)
## Model introduction
BERT, short for Bidirectional Encoder Representations from Transformers, is a pre-trained language representation model. Instead of pre-training with a traditional unidirectional language model, or a shallow concatenation of two unidirectional language models, as in earlier work, it uses a masked language model (MLM) objective so that it can produce deep bidirectional language representations.
## Model structure
Earlier pre-trained models were constrained by unidirectional language models (left-to-right or right-to-left), which limited their representational power: they could only capture context in one direction. BERT instead pre-trains with MLM and builds the whole model from deep bidirectional Transformer components (a unidirectional Transformer is usually called a Transformer decoder, where each token only attends to the tokens to its left, while a bidirectional Transformer is called a Transformer encoder, where each token attends to all tokens). As a result it produces deep bidirectional language representations that fuse left and right context.
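As a purely conceptual illustration (not part of this repository), the difference between the two attention patterns can be written down as attention masks; the snippet below is a minimal numpy sketch where 1 means "may attend" and 0 means "may not":
```python
import numpy as np

seq_len = 6
# Unidirectional (Transformer decoder): token i may only attend to positions <= i,
# i.e. a lower-triangular mask.
decoder_mask = np.tril(np.ones((seq_len, seq_len), dtype=np.int64))
# Bidirectional (Transformer encoder, used by BERT): every token may attend to every token.
encoder_mask = np.ones((seq_len, seq_len), dtype=np.int64)
print(decoder_mask)
print(encoder_mask)
```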
## Python inference
This example uses the classic Bert model for a question answering task. The model and tokenization files can be downloaded from https://pan.baidu.com/s/1yc30IzM4ocOpTpfFuUMR0w (extraction code: 8f1a); save the bertsquad-10.onnx file and the uncased_L-12_H-768_A-12 tokenization files in the Resource/ folder. The steps below describe how to run the Python example; a detailed walkthrough is given in Tutorial_Python.md in the Doc directory.
### Download the image
Pull the image from the SourceFind (光源) registry:
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:ort1.14.0_migraphx3.0.0-dtk22.10.1
```
### Set the Python environment variable
```
export PYTHONPATH=/opt/dtk/lib:$PYTHONPATH
```
### Install dependencies
```
# Enter the bert ort project root directory
cd <path_to_bert_ort>
# Enter the example directory
cd Python/
# Install the dependencies
pip install -r requirements.txt
```
### Run the example
```
python bert.py
```
The output is:
```
"1":"open-source exascale-class platform for accelerated computing",
"2":"(Tensorflow / PyTorch)",
"3":"scale"
```
In the output, each question id maps to the answer with the highest predicted probability.
## C++ inference
This example uses the classic Bert model for a question answering task. The model and tokenization files can be downloaded from https://pan.baidu.com/s/1yc30IzM4ocOpTpfFuUMR0w (extraction code: 8f1a); save the bertsquad-10.onnx file and the uncased_L-12_H-768_A-12 tokenization files in the Resource/ folder. The steps below describe how to run the C++ example; a detailed walkthrough is given in Tutorial_Cpp.md in the Doc directory.
### Download the image
Pull the image from the SourceFind (光源) registry:
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/custom:ort1.14.0_migraphx3.0.0-dtk22.10.1
```
### Build the project
```
rbuild build -d depend
```
### Set environment variables
Add the dependency library directory to the LD_LIBRARY_PATH environment variable by appending the following line to ~/.bashrc:
```
export LD_LIBRARY_PATH=<path_to_bert_ort>/depend/lib64/:$LD_LIBRARY_PATH
```
Then run:
```
source ~/.bashrc
source /opt/dtk/env.sh
```
### Run the example
```
# Enter the bert ort project root directory
cd <path_to_bert_ort>
# Enter the build directory
cd build/
# Run the example program
./Bert
```
As shown below, type a question at the prompt and the predicted answer is returned.
```
question:What is ROCm?
answer:open-source exascale-class platform for accelerated computing
question:Which frameworks does ROCm support?
answer:tensorflow / pytorch
question:What is ROCm built for?
answer:scale
```
## Source repository and issue reporting
https://developer.hpccube.com/codes/modelzoo/bert_ort
## References
https://github.com/ROCmSoftwarePlatform/onnxruntime/blob/81120e9e8b377567daa00d55614c902f35b2ae8f/onnxruntime/python/tools/transformers/onnx_model_bert.py
{
"data": [
{
"paragraphs": [
{
"context": "ROCm is the first open-source exascale-class platform for accelerated computing that’s also programming-language independent. It brings a philosophy of choice, minimalism and modular software development to GPU computing. You are free to choose or even develop tools and a language run time for your application. ROCm is built for scale, it supports multi-GPU computing and has a rich system run time with the critical features that large-scale application, compiler and language-run-time development requires. Since the ROCm ecosystem is comprised of open technologies: frameworks (Tensorflow / PyTorch), libraries (MIOpen / Blas / RCCL), programming model (HIP), inter-connect (OCD) and up streamed Linux® Kernel support – the platform is continually optimized for performance and extensibility.",
"qas": [
{
"question": "What is ROCm?",
"id": "1"
},
{
"question": "Which frameworks does ROCm support?",
"id": "2"
},
{
"question": "What is ROCm built for?",
"id": "3"
}
]
}
],
"title": "AMD ROCm"
}
]
}
#include <Bert.h>
#include <onnxruntime/core/session/onnxruntime_cxx_api.h>
#include <Filesystem.h>
#include <SimpleLog.h>
#include <algorithm>
#include <stdexcept>
#include <tokenization.h>
namespace ortSamples
{
Bert::Bert()
{
}
Bert::~Bert()
{
}
ErrorCode Bert::Initialize()
{
// Get the model file
std::string modelPath="../Resource/bertsquad-10.onnx";
// Check that the model file exists
if(Exists(modelPath)==false)
{
LOG_ERROR(stdout,"%s not exist!\n",modelPath.c_str());
return MODEL_NOT_EXIST;
}
// Load the model
OrtROCMProviderOptions rocm_options;
rocm_options.device_id = 0;
sessionOptions.AppendExecutionProvider_ROCM(rocm_options);
sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_BASIC);
LOG_INFO(stdout,"succeed to load model: %s\n",GetFileName(modelPath).c_str());
session = new Ort::Session(env, modelPath.c_str(), sessionOptions);
LOG_INFO(stdout,"succeed to compile model: %s\n",GetFileName(modelPath).c_str());
return SUCCESS;
}
ErrorCode Bert::Inference(const std::vector<std::vector<long unsigned int>> &input_ids,
const std::vector<std::vector<long unsigned int>> &input_masks,
const std::vector<std::vector<long unsigned int>> &segment_ids,
std::vector<float> &start_position,
std::vector<float> &end_position)
{
// Store the preprocessed data
int num = input_ids.size();
long unsigned int input_id[num][256];
long unsigned int input_mask[num][256];
long unsigned int segment_id[num][256];
long unsigned int position_id[num][1];
for(int i=0;i<input_ids.size();++i)
{
for(int j=0;j<input_ids[0].size();++j)
{
input_id[i][j] = input_ids[i][j];
segment_id[i][j] = segment_ids[i][j];
input_mask[i][j] = input_masks[i][j];
position_id[i][0] = 1;
}
}
// Set the model input node names
std::vector<const char*> input_node_names = {"unique_ids_raw_output___9:0","segment_ids:0","input_mask:0","input_ids:0"};
// Set the model output node names
std::vector<const char*> output_node_names = {"unstack:1","unstack:0","unique_ids:0"};
// Set the input shapes
std::array<int64_t, 1> inputShapes1{1};
std::array<int64_t, 2> inputShapes2{1, 256};
std::array<int64_t, 2> inputShapes3{1, 256};
std::array<int64_t, 2> inputShapes4{1, 256};
std::vector<float> inputTensorValues1(1);
std::vector<float> inputTensorValues2(256);
std::vector<float> inputTensorValues3(256);
std::vector<float> inputTensorValues4(256);
auto memoryInfo1 = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
auto memoryInfo2 = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
auto memoryInfo3 = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
auto memoryInfo4 = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
float* start_data;
float* end_data;
for(int i=0;i<input_ids.size();++i)
{
// Create the input data
int64_t* input_test1 = (int64_t*)position_id[i];
int64_t* input_test2 = (int64_t*)segment_id[i];
int64_t* input_test3 = (int64_t*)input_mask[i];
int64_t* input_test4 = (int64_t*)input_id[i];
Ort::Value inputTensor1 = Ort::Value::CreateTensor<int64_t>(
memoryInfo1,
input_test1,
inputTensorValues1.size(),
inputShapes1.data(),
inputShapes1.size());
Ort::Value inputTensor2 = Ort::Value::CreateTensor<int64_t>(
memoryInfo2,
input_test2,
inputTensorValues2.size(),
inputShapes2.data(),
inputShapes2.size());
Ort::Value inputTensor3 = Ort::Value::CreateTensor<int64_t>(
memoryInfo3,
input_test3,
inputTensorValues3.size(),
inputShapes3.data(),
inputShapes3.size());
Ort::Value inputTensor4 = Ort::Value::CreateTensor<int64_t>(
memoryInfo4,
input_test4,
inputTensorValues4.size(),
inputShapes4.data(),
inputShapes4.size());
std::vector<Ort::Value> intput_tensors;
intput_tensors.push_back(std::move(inputTensor1));
intput_tensors.push_back(std::move(inputTensor2));
intput_tensors.push_back(std::move(inputTensor3));
intput_tensors.push_back(std::move(inputTensor4));
// Run inference
auto output_tensors = session->Run(Ort::RunOptions{nullptr}, input_node_names.data(), intput_tensors.data(), input_node_names.size(), output_node_names.data(), output_node_names.size());
// Get the output tensors
const float* start_data = output_tensors[1].GetTensorMutableData<float>(); // data pointer for the start positions
const float* end_data = output_tensors[0].GetTensorMutableData<float>(); // data pointer for the end positions
// Save the inference results
for(int i=0;i<256;++i)
{
start_position.push_back(start_data[i]);
end_position.push_back(end_data[i]);
}
}
return SUCCESS;
}
ErrorCode Bert::Preprocessing(cuBERT::FullTokenizer tokenizer,
int batch_size,
int max_seq_length,
const char *text,
char *question,
std::vector<std::vector<long unsigned int>> &input_ids,
std::vector<std::vector<long unsigned int>> &input_masks,
std::vector<std::vector<long unsigned int>> &segment_ids)
{
std::vector<long unsigned int> input_id(max_seq_length);
std::vector<long unsigned int> input_mask(max_seq_length);
std::vector<long unsigned int> segment_id(max_seq_length);
// Tokenize the context and the question
tokens_text.reserve(max_seq_length);
tokens_question.reserve(max_seq_length);
tokenizer.tokenize(text, &tokens_text, max_seq_length);
tokenizer.tokenize(question, &tokens_question, max_seq_length);
// When the combined length of the context and the question exceeds the allowed maximum, use the sliding window
if(tokens_text.size() + tokens_question.size() > max_seq_length - 5)
{
int windows_len = max_seq_length - 5 -tokens_question.size();
std::vector<std::string> tokens_text_window(windows_len);
std::vector<std::vector<std::string>> tokens_text_windows;
int start_offset = 0;
int position = 0;
int n;
while (start_offset < tokens_text.size())
{
n = 0;
if(start_offset+windows_len>tokens_text.size())
{
for(int i=start_offset;i<tokens_text.size();++i)
{
tokens_text_window[n] = tokens_text[i];
++n;
}
}
else
{
for(int i=start_offset;i<start_offset+windows_len;++i)
{
tokens_text_window[n] = tokens_text[i];
++n;
}
}
tokens_text_windows.push_back(tokens_text_window);
start_offset += 256;
++position;
}
for(int i=0;i<position;++i)
{
input_id[0] = tokenizer.convert_token_to_id("[CLS]");
segment_id[0] = 0;
input_id[1] = tokenizer.convert_token_to_id("[CLS]");
segment_id[1] = 0;
for (int j=0;j<tokens_question.size();++j)
{
input_id[j + 2] = tokenizer.convert_token_to_id(tokens_question[j]);
segment_id[j + 2] = 0;
}
input_id[tokens_question.size() + 2] = tokenizer.convert_token_to_id("[SEP]");
segment_id[tokens_question.size() + 2] = 0;
input_id[tokens_question.size() + 3] = tokenizer.convert_token_to_id("[SEP]");
segment_id[tokens_question.size() + 3] = 0;
for (int j=0;j<tokens_question.size();++j)
{
input_id[j + tokens_text_windows[i].size() + 4] = tokenizer.convert_token_to_id(tokens_text_windows[i][j]);
segment_id[j + tokens_text_windows[i].size() + 4] = 1;
}
input_id[tokens_question.size() + tokens_text_windows[i].size() + 4] = tokenizer.convert_token_to_id("[SEP]");
segment_id[tokens_question.size() + tokens_text_windows[i].size() + 4] = 1;
// A mask value of 1 marks a real token, 0 marks padding.
int len = tokens_text_windows[i].size() + tokens_question.size() + 5;
std::fill(input_mask.begin(), input_mask.begin() + len, 1);
std::fill(input_mask.begin() + len, input_mask.begin() + max_seq_length, 0);
std::fill(input_id.begin() + len, input_id.begin() + max_seq_length, 0);
std::fill(segment_id.begin() + len, segment_id.begin() + max_seq_length, 0);
input_ids.push_back(input_id);
input_masks.push_back(input_mask);
segment_ids.push_back(segment_id);
}
}
else
{
// When the combined length of the context and the question does not exceed the maximum, concatenate directly
input_id[0] = tokenizer.convert_token_to_id("[CLS]");
segment_id[0] = 0;
input_id[1] = tokenizer.convert_token_to_id("[CLS]");
segment_id[1] = 0;
for (int i=0;i<tokens_question.size();++i)
{
input_id[i + 2] = tokenizer.convert_token_to_id(tokens_question[i]);
segment_id[i + 2] = 0;
}
input_id[tokens_question.size() + 2] = tokenizer.convert_token_to_id("[SEP]");
segment_id[tokens_question.size() + 2] = 0;
input_id[tokens_question.size() + 3] = tokenizer.convert_token_to_id("[SEP]");
segment_id[tokens_question.size() + 3] = 0;
for (int i=0;i<tokens_text.size();++i)
{
input_id[i + tokens_question.size() + 4] = tokenizer.convert_token_to_id(tokens_text[i]);
segment_id[i + tokens_question.size() + 4] = 1;
}
input_id[tokens_question.size() + tokens_text.size() + 4] = tokenizer.convert_token_to_id("[SEP]");
segment_id[tokens_question.size() + tokens_text.size() + 4] = 1;
// A mask value of 1 marks a real token, 0 marks padding.
int len = tokens_text.size() + tokens_question.size() + 5;
std::fill(input_mask.begin(), input_mask.begin() + len, 1);
std::fill(input_mask.begin() + len, input_mask.begin() + max_seq_length, 0);
std::fill(input_id.begin() + len, input_id.begin() + max_seq_length, 0);
std::fill(segment_id.begin() + len, segment_id.begin() + max_seq_length, 0);
input_ids.push_back(input_id);
input_masks.push_back(input_mask);
segment_ids.push_back(segment_id);
}
return SUCCESS;
}
static bool Compare(Sort_st a, Sort_st b)
{
return a.value > b.value;
}
static bool CompareM(ResultOfPredictions a, ResultOfPredictions b)
{
return a.start_predictionvalue + a.end_predictionvalue > b.start_predictionvalue + b.end_predictionvalue;
}
ErrorCode Bert::Postprocessing(int n_best_size,
int max_answer_length,
const std::vector<float> &start_position,
const std::vector<float> &end_position,
std::string &answer)
{
// Take the indices of the n_best_size largest probability values
std::vector<Sort_st> start_array(start_position.size());
std::vector<Sort_st> end_array(end_position.size());
for (int i=0;i<start_position.size();++i)
{
start_array[i].index = i;
start_array[i].value = start_position.at(i);
end_array[i].index = i;
end_array[i].value = end_position.at(i);
}
std::sort(start_array.begin(), start_array.end(), Compare);
std::sort(end_array.begin(), end_array.end(), Compare);
// Filter: discard indices that violate the rules
std::vector<ResultOfPredictions> resultsOfPredictions(400);
int num = start_position.size() / 256;
bool flag;
int n=0;
for(int i=0;i<n_best_size;++i)
{
for(int j=0;j<n_best_size;++j)
{
flag = false;
if(start_array[i].index > start_position.size())
{
continue;
}
if(end_array[j].index > end_position.size())
{
continue;
}
for(int t=0;t<num;++t)
{
if(start_array[i].index > t*256 && start_array[i].index < tokens_question.size()+4+t*256)
{
flag = true;
break;
}
if(end_array[j].index > t*256 && end_array[j].index < tokens_question.size()+4+t*256)
{
flag = true;
break;
}
}
if(flag)
{
continue;
}
if(start_array[i].index > end_array[j].index)
{
continue;
}
int length = end_array[j].index - start_array[i].index + 1;
if(length > max_answer_length)
{
continue;
}
resultsOfPredictions[n].start_index = start_array[i].index;
resultsOfPredictions[n].end_index = end_array[j].index;
resultsOfPredictions[n].start_predictionvalue = start_array[i].value;
resultsOfPredictions[n].end_predictionvalue = end_array[j].value;
++n;
}
}
// Sort so that the pair with the largest sum of start and end probabilities comes first
std::sort(resultsOfPredictions.begin(), resultsOfPredictions.end(), CompareM);
int start_index = 0;
int end_index = 0;
for(int i=0;i<400;++i)
{
if(resultsOfPredictions[i].start_predictionvalue==0 && resultsOfPredictions[i].end_predictionvalue==0)
{
continue;
}
start_index = resultsOfPredictions[i].start_index;
end_index = resultsOfPredictions[i].end_index;
break;
}
// Map back to the index in the context (current index - question length - 4)
int answer_start_index = start_index - tokens_question.size()- 4;
int answer_end_index = end_index - tokens_question.size() - 4 + 1;
// Use the start and end indices to collect the tokens in between
int j=0;
for(int i=answer_start_index;i<answer_end_index;++i)
{
if(tokens_text[i].find('#') != -1)
{
j=i-1;
break;
}
}
for(int i=answer_start_index;i<answer_end_index;++i)
{
answer += tokens_text[i];
if(tokens_text[i].find('#') != -1 || i==j)
{
continue;
}
answer += " ";
}
int index = 0;
while( (index = answer.find('#',index)) != std::string::npos)
{
answer.erase(index,1);
}
tokens_text.clear();
tokens_question.clear();
return SUCCESS;
}
}
#ifndef __BERT_H__
#define __BERT_H__
#include <cstdint>
#include <string>
#include <onnxruntime/core/session/onnxruntime_cxx_api.h>
#include <tokenization.h>
namespace ortSamples
{
typedef enum _ErrorCode
{
SUCCESS=0,
MODEL_NOT_EXIST,
CONFIG_FILE_NOT_EXIST,
FAIL_TO_LOAD_MODEL,
FAIL_TO_OPEN_CONFIG_FILE,
}ErrorCode;
typedef struct _Sort_st
{
int index;
float value;
}Sort_st;
typedef struct _ResultOfPredictions
{
int start_index;
int end_index;
float start_predictionvalue;
float end_predictionvalue;
}ResultOfPredictions;
class Bert
{
public:
Bert();
~Bert();
ErrorCode Initialize();
ErrorCode Inference(const std::vector<std::vector<long unsigned int>> &input_ids,
const std::vector<std::vector<long unsigned int>> &input_masks,
const std::vector<std::vector<long unsigned int>> &segment_ids,
std::vector<float> &start_position,
std::vector<float> &end_position);
ErrorCode Preprocessing(cuBERT::FullTokenizer tokenizer,
int batch_size,
int max_seq_length,
const char *text,
char *question,
std::vector<std::vector<long unsigned int>> &input_ids,
std::vector<std::vector<long unsigned int>> &input_masks,
std::vector<std::vector<long unsigned int>> &segment_ids);
ErrorCode Postprocessing(int n_best_size,
int max_answer_length,
const std::vector<float> &start_position,
const std::vector<float> &end_position,
std::string &answer);
private:
std::vector<std::string> tokens_text;
std::vector<std::string> tokens_question;
Ort::Session *session;
Ort::Env env = Ort::Env(ORT_LOGGING_LEVEL_ERROR, "ONNXRuntime");
Ort::SessionOptions sessionOptions = Ort::SessionOptions();
};
}
#endif