Unverified Commit a2e33148 authored by Cagri Eryilmaz's avatar Cagri Eryilmaz Committed by GitHub
Browse files

BERT SQuAD Example - Updates (#815)



* update to readme for pip3 upgrade, needed for tf2.4. requirement file change. initial commit for bert example update

* remove tokenization.py file

* added tokenizers and bertwordpiecetokenizer to main file

* added tokenizers to requirements file

* changes to run_onnx_squad after importing tokenizers module, to replace py file

* additional post processing tokenizer change in run_onnx_squad.py

* changes to notebook after tokenizers

* cleanup notebook output cells

* typo in readme

* formatting on py file
Co-authored-by: mvermeulen <5479696+mvermeulen@users.noreply.github.com>
parent e5bfdd72
...@@ -43,7 +43,7 @@ ...@@ -43,7 +43,7 @@
"from os import path\n", "from os import path\n",
"import sys\n", "import sys\n",
"\n", "\n",
"import tokenization\n", "import tokenizers\n",
"from run_onnx_squad import *\n", "from run_onnx_squad import *\n",
"\n", "\n",
"import migraphx" "import migraphx"
...@@ -137,8 +137,7 @@ ...@@ -137,8 +137,7 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"vocab_file = os.path.join('uncased_L-12_H-768_A-12', 'vocab.txt')\n", "vocab_file = os.path.join('uncased_L-12_H-768_A-12', 'vocab.txt')\n",
"tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,\n", "tokenizer = tokenizers.BertWordPieceTokenizer(vocab_file)"
" do_lower_case=True)"
] ]
}, },
{ {
......
...@@ -7,21 +7,25 @@ There are two ways to run the example: ...@@ -7,21 +7,25 @@ There are two ways to run the example:
# Steps # Steps
1) Install MIGraphX to your environment. Please follow the steps to build MIGraphX given at https://github.com/ROCmSoftwarePlatform/AMDMIGraphX 1) Install MIGraphX to your environment. Please follow the steps to build MIGraphX given at https://github.com/ROCmSoftwarePlatform/AMDMIGraphX
2) Install the requirements file 2) Upgrade your pip3 to latest version
```
pip3 install --upgrade pip
```
3) Install the requirements file
``` ```
pip3 install -r requirements_bertsquad.txt pip3 install -r requirements_bertsquad.txt
``` ```
3) Install `unzip` and fetch the uncased file (vocabulary): 4) Install `unzip` and fetch the uncased file (vocabulary):
``` ```
apt-get install unzip apt-get install unzip
wget -q https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip wget -q https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
unzip uncased_L-12_H-768_A-12.zip unzip uncased_L-12_H-768_A-12.zip
``` ```
4) Get BERT ONNX model (bertsquad-10.onnx): 5) Get BERT ONNX model (bertsquad-10.onnx):
``` ```
wget https://github.com/onnx/models/raw/master/text/machine_comprehension/bert-squad/model/bertsquad-10.onnx wget https://github.com/onnx/models/raw/master/text/machine_comprehension/bert-squad/model/bertsquad-10.onnx
``` ```
5) Run the inference, it will compile and run the model on three questions and small data provided in `inputs.json`: 6) Run the inference, it will compile and run the model on three questions and small data provided in `inputs.json`:
``` ```
python3 bert-squad-migraphx.py python3 bert-squad-migraphx.py
``` ```
......
...@@ -5,7 +5,7 @@ import os.path ...@@ -5,7 +5,7 @@ import os.path
from os import path from os import path
import sys import sys
import tokenization import tokenizers
from run_onnx_squad import * from run_onnx_squad import *
import migraphx import migraphx
...@@ -30,8 +30,7 @@ n_best_size = 20 ...@@ -30,8 +30,7 @@ n_best_size = 20
max_answer_length = 30 max_answer_length = 30
vocab_file = os.path.join('uncased_L-12_H-768_A-12', 'vocab.txt') vocab_file = os.path.join('uncased_L-12_H-768_A-12', 'vocab.txt')
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, tokenizer = tokenizers.BertWordPieceTokenizer(vocab_file)
do_lower_case=True)
# Use convert_examples_to_features method from run_onnx_squad to get parameters from the input # Use convert_examples_to_features method from run_onnx_squad to get parameters from the input
input_ids, input_mask, segment_ids, extra_data = convert_examples_to_features( input_ids, input_mask, segment_ids, extra_data = convert_examples_to_features(
......
tensorflow==1.14 tensorflow==2.4.0
onnxruntime onnxruntime
\ No newline at end of file tokenizers
\ No newline at end of file
...@@ -38,7 +38,8 @@ from timeit import default_timer as timer ...@@ -38,7 +38,8 @@ from timeit import default_timer as timer
import numpy as np import numpy as np
import onnxruntime as onnxrt import onnxruntime as onnxrt
import six import six
import tokenization from tokenizers import BertWordPieceTokenizer
from tokenizers import pre_tokenizers
RawResult = collections.namedtuple("RawResult", RawResult = collections.namedtuple("RawResult",
["unique_id", "start_logits", "end_logits"]) ["unique_id", "start_logits", "end_logits"])
...@@ -70,9 +71,8 @@ class SquadExample(object): ...@@ -70,9 +71,8 @@ class SquadExample(object):
def __repr__(self): def __repr__(self):
s = [] s = []
s.append("qas_id: %s" % (tokenization.printable_text(self.qas_id))) s.append("qas_id: %s" % (self.qas_id))
s.append("question_text: %s" % s.append("question_text: %s" % (self.question_text))
(tokenization.printable_text(self.question_text)))
s.append("doc_tokens: [%s]" % (" ".join(self.doc_tokens))) s.append("doc_tokens: [%s]" % (" ".join(self.doc_tokens)))
if self.start_position: if self.start_position:
s.append("start_position: %d" % (self.start_position)) s.append("start_position: %d" % (self.start_position))
...@@ -130,7 +130,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, ...@@ -130,7 +130,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
unique_id = 0 unique_id = 0
for (example_index, example) in enumerate(examples): for (example_index, example) in enumerate(examples):
query_tokens = tokenizer.tokenize(example.question_text) query_tokens = tokenizer.encode(example.question_text)
if len(query_tokens) > max_query_length: if len(query_tokens) > max_query_length:
query_tokens = query_tokens[0:max_query_length] query_tokens = query_tokens[0:max_query_length]
...@@ -140,8 +140,8 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, ...@@ -140,8 +140,8 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
all_doc_tokens = [] all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens): for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens)) orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = tokenizer.tokenize(token) sub_tokens = tokenizer.encode(token, add_special_tokens=False)
for sub_token in sub_tokens: for sub_token in sub_tokens.tokens:
tok_to_orig_index.append(i) tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token) all_doc_tokens.append(sub_token)
...@@ -172,7 +172,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, ...@@ -172,7 +172,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
segment_ids = [] segment_ids = []
tokens.append("[CLS]") tokens.append("[CLS]")
segment_ids.append(0) segment_ids.append(0)
for token in query_tokens: for token in query_tokens.tokens:
tokens.append(token) tokens.append(token)
segment_ids.append(0) segment_ids.append(0)
tokens.append("[SEP]") tokens.append("[SEP]")
...@@ -192,7 +192,9 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, ...@@ -192,7 +192,9 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
tokens.append("[SEP]") tokens.append("[SEP]")
segment_ids.append(1) segment_ids.append(1)
input_ids = tokenizer.convert_tokens_to_ids(tokens) input_ids = []
for token in tokens:
input_ids.append(tokenizer.token_to_id(token))
# The mask has 1 for real tokens and 0 for padding tokens. Only real # The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to. # tokens are attended to.
...@@ -437,9 +439,15 @@ def get_final_text(pred_text, orig_text, do_lower_case): ...@@ -437,9 +439,15 @@ def get_final_text(pred_text, orig_text, do_lower_case):
# and `pred_text`, and check if they are the same length. If they are # and `pred_text`, and check if they are the same length. If they are
# NOT the same length, the heuristic has failed. If they are the same # NOT the same length, the heuristic has failed. If they are the same
# length, we assume the characters are one-to-one aligned. # length, we assume the characters are one-to-one aligned.
tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case) tokenizer = pre_tokenizers.Sequence(
[pre_tokenizers.Whitespace(),
pre_tokenizers.Punctuation()])
tok_text = " ".join(tokenizer.tokenize(orig_text)) tok_text = []
for item in tokenizer.pre_tokenize_str(orig_text):
tok_text.append(item[0])
tok_text = " ".join(tok_text)
start_position = tok_text.find(pred_text) start_position = tok_text.find(pred_text)
if start_position == -1: if start_position == -1:
...@@ -559,8 +567,7 @@ def main(): ...@@ -559,8 +567,7 @@ def main():
sess_options = onnxrt.SessionOptions() sess_options = onnxrt.SessionOptions()
sess_options.session_log_verbosity_level = args.log sess_options.session_log_verbosity_level = args.log
tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, tokenizer = BertWordPieceTokenizer(vocab_file)
do_lower_case=True)
eval_examples = read_squad_examples(input_file=args.predict_file) eval_examples = read_squad_examples(input_file=args.predict_file)
input_ids, input_mask, segment_ids, extra_data = \ input_ids, input_mask, segment_ids, extra_data = \
......
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import re
import unicodedata
import six
import tensorflow as tf
def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
    """Raise if `do_lower_case` contradicts the casing implied by the checkpoint.

    The casing is supplied by the user and is not stored with the checkpoint,
    so it is guessed from the name of the directory holding `bert_model.ckpt`.
    Paths that do not match that layout, and directory names that are not one
    of the known model names, are accepted without any check.

    Args:
        do_lower_case: Casing flag supplied by the user.
        init_checkpoint: Checkpoint path; a falsy value disables the check.

    Raises:
        ValueError: If the directory names a known lower-cased model while
            `do_lower_case` is falsy, or a known cased model while it is truthy.
    """
    if not init_checkpoint:
        return

    match = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
    if match is None:
        return
    model_name = match.group(1)

    uncased_names = (
        "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
        "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12",
    )
    cased_names = (
        "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
        "multi_cased_L-12_H-768_A-12",
    )

    if model_name in uncased_names and not do_lower_case:
        actual_flag, case_name, opposite_flag = "False", "lowercased", "True"
    elif model_name in cased_names and do_lower_case:
        actual_flag, case_name, opposite_flag = "True", "cased", "False"
    else:
        # Unknown model name, or the flag already agrees with the model.
        return

    raise ValueError(
        "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
        "However, `%s` seems to be a %s model, so you "
        "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
        "how the model was pre-training. If this error is wrong, please "
        "just comment out this check." %
        (actual_flag, init_checkpoint, model_name, case_name,
         opposite_flag))
def convert_to_unicode(text):
    """Return `text` as a Unicode string, decoding byte strings as UTF-8.

    Bytes that cannot be decoded as UTF-8 are ignored rather than raising.

    Raises:
        ValueError: If `text` is neither a text string nor a byte string, or
            if `six` reports neither Python 2 nor Python 3.
    """
    if not (six.PY3 or six.PY2):
        raise ValueError("Not running on Python2 or Python 3?")

    if six.PY3:
        if isinstance(text, str):
            return text
        if isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
    else:
        # Python 2: `str` is the byte string, `unicode` the text string.
        if isinstance(text, str):
            return text.decode("utf-8", "ignore")
        if isinstance(text, unicode):
            return text

    raise ValueError("Unsupported string type: %s" % (type(text)))
def printable_text(text):
    """Return `text` as the native `str` type of the running interpreter.

    On Python 3 byte strings are decoded as UTF-8 (undecodable bytes are
    ignored); on Python 2 `unicode` values are encoded as UTF-8.  A native
    `str` is returned unchanged on both.

    Raises:
        ValueError: If `text` is neither a text nor a byte string, or if
            `six` reports neither Python 2 nor Python 3.
    """
    if not (six.PY3 or six.PY2):
        raise ValueError("Not running on Python2 or Python 3?")

    # The native `str` is already what print/logging want on either version.
    if isinstance(text, str):
        return text
    if six.PY3 and isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    if six.PY2 and isinstance(text, unicode):
        return text.encode("utf-8")

    raise ValueError("Unsupported string type: %s" % (type(text)))
def load_vocab(vocab_file):
    """Load a vocabulary file into an ordered token -> id dictionary.

    Every line of the file is one token; surrounding whitespace is stripped
    and the token is mapped to its zero-based line number.  If a token occurs
    more than once, the id of its last occurrence wins.

    Args:
        vocab_file: Path of the vocabulary file, opened through
            `tf.io.gfile.GFile`.

    Returns:
        A `collections.OrderedDict` mapping token to integer id, in file order.
    """
    vocab = collections.OrderedDict()
    index = 0
    # `tf.gfile` no longer exists in TensorFlow 2.x; `tf.io.gfile` is the
    # spelling available in both late 1.x releases and 2.x.
    with tf.io.gfile.GFile(vocab_file, "r") as reader:
        while True:
            token = convert_to_unicode(reader.readline())
            if not token:
                # readline() returns an empty string only at end of file.
                break
            vocab[token.strip()] = index
            index += 1
    return vocab
def convert_by_vocab(vocab, items):
    """Map every element of `items` through `vocab` and return the results as a list.

    `vocab` may be a token -> id or an id -> token mapping; a missing element
    raises whatever `vocab[item]` raises.
    """
    return [vocab[item] for item in items]
def convert_tokens_to_ids(vocab, tokens):
    """Return the ids of `tokens`, looked up in the token -> id mapping `vocab`."""
    ids = convert_by_vocab(vocab, tokens)
    return ids
def convert_ids_to_tokens(inv_vocab, ids):
    """Return the tokens for `ids`, looked up in the id -> token mapping `inv_vocab`."""
    tokens = convert_by_vocab(inv_vocab, ids)
    return tokens
def whitespace_tokenize(text):
    """Split `text` on runs of whitespace, ignoring leading and trailing whitespace.

    Returns an empty list for empty or whitespace-only input.
    """
    # str.split() with no separator already yields [] for an empty string.
    return text.strip().split()
class FullTokenizer(object):
    """End-to-end tokenizer: basic tokenization followed by WordPiece splitting."""

    def __init__(self, vocab_file, do_lower_case=True):
        """Load the vocabulary and build both tokenization stages.

        Args:
            vocab_file: Vocabulary file path, passed to `load_vocab`.
            do_lower_case: Forwarded to `BasicTokenizer`.
        """
        vocab = load_vocab(vocab_file)
        self.vocab = vocab
        # Reverse mapping (id -> token) for convert_ids_to_tokens.
        self.inv_vocab = dict((index, token) for token, index in vocab.items())
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=vocab)

    def tokenize(self, text):
        """Return the word pieces of `text` as one flat list."""
        return [
            piece
            for word in self.basic_tokenizer.tokenize(text)
            for piece in self.wordpiece_tokenizer.tokenize(word)
        ]

    def convert_tokens_to_ids(self, tokens):
        """Return the vocabulary ids of `tokens`."""
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        """Return the vocabulary tokens for `ids`."""
        return convert_by_vocab(self.inv_vocab, ids)
class BasicTokenizer(object):
    """Basic tokenizer: text cleanup, optional lower casing, punctuation splitting."""

    def __init__(self, do_lower_case=True):
        """Create the tokenizer.

        Args:
            do_lower_case: Whether tokens are lower-cased (and stripped of
                accents) before punctuation is split off.
        """
        self.do_lower_case = do_lower_case

    def tokenize(self, text):
        """Split `text` into a list of word and punctuation tokens."""
        cleaned = self._clean_text(convert_to_unicode(text))
        # CJK handling was added on November 1st, 2018 for the multilingual
        # and Chinese models.  It is applied to the English models as well;
        # that is harmless because they were not trained on Chinese data.
        spaced = self._tokenize_chinese_chars(cleaned)

        pieces = []
        for word in whitespace_tokenize(spaced):
            if self.do_lower_case:
                word = self._run_strip_accents(word.lower())
            pieces.extend(self._run_split_on_punc(word))

        # Re-split so that no piece carries stray whitespace.
        return whitespace_tokenize(" ".join(pieces))

    def _run_strip_accents(self, text):
        """Remove combining marks (Unicode category Mn) after NFD normalization."""
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(
            ch for ch in decomposed if unicodedata.category(ch) != "Mn")

    def _run_split_on_punc(self, text):
        """Split `text` so that every punctuation character is its own piece."""
        pieces = []
        in_word = False  # True while the last piece is a word still being built
        for ch in text:
            if _is_punctuation(ch):
                pieces.append(ch)
                in_word = False
            elif in_word:
                pieces[-1] += ch
            else:
                pieces.append(ch)
                in_word = True
        return pieces

    def _tokenize_chinese_chars(self, text):
        """Surround every CJK character in `text` with single spaces."""
        return "".join(
            (" " + ch + " ") if self._is_chinese_char(ord(ch)) else ch
            for ch in text)

    def _is_chinese_char(self, cp):
        """Return True if code point `cp` lies in one of the CJK ideograph blocks.

        A "chinese character" here is anything in the CJK Unicode blocks:
        https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)

        Despite the name, these blocks do NOT cover all Japanese and Korean
        characters.  Hangul, Hiragana and Katakana live in other blocks; they
        are used to write space-separated words and are therefore handled
        like every other language.
        """
        cjk_ranges = (
            (0x4E00, 0x9FFF),
            (0x3400, 0x4DBF),
            (0x20000, 0x2A6DF),
            (0x2A700, 0x2B73F),
            (0x2B740, 0x2B81F),
            (0x2B820, 0x2CEAF),
            (0xF900, 0xFAFF),
            (0x2F800, 0x2FA1F),
        )
        return any(low <= cp <= high for low, high in cjk_ranges)

    def _clean_text(self, text):
        """Drop NUL, U+FFFD and control characters; turn each whitespace char into a space."""
        kept = []
        for ch in text:
            if ord(ch) in (0, 0xfffd) or _is_control(ch):
                continue
            kept.append(" " if _is_whitespace(ch) else ch)
        return "".join(kept)
class WordpieceTokenizer(object):
    """WordPiece tokenizer driven by a fixed vocabulary."""

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
        """Store the vocabulary and limits.

        Args:
            vocab: Container of known word pieces, tested with `in`.
            unk_token: Token emitted for words that cannot be split.
            max_input_chars_per_word: Words longer than this are replaced by
                `unk_token` without attempting a split.
        """
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Split `text` into word pieces with greedy longest-match-first search.

        For example:
            input = "unaffable"
            output = ["un", "##aff", "##able"]

        Args:
            text: A single token or whitespace separated tokens.  This should
                already have been passed through `BasicTokenizer`.

        Returns:
            A list of word-piece tokens.  A word that is too long, or for
            which some remainder matches no vocabulary entry, contributes a
            single `unk_token`.
        """
        pieces = []
        for word in whitespace_tokenize(convert_to_unicode(text)):
            length = len(word)
            if length > self.max_input_chars_per_word:
                pieces.append(self.unk_token)
                continue

            word_pieces = []
            begin = 0
            while begin < length:
                # Try the longest remaining substring first, then shrink it.
                for stop in range(length, begin, -1):
                    candidate = word[begin:stop]
                    if begin > 0:
                        # Pieces that continue a word carry the "##" prefix.
                        candidate = "##" + candidate
                    if candidate in self.vocab:
                        break
                else:
                    # Nothing starting at `begin` is in the vocabulary.
                    word_pieces = None
                    break
                word_pieces.append(candidate)
                begin = stop

            if word_pieces is None:
                pieces.append(self.unk_token)
            else:
                pieces.extend(word_pieces)
        return pieces
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically contorl characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat in ("Cc", "Cf"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64)
or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment