"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "513fa30a636642ccc1d93f3e6a48d612d08dbce8"
Unverified commit edf0582c, authored by Simon Böhm, committed by GitHub
Browse files

Fix token_type_id in BERT question-answering example (#3790)

token_type_ids are converted into the segment embedding. For question answering,
they must indicate whether each token belongs to sequence 0 (the question) or
sequence 1 (the text). encode_plus sets this parameter correctly and automatically.
parent 6d00033e
...@@ -1406,8 +1406,8 @@ class BertForQuestionAnswering(BertPreTrainedModel): ...@@ -1406,8 +1406,8 @@ class BertForQuestionAnswering(BertPreTrainedModel):
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_ids = tokenizer.encode(question, text) encoding = tokenizer.encode_plus(question, text)
token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))] input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"]
start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids])) start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
all_tokens = tokenizer.convert_ids_to_tokens(input_ids) all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
......
...@@ -1148,10 +1148,16 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel): ...@@ -1148,10 +1148,16 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel):
from transformers import BertTokenizer, TFBertForQuestionAnswering from transformers import BertTokenizer, TFBertForQuestionAnswering
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForQuestionAnswering.from_pretrained('bert-base-uncased') model = TFBertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
outputs = model(input_ids) question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
start_scores, end_scores = outputs[:2] encoding = tokenizer.encode_plus(question, text)
input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"]
start_scores, end_scores = model(tf.constant(input_ids)[None, :], token_type_ids=tf.constant(token_type_ids)[None, :])
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
answer = ' '.join(all_tokens[tf.math.argmax(tf.squeeze(start_scores)) : tf.math.argmax(tf.squeeze(end_scores))+1])
assert answer == "a nice puppet"
""" """
outputs = self.bert(inputs, **kwargs) outputs = self.bert(inputs, **kwargs)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment