Fix token_type_id in BERT question-answering example (#3790)

token_type_ids is converted into the segment embedding. For question answering, it must indicate whether each token belongs to sequence 0 (the question) or sequence 1 (the text). encode_plus sets this parameter correctly and automatically.

Fix token_type_id in BERT question-answering example (#3790)
token_type_ids is converted into the segment embedding. For question answering, it must indicate whether each token belongs to sequence 0 (the question) or sequence 1 (the text). encode_plus sets this parameter correctly and automatically.
edf0582c · Simon Böhm · GitHub · 6d00033e · edf0582c · edf0582c
Unverified Commit edf0582c authored Apr 17, 2020 by Simon Böhm Committed by GitHub Apr 17, 2020
Hide whitespace changes
Inline Side-by-side

Showing with 12 additions and 6 deletions

src/transformers/modeling_bert.py src/transformers/modeling_bert.py +2 -2

src/transformers/modeling_tf_bert.py src/transformers/modeling_tf_bert.py +10 -4

No files found.
--- a/src/transformers/modeling_bert.py
+++ b/src/transformers/modeling_bert.py
@@ -1406,8 +1406,8 @@ class BertForQuestionAnswering(BertPreTrainedModel):
        model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        input_ids = tokenizer.encode(question, text)
+        encoding = tokenizer.encode_plus(question, text)
-        token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
+        input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"]
        start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)

--- a/src/transformers/modeling_tf_bert.py
+++ b/src/transformers/modeling_tf_bert.py
@@ -1148,10 +1148,16 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel):
        from transformers import BertTokenizer, TFBertForQuestionAnswering
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = TFBertForQuestionAnswering.from_pretrained('bert-base-uncased')
+        model = TFBertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-        outputs = model(input_ids)
+        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-        start_scores, end_scores = outputs[:2]
+        encoding = tokenizer.encode_plus(question, text)
+        input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"]
+        start_scores, end_scores = model(tf.constant(input_ids)[None, :], token_type_ids=tf.constant(token_type_ids)[None, :])
+        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
+        answer = ' '.join(all_tokens[tf.math.argmax(tf.squeeze(start_scores)) : tf.math.argmax(tf.squeeze(end_scores))+1])
+        assert answer == "a nice puppet"
        """
        outputs = self.bert(inputs, **kwargs)