Unverified Commit 75ff5305 authored by Patrick von Platen's avatar Patrick von Platen Committed by GitHub
Browse files

correct docs (#9378)

parent ec54d70e
...@@ -558,12 +558,15 @@ we can use the built in :func:`~transformers.BatchEncoding.char_to_token` method ...@@ -558,12 +558,15 @@ we can use the built in :func:`~transformers.BatchEncoding.char_to_token` method
end_positions = [] end_positions = []
for i in range(len(answers)): for i in range(len(answers)):
start_positions.append(encodings.char_to_token(i, answers[i]['answer_start'])) start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1)) end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))
# if None, the answer passage has been truncated
# if start position is None, the answer passage has been truncated
if start_positions[-1] is None: if start_positions[-1] is None:
start_positions[-1] = tokenizer.model_max_length start_positions[-1] = tokenizer.model_max_length
# if end position is None, the 'char_to_token' function points to the space before the correct token -> add + 1
if end_positions[-1] is None: if end_positions[-1] is None:
end_positions[-1] = tokenizer.model_max_length end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end'] + 1)
encodings.update({'start_positions': start_positions, 'end_positions': end_positions}) encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
add_token_positions(train_encodings, train_answers) add_token_positions(train_encodings, train_answers)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment