Unverified Commit daf520b0 authored by ghlai9665's avatar ghlai9665 Committed by GitHub
Browse files

Tiny tweak to allow BatchEncoding.token_to_chars when the token doesn't correspond to any chars (#15901)



* tweak to allow BatchEncoding.token_to_chars(0)

* update docstring

* remove trailing whitespace

* make fixup

* make value checking for span_indices explicit
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent cb7e1664
...@@ -503,7 +503,8 @@ class BatchEncoding(UserDict): ...@@ -503,7 +503,8 @@ class BatchEncoding(UserDict):
the sequence. the sequence.
Returns: Returns:
[`~tokenization_utils_base.CharSpan`]: Span of characters in the original string. [`~tokenization_utils_base.CharSpan`]: Span of characters in the original string, or None, if the token
(e.g. <s>, </s>) doesn't correspond to any chars in the original string.
""" """
if not self._encodings: if not self._encodings:
...@@ -513,7 +514,9 @@ class BatchEncoding(UserDict): ...@@ -513,7 +514,9 @@ class BatchEncoding(UserDict):
else: else:
batch_index = 0 batch_index = 0
token_index = batch_or_token_index token_index = batch_or_token_index
return CharSpan(*(self._encodings[batch_index].token_to_chars(token_index))) span_indices = self._encodings[batch_index].token_to_chars(token_index)
return CharSpan(*span_indices) if span_indices is not None else None
def char_to_token( def char_to_token(
self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0 self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment