Unverified commit 06e737fb, authored by Matt and committed by GitHub
Browse files

Tweak ESM tokenizer for Nucleotide Transformer (#22770)

* If EOS is None, don't add it to sequences

* If EOS is None, don't add it to sequences
parent c8df3900
......@@ -91,11 +91,16 @@ class EsmTokenizer(PreTrainedTokenizer):
def build_inputs_with_special_tokens(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
    """Wrap one or two token-id sequences with the CLS and EOS special tokens.

    A single sequence becomes ``cls + token_ids_0 + eos``; when the tokenizer
    has no EOS token (``self.eos_token_id is None``) the trailing EOS is
    simply omitted. A pair becomes ``cls + token_ids_0 + eos + token_ids_1 + eos``,
    with EOS doubling as the separator.

    Args:
        token_ids_0: Token ids of the first sequence.
        token_ids_1: Optional token ids of a second sequence.

    Returns:
        A new list of token ids including the special tokens.

    Raises:
        ValueError: If a second sequence is given but no EOS token is set,
            since there would be nothing to separate the two sequences.
    """
    cls = [self.cls_token_id]

    if self.eos_token_id is None:
        # Without an EOS id there is no separator, so pairs cannot be encoded,
        # and a single sequence must not get a literal ``None`` appended.
        if token_ids_1 is not None:
            raise ValueError("Cannot tokenize multiple sequences when EOS token is not set!")
        return cls + token_ids_0

    sep = [self.eos_token_id]  # No sep token in ESM vocabulary
    if token_ids_1 is None:
        return cls + token_ids_0 + sep
    return cls + token_ids_0 + sep + token_ids_1 + sep  # Multiple inputs always have an EOS token
def get_special_tokens_mask(
self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment