Unverified Commit 535542d3 authored by Arthur, committed by GitHub

[Llama] Update tokenization code to ensure parsing of the special tokens [core] (#24042)

* prevent llama fast from returning token type ids

* remove type hints

* normalized=False
parent 2e2088f2
```diff
@@ -1134,9 +1134,9 @@ class LlamaConverter(SpmConverter):
             )
             tokenizer.add_special_tokens(
                 [
-                    AddedToken("<unk>", normalized=True),
-                    AddedToken("<s>", normalized=True),
-                    AddedToken("</s>", normalized=True),
+                    AddedToken("<unk>", normalized=False),
+                    AddedToken("<s>", normalized=False),
+                    AddedToken("</s>", normalized=False),
                 ]
             )
         else:
```
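This hunk flips the `normalized` flag on Llama's special tokens. With `normalized=True`, the token content is passed through the tokenizer's normalizer (for Llama, the SentencePiece normalizer that rewrites spaces to "▁") before matching, which can prevent a literal `<s>` or `</s>` typed into the input text from being recognized as a single special token. A minimal sketch of the intended behavior after this change, assuming access to the public test checkpoint `hf-internal-testing/llama-tokenizer` (the ids referenced are the usual Llama ones: `<unk>`=0, `<s>`=1, `</s>`=2):

```python
# Sketch only: checks that a literal "<s>" in raw text is parsed as the single
# BOS special token rather than being normalized and split into "<", "s", ">".
# Assumes the public test repo "hf-internal-testing/llama-tokenizer" is reachable.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")

# add_special_tokens=False so the BOS id is not prepended automatically;
# any BOS id in the output must come from parsing the raw text itself.
ids = tok.encode("<s> Hello", add_special_tokens=False)
print(ids[0] == tok.bos_token_id)  # True once "<s>" is matched on un-normalized text
```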
```diff
@@ -77,6 +77,7 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
     vocab_files_names = VOCAB_FILES_NAMES
     slow_tokenizer_class = LlamaTokenizer
     padding_side = "left"
+    model_input_names = ["input_ids", "attention_mask"]
 
     def __init__(
         self,
```
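The new `model_input_names` class attribute restricts what the fast tokenizer returns: only the listed keys are emitted, so `token_type_ids`, which Llama models do not consume, is dropped from the encoding. A quick check, again assuming the same test checkpoint:

```python
# Sketch: with model_input_names = ["input_ids", "attention_mask"], the fast
# Llama tokenizer no longer includes "token_type_ids" in its output.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
enc = tok("Hello world")
print(sorted(enc.keys()))  # ['attention_mask', 'input_ids'] — no 'token_type_ids'
```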