Unverified commit d007387a, authored by Mingliang Li, committed by GitHub
Browse files

[Bugfix] Cache added_vocab to avoid per-token overhead (#30351)


Signed-off-by: limingliang <limingliang@stepfun.com>
Co-authored-by: limingliang <limingliang@stepfun.com>
parent 3bdd4266
......@@ -17,6 +17,8 @@ class DeepseekV32Tokenizer(HfTokenizer):
self.name_or_path = (
tokenizer.name_or_path if hasattr(tokenizer, "name_or_path") else ""
)
+ self._added_vocab = self.tokenizer.get_added_vocab()
+ self._added_vocab_size = len(self._added_vocab)
@classmethod
def from_pretrained(
......@@ -98,7 +100,7 @@ class DeepseekV32Tokenizer(HfTokenizer):
def __len__(self) -> int:
# </think> is an added token in DeepseekV32 tokenizer
- return self.vocab_size + len(self.get_added_vocab())
+ return self.vocab_size + self._added_vocab_size
def __call__(
self,
......@@ -120,7 +122,7 @@ class DeepseekV32Tokenizer(HfTokenizer):
return self.tokenizer.get_vocab()
def get_added_vocab(self) -> dict[str, int]:
- return self.tokenizer.get_added_vocab()
+ return self._added_vocab.copy()
def encode(
self,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment