"docs/vscode:/vscode.git/clone" did not exist on "83b824c8b4ee55824b30f0509fd312b0cddb35e5"
Unverified Commit d007387a authored by Mingliang Li, committed by GitHub
Browse files

[Bugfix] Cache added_vocab to avoid per-token overhead (#30351)


Signed-off-by: limingliang <limingliang@stepfun.com>
Co-authored-by: limingliang <limingliang@stepfun.com>
parent 3bdd4266
...@@ -17,6 +17,8 @@ class DeepseekV32Tokenizer(HfTokenizer): ...@@ -17,6 +17,8 @@ class DeepseekV32Tokenizer(HfTokenizer):
self.name_or_path = ( self.name_or_path = (
tokenizer.name_or_path if hasattr(tokenizer, "name_or_path") else "" tokenizer.name_or_path if hasattr(tokenizer, "name_or_path") else ""
) )
self._added_vocab = self.tokenizer.get_added_vocab()
self._added_vocab_size = len(self._added_vocab)
@classmethod @classmethod
def from_pretrained( def from_pretrained(
...@@ -98,7 +100,7 @@ class DeepseekV32Tokenizer(HfTokenizer): ...@@ -98,7 +100,7 @@ class DeepseekV32Tokenizer(HfTokenizer):
def __len__(self) -> int: def __len__(self) -> int:
# </think> is an added token in DeepseekV32 tokenizer # </think> is an added token in DeepseekV32 tokenizer
return self.vocab_size + len(self.get_added_vocab()) return self.vocab_size + self._added_vocab_size
def __call__( def __call__(
self, self,
...@@ -120,7 +122,7 @@ class DeepseekV32Tokenizer(HfTokenizer): ...@@ -120,7 +122,7 @@ class DeepseekV32Tokenizer(HfTokenizer):
return self.tokenizer.get_vocab() return self.tokenizer.get_vocab()
def get_added_vocab(self) -> dict[str, int]: def get_added_vocab(self) -> dict[str, int]:
return self.tokenizer.get_added_vocab() return self._added_vocab.copy()
def encode( def encode(
self, self,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment