Unverified commit c60749d6, authored by Arthur, committed by GitHub
Browse files

[fix codellama conversion] (#30472)

* fix codellama conversion

* nit
parent e9b16354
......@@ -1395,14 +1395,14 @@ class LlamaConverter(SpmConverter):
def normalizer(self, proto):
    """Build the normalizer for a legacy tokenizer.

    In legacy mode (``original_tokenizer.legacy`` is truthy, or the attribute
    is absent) a normalizer sequence is returned that optionally prepends
    "▁" and then replaces every space with "▁". In non-legacy mode no
    normalizer is used and ``None`` is returned.

    Args:
        proto: Not used by this implementation.

    Returns:
        A ``normalizers.Sequence`` in legacy mode, otherwise ``None``.
    """
    if getattr(self.original_tokenizer, "legacy", True):
        sequence = []
        # Use a default so tokenizers that never define `add_prefix_space`
        # do not raise AttributeError; a missing attribute is treated as True.
        if getattr(self.original_tokenizer, "add_prefix_space", True):
            sequence += [normalizers.Prepend(prepend="▁")]
        sequence += [normalizers.Replace(pattern=" ", content="▁")]
        return normalizers.Sequence(sequence)
    return None  # non-legacy, no normalizer
def pre_tokenizer(self, replacement, add_prefix_space):
    """Build the pre-tokenizer for a non-legacy tokenizer.

    In non-legacy mode (``original_tokenizer.legacy`` is falsy) a
    ``Metaspace`` pre-tokenizer is returned, configured with ``split=False``
    and the prepend scheme computed by ``_get_prepend_scheme``. In legacy
    mode (or when the attribute is absent) ``None`` is returned.

    Args:
        replacement: Replacement string forwarded to ``Metaspace``.
        add_prefix_space: Forwarded to ``_get_prepend_scheme``.

    Returns:
        A ``pre_tokenizers.Metaspace`` in non-legacy mode, otherwise ``None``.
    """
    # Read `legacy` with a default of True so tokenizers lacking the attribute
    # are treated as legacy instead of raising AttributeError.
    if not getattr(self.original_tokenizer, "legacy", True):  # non-legacy, we need a replace
        prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer)
        return pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme, split=False)
    return None
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment