"vscode:/vscode.git/clone" did not exist on "b24ead87e1be6bce17e4ec5c953b6d028e4b3af7"
Unverified commit 6d430616, authored by Younes Belkada, committed by GitHub

GGUF: Fix llama 3 GGUF (#31358)

* Create push-important-models.yml

* llama3 support for GGUF

* fixup

* Update src/transformers/integrations/ggml.py

* fix pre-tokenizer

* fix

* fix

* fix

* fix

* fix

* fix

* address final comment

* handle special tokens + add tests
parent 35b112d3
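With this change, Llama 3 GGUF checkpoints load through the auto classes like the other supported GGUF architectures. A minimal usage sketch based on the repo and quantized file used by the new tests below (slow: it downloads the GGUF file and requires the gguf package):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "NousResearch/Meta-Llama-3-8B-GGUF"
gguf_file = "Meta-Llama-3-8B-Q4_K_M.gguf"

# Tokenizer and weights are both rebuilt from the GGUF file's metadata and tensors.
tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=gguf_file)
model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=gguf_file, torch_dtype=torch.float16)

inputs = tokenizer("Hello", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=10)
print(tokenizer.decode(out[0], skip_special_tokens=True))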
@@ -21,7 +21,7 @@ with extra methods beings exposed
 from array import array
 import numpy as np
-from tokenizers import Tokenizer, decoders
+from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers
 from tokenizers.models import BPE
 from .. import AddedToken
@@ -540,15 +540,26 @@ class GGUFTokenizerSkeleton:
             self.merges = merges
         else:
             self.merges = [tuple(merge.split(" ")) for merge in self.merges]
+            if not hasattr(self, "scores"):
+                self.scores = [None for _ in range(len(self.tokens))]
 
         if not hasattr(self, "added_tokens"):
             self.added_tokens = []
 
+        if not hasattr(self, "unk_token_id"):
+            self.unk_token_id = None
+
+        # Llama2 uses the field `unknown_token_id`
+        if hasattr(self, "unknown_token_id") and self.unk_token_id is None:
+            self.unk_token_id = self.unknown_token_id
+
 
 class GGUFLlamaConverter(LlamaConverter):
     def __init__(self, tokenizer_dict):
         self.proto = GGUFTokenizerSkeleton(tokenizer_dict)
         self.original_tokenizer = self.proto
+        self.additional_kwargs = {}
+        self.is_llama_3_tokenizer = getattr(self.proto, "tokenizer_type", "llama") != "llama"
 
     def vocab(self, proto):
         return list(zip(proto.tokens, proto.scores))
@@ -560,22 +571,50 @@ class GGUFLlamaConverter(LlamaConverter):
         vocab_scores = self.vocab(self.proto)
         merges = self.merges(self.proto)
         bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
-        tokenizer = Tokenizer(
-            BPE(bpe_vocab, merges, unk_token=proto.tokens[proto.unk_token_id], fuse_unk=True, byte_fallback=True)
-        )
-        tokenizer.add_special_tokens(
-            [
-                AddedToken("<unk>", normalized=False, special=True),
-                AddedToken("<s>", normalized=False, special=True),
-                AddedToken("</s>", normalized=False, special=True),
-            ]
-        )
+
+        unk_token = proto.tokens[proto.unk_token_id] if proto.unk_token_id is not None else None
+        bos_token = proto.tokens[proto.bos_token_id] if getattr(proto, "bos_token_id", None) is not None else None
+        eos_token = proto.tokens[proto.eos_token_id] if getattr(proto, "eos_token_id", None) is not None else None
+
+        tokenizer = Tokenizer(BPE(bpe_vocab, merges, unk_token=unk_token, fuse_unk=True, byte_fallback=True))
+
+        special_tokens = []
+
+        if not hasattr(self.proto, "token_type"):
+            if unk_token is not None:
+                special_tokens.append(AddedToken(unk_token, normalized=False, special=True))
+            if bos_token is not None:
+                special_tokens.append(AddedToken(bos_token, normalized=False, special=True))
+            if eos_token is not None:
+                special_tokens.append(AddedToken(eos_token, normalized=False, special=True))
+        else:
+            # 3 stands for special tokens
+            special_tokens_idx = np.where(np.array(self.proto.token_type) == 3)[0]
+
+            for idx in special_tokens_idx:
+                special_tokens.append(AddedToken(self.proto.tokens[idx], normalized=False, special=True))
+
+        if len(special_tokens) != 0:
+            tokenizer.add_special_tokens(special_tokens)
 
         if len(self.proto.added_tokens) != 0:
-            tokenizer.add_special_tokens(
-                [AddedToken(added_token, normalized=False, special=False) for added_token in self.added_tokens]
+            tokenizer.add_tokens(
+                [AddedToken(added_token, normalized=False, special=False) for added_token in self.proto.added_tokens]
             )
 
+        self.additional_kwargs["unk_token"] = unk_token
+        self.additional_kwargs["bos_token"] = bos_token
+        self.additional_kwargs["eos_token"] = eos_token
+
+        if self.is_llama_3_tokenizer:
+            self.additional_kwargs["add_prefix_space"] = False
+            self.additional_kwargs["clean_up_tokenization_spaces"] = True
+
+            self.additional_kwargs["legacy"] = False
+            self.original_tokenizer.legacy = False
+
         return tokenizer
 
     def decoder(self, replacement, add_prefix_space):
@@ -584,14 +623,34 @@ class GGUFLlamaConverter(LlamaConverter):
             decoders.Fuse(),
             decoders.Replace("▁", " "),
         ]
+
+        if self.is_llama_3_tokenizer:
+            sequence += [decoders.ByteLevel(add_prefix_space=False, trim_offsets=False, use_regex=True)]
+
         if add_prefix_space:
             sequence += [decoders.Strip(content=" ", left=1)]
         return decoders.Sequence(sequence)
 
+    def converted(self):
+        tokenizer = super().converted()
+
+        # HACK: patch the llama-3 tokenizer to use the corresponding pre-tokenizer
+        # and normalizer
+        if self.is_llama_3_tokenizer:
+            tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
+                add_prefix_space=False, trim_offsets=False, use_regex=True
+            )
+
+            # This is tricky as the additional kwargs are passed after legacy is force-set in LlamaTokenizer's
+            # init.
+            tokenizer.normalizer = normalizers.Sequence([])
+
+        return tokenizer
+
 
 class GGUFQwen2Converter(Qwen2Converter):
     def __init__(self, tokenizer_dict):
         self.original_tokenizer = GGUFTokenizerSkeleton(tokenizer_dict)
+        self.additional_kwargs = {}
 
     def converted(self) -> Tokenizer:
         vocab = {word: i for i, word in enumerate(self.original_tokenizer.tokens)}
@@ -629,5 +688,6 @@ def convert_gguf_tokenizer(architecture, tokenizer_dict) -> Tokenizer:
         [`~tokenization_utils_base.PreTrainedTokenizerFast`]
     """
     tokenizer_class_name = architecture
-    converter_class = GGUF_TO_FAST_CONVERTERS[tokenizer_class_name]
-    return converter_class(tokenizer_dict).converted()
+    converter = GGUF_TO_FAST_CONVERTERS[tokenizer_class_name](tokenizer_dict)
+    fast_tokenizer = converter.converted()
+    return fast_tokenizer, converter.additional_kwargs
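A side note on the token_type branch above: GGUF checkpoints ship a per-token type array, and the converter treats type 3 as a special (control) token. A minimal, self-contained sketch with made-up toy values standing in for the metadata that GGUFTokenizerSkeleton parses:

import numpy as np

# Toy stand-in for GGUF tokenizer metadata; token_type 3 marks special tokens.
tokens = ["<|begin_of_text|>", "<|end_of_text|>", "hello", "world"]
token_type = [3, 3, 1, 1]

# Same selection as in GGUFLlamaConverter.tokenizer(): indices whose type is 3.
special_tokens_idx = np.where(np.array(token_type) == 3)[0]
special_tokens = [tokens[idx] for idx in special_tokens_idx]

print(special_tokens)  # ['<|begin_of_text|>', '<|end_of_text|>']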
@@ -158,7 +158,8 @@ class LlamaTokenizer(PreTrainedTokenizer):
                 " expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you."
                 " If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it"
                 " means, and thoroughly read the reason why this was added as explained in"
-                " https://github.com/huggingface/transformers/pull/24565"
+                " https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file"
+                " you can ignore this message"
             )
             legacy = True
@@ -145,7 +145,8 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
                 " expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you."
                 " If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it"
                 " means, and thoroughly read the reason why this was added as explained in"
-                " https://github.com/huggingface/transformers/pull/24565"
+                " https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file"
+                " you can ignore this message."
             )
             legacy = True
         self.legacy = legacy
@@ -121,7 +121,11 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
             gguf_param = load_gguf_checkpoint(kwargs.get("vocab_file"))
             architecture = gguf_param["config"]["model_type"]
             tokenizer_dict = gguf_param["tokenizer"]
-            fast_tokenizer = convert_gguf_tokenizer(architecture, tokenizer_dict)
+            fast_tokenizer, additional_kwargs = convert_gguf_tokenizer(architecture, tokenizer_dict)
+
+            if len(additional_kwargs) > 0:
+                kwargs.update(additional_kwargs)
+
         elif self.slow_tokenizer_class is not None:
             # We need to create and convert a slow tokenizer to build the backend
             slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs)
@@ -184,6 +188,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         tokens_to_add += [
             token for token in self.all_special_tokens_extended if token not in encoder and token not in tokens_to_add
         ]
+
         if len(tokens_to_add) > 0:
             # super hack: if a token.special is set, tokenizer ignores it for now so FIXME @ArthurZ
             # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
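Putting the two halves together, convert_gguf_tokenizer now returns a tuple that from_pretrained merges into its kwargs. A minimal sketch of calling it directly; the import paths are my assumption of where these helpers live in the transformers tree, and the local file name is illustrative:

from transformers.integrations.ggml import convert_gguf_tokenizer
from transformers.modeling_gguf_pytorch_utils import load_gguf_checkpoint

# Parse the GGUF metadata; "tokenizer" holds the vocab, merges, token_type array
# and special token ids consumed by the converters above.
gguf_param = load_gguf_checkpoint("Meta-Llama-3-8B-Q4_K_M.gguf")
architecture = gguf_param["config"]["model_type"]
tokenizer_dict = gguf_param["tokenizer"]

# New contract: the backend tokenizer plus extra kwargs (unk/bos/eos token, legacy,
# add_prefix_space, ...) that PreTrainedTokenizerFast merges into its init kwargs.
fast_tokenizer, additional_kwargs = convert_gguf_tokenizer(architecture, tokenizer_dict)
print(additional_kwargs)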
@@ -32,6 +32,7 @@ class GgufIntegrationTests(unittest.TestCase):
     model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
     mistral_model_id = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
     qwen2_model_id = "Qwen/Qwen1.5-0.5B-Chat-GGUF"
+    llama3_model_id = "NousResearch/Meta-Llama-3-8B-GGUF"
 
     q4_0_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf"
     q4_k_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
@@ -43,6 +44,7 @@ class GgufIntegrationTests(unittest.TestCase):
     q4_0_mistral_model_id = "mistral-7b-instruct-v0.2.Q4_0.gguf"
     q4_0_qwen2_model_id = "qwen1_5-0_5b-chat-q4_0.gguf"
+    q4_llama3_model_id = "Meta-Llama-3-8B-Q4_K_M.gguf"
 
     example_text = "Hello"
@@ -171,6 +173,25 @@ class GgufIntegrationTests(unittest.TestCase):
         EXPECTED_TEXT = "Hello.jsoup\n\nI am a beginner"
         self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)
 
+    def test_llama3_q4_0_tokenizer(self):
+        tokenizer_gguf = AutoTokenizer.from_pretrained(self.llama3_model_id, gguf_file=self.q4_llama3_model_id)
+        special_sentence = "สวัสดี"
+        predicted_text = tokenizer_gguf.decode(tokenizer_gguf.encode(special_sentence, return_tensors="pt")[0])
+        self.assertEqual(predicted_text, "<|begin_of_text|>" + special_sentence)
+
+    def test_llama3_q4_0(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.llama3_model_id, gguf_file=self.q4_llama3_model_id)
+        model = AutoModelForCausalLM.from_pretrained(
+            self.llama3_model_id, gguf_file=self.q4_llama3_model_id, device_map="auto", torch_dtype=torch.float16
+        )
+
+        text = tokenizer(self.example_text, return_tensors="pt").to(torch_device)
+        out = model.generate(**text, max_new_tokens=10)
+
+        EXPECTED_TEXT = "Hello, I am new to this forum. I am"
+        self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)
+
     def test_tokenization_xnli(self):
         import tqdm
         from datasets import load_dataset
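For a quick interactive check that the special tokens declared in the GGUF metadata actually reach the loaded tokenizer, mirroring test_llama3_q4_0_tokenizer above (the expected values are what the llama-3 metadata should yield):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "NousResearch/Meta-Llama-3-8B-GGUF", gguf_file="Meta-Llama-3-8B-Q4_K_M.gguf"
)

special_sentence = "สวัสดี"  # non-ASCII text exercises the byte-level pre-tokenizer/decoder path
print(tokenizer.bos_token)  # expected: <|begin_of_text|>
print(tokenizer.decode(tokenizer.encode(special_sentence)))  # expected: <|begin_of_text|>สวัสดี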