Commit 0d0a2a22 authored by wxj

Update tokenizer.py: add Llama3Tokenizer

parent 9c04fee1
Pipeline #2031 passed with stage
@@ -49,6 +49,9 @@ def build_tokenizer(args, **kwargs):
    elif args.tokenizer_type == 'Llama2Tokenizer':
        assert args.tokenizer_model is not None
        tokenizer = _Llama2Tokenizer(args.tokenizer_model)
    elif args.tokenizer_type == 'Llama3Tokenizer':
        assert args.tokenizer_model is not None
        tokenizer = _Llama3Tokenizer(args.tokenizer_model)
    elif args.tokenizer_type == 'QwenTokenizer':
        tokenizer = _Qwen2Tokenizer(args.vocab_file, args.merge_file)
    elif args.tokenizer_type == 'TikTokenizer':
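
For reference, a minimal sketch of how the new branch would be selected, assuming the usual Megatron argument namespace carries `tokenizer_type` and `tokenizer_model`; the path and the use of SimpleNamespace are purely illustrative:

from types import SimpleNamespace

# Hypothetical arguments; the real Megatron args namespace also carries
# vocab-padding fields (e.g. make_vocab_size_divisible_by) used elsewhere.
args = SimpleNamespace(
    tokenizer_type='Llama3Tokenizer',
    tokenizer_model='/path/to/llama3/tokenizer.model',  # hypothetical path
)
tokenizer = build_tokenizer(args)  # dispatches to _Llama3Tokenizer
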
@@ -93,6 +96,59 @@ def _vocab_size_with_padding(orig_vocab_size, args, logging_enabled=True):
    return after
class _Llama3Tokenizer(MegatronTokenizer):
    """tiktoken-based Llama3 tokenizer adapted for Megatron."""
    # Adapted from https://github.com/meta-llama/llama3/blob/main/llama/tokenizer.py

    def __init__(self, model_file):
        super().__init__(model_file)
        import tiktoken
        from tiktoken.load import load_tiktoken_bpe

        tokenizer_path = model_file
        # Llama3 reserves 256 special token slots; the first ten are named,
        # the rest are reserved placeholders.
        special_tokens = [
            "<|begin_of_text|>",
            "<|end_of_text|>",
            "<|reserved_special_token_0|>",
            "<|reserved_special_token_1|>",
            "<|reserved_special_token_2|>",
            "<|reserved_special_token_3|>",
            "<|start_header_id|>",
            "<|end_header_id|>",
            "<|reserved_special_token_4|>",
            "<|eot_id|>",  # end of turn
        ] + [f"<|reserved_special_token_{i}|>" for i in range(5, 256 - 5)]
        mergeable_ranks = load_tiktoken_bpe(tokenizer_path)
        self.tokenizer = tiktoken.Encoding(
            tokenizer_path,
            pat_str=r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+",
            mergeable_ranks=mergeable_ranks,
            special_tokens={token: len(mergeable_ranks) + i for i, token in enumerate(special_tokens)},
        )
        self.eod_id = self.tokenizer.encode("<|end_of_text|>", allowed_special="all")[0]

    @property
    def vocab_size(self):
        return self.tokenizer.n_vocab

    @property
    def vocab(self):
        return self.tokenizer.encode

    @property
    def inv_vocab(self):
        return self.tokenizer.encode

    def tokenize(self, text):
        return self.tokenizer.encode(text)

    def detokenize(self, token_ids):
        # Decode token ids back to text.
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        return self.eod_id
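
A minimal usage sketch of the new class, assuming `model_file` points at a Llama3-format tiktoken BPE file; the path below is hypothetical:

# Hypothetical model path; any Llama3-format tiktoken BPE file should work.
tok = _Llama3Tokenizer('/path/to/llama3/tokenizer.model')
ids = tok.tokenize("Hello, Megatron!")   # list of token ids
text = tok.detokenize(ids)               # round-trips back to the input text
print(tok.vocab_size, tok.eod)           # vocab size and the <|end_of_text|> id
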
class _HuggingFaceTokenizer(MegatronTokenizer):
    def __init__(self, pretrained_model_name_or_path, **kwargs):
        super().__init__(pretrained_model_name_or_path, **kwargs)
...