OpenDAS / Megatron-LM · Commits

Commit 0d0a2a22, authored Dec 03, 2024 by wxj

Update tokenizer.py add Llama3Tokenizer

Parent: 9c04fee1 · Pipeline #2031 passed
Showing 1 changed file with 56 additions and 0 deletions (+56, -0).
megatron/training/tokenizer/tokenizer.py (view file @ 0d0a2a22)
...
@@ -49,6 +49,9 @@ def build_tokenizer(args, **kwargs):
     elif args.tokenizer_type == 'Llama2Tokenizer':
         assert args.tokenizer_model is not None
         tokenizer = _Llama2Tokenizer(args.tokenizer_model)
+    elif args.tokenizer_type == 'Llama3Tokenizer':
+        assert args.tokenizer_model is not None
+        tokenizer = _Llama3Tokenizer(args.tokenizer_model)
     elif args.tokenizer_type == 'QwenTokenizer':
         tokenizer = _Qwen2Tokenizer(args.vocab_file, args.merge_file)
     elif args.tokenizer_type == 'TikTokenizer':
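For orientation, here is a minimal sketch of how this new branch is reached. build_tokenizer normally receives the fully parsed Megatron-LM training arguments; the SimpleNamespace and model path below are illustrative stand-ins, and the extra fields (rank, make_vocab_size_divisible_by, tensor_model_parallel_size) are the ones _vocab_size_with_padding consumes in upstream Megatron-LM, so this fork's exact plumbing may differ.

from types import SimpleNamespace
from megatron.training.tokenizer.tokenizer import build_tokenizer

# Illustrative stand-in for the parsed training args; the path is hypothetical.
args = SimpleNamespace(
    tokenizer_type='Llama3Tokenizer',
    tokenizer_model='/models/llama3/tokenizer.model',
    rank=0,                            # gates the padded-vocab-size log message
    make_vocab_size_divisible_by=128,  # consumed by _vocab_size_with_padding
    tensor_model_parallel_size=1,
)

tokenizer = build_tokenizer(args)  # dispatches on tokenizer_type to _Llama3Tokenizer
ids = tokenizer.tokenize("hello world")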
...
@@ -93,6 +96,59 @@ def _vocab_size_with_padding(orig_vocab_size, args, logging_enabled=True):
     return after
 
 
+class _Llama3Tokenizer(MegatronTokenizer):
+    """Llama 3 tokenizer for Megatron, adapted from Meta's tiktoken-based tokenizer."""
+    # https://github.com/meta-llama/llama3/blob/main/llama/tokenizer.py
+
+    def __init__(self, model_file):
+        super().__init__(model_file)
+        import tiktoken
+        from tiktoken.load import load_tiktoken_bpe
+
+        tokenizer_path = model_file
+        special_tokens = [
+            "<|begin_of_text|>",
+            "<|end_of_text|>",
+            "<|reserved_special_token_0|>",
+            "<|reserved_special_token_1|>",
+            "<|reserved_special_token_2|>",
+            "<|reserved_special_token_3|>",
+            "<|start_header_id|>",
+            "<|end_header_id|>",
+            "<|reserved_special_token_4|>",
+            "<|eot_id|>",  # end of turn
+        ] + [f"<|reserved_special_token_{i}|>" for i in range(5, 256 - 5)]
+        mergeable_ranks = load_tiktoken_bpe(tokenizer_path)
+        self.tokenizer = tiktoken.Encoding(
+            tokenizer_path,
+            pat_str=r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+",
+            mergeable_ranks=mergeable_ranks,
+            # Special-token ids are appended after the BPE ranks.
+            special_tokens={token: len(mergeable_ranks) + i for i, token in enumerate(special_tokens)},
+        )
+        self.eod_id = self.tokenizer.encode("<|end_of_text|>", allowed_special="all")[0]
+
+    @property
+    def vocab_size(self):
+        return self.tokenizer.n_vocab
+
+    @property
+    def vocab(self):
+        # Returns the tiktoken encode callable; no explicit token-to-id dict is exposed.
+        return self.tokenizer.encode
+
+    @property
+    def inv_vocab(self):
+        # Likewise returns the encode callable; no id-to-token mapping is exposed.
+        return self.tokenizer.encode
+
+    def tokenize(self, text):
+        return self.tokenizer.encode(text)
+
+    def detokenize(self, token_ids):
+        # decode, not encode: detokenize maps token ids back to text.
+        return self.tokenizer.decode(token_ids)
+
+    @property
+    def eod(self):
+        return self.eod_id
+
+
 class _HuggingFaceTokenizer(MegatronTokenizer):
     def __init__(self, pretrained_model_name_or_path, **kwargs):
         super().__init__(pretrained_model_name_or_path, **kwargs)
...
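And a round-trip sketch of the class itself, assuming a local copy of the official Llama 3 tiktoken model file (the path is hypothetical). For that file, load_tiktoken_bpe yields 128,000 BPE ranks; with the 256 special tokens appended above, vocab_size comes out to 128,256.

from megatron.training.tokenizer.tokenizer import _Llama3Tokenizer

tok = _Llama3Tokenizer("/models/llama3/tokenizer.model")  # hypothetical path

ids = tok.tokenize("The quick brown fox")
assert tok.detokenize(ids) == "The quick brown fox"  # relies on the decode() fix

print(tok.vocab_size)  # 128256 for the official Llama 3 model file
print(tok.eod)         # id of <|end_of_text|>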