OpenDAS / Megatron-LM · Commits

Commit c099d843, authored Mar 11, 2025 by dongcl

Add DeepSeek tokenizer
Parent: de64c444 · Pipeline #2466 passed
Showing 3 changed files with 74 additions and 1 deletion
megatron/training/arguments.py (+2 −0)
megatron/training/theoretical_memory_usage.py (+4 −1)
megatron/training/tokenizer/tokenizer.py (+68 −0)
megatron/training/arguments.py
...
...
@@ -1836,6 +1836,8 @@ def _add_tokenizer_args(parser):
     group = parser.add_argument_group(title='tokenizer')
     group.add_argument('--vocab-size', type=int, default=None,
                        help='Size of vocab before EOD or padding.')
+    group.add_argument('--extra-vocab-size', type=int, default=0,
+                       help="--extra-vocab-size")
     group.add_argument('--vocab-file', type=str, default=None,
                        help='Path to the vocab file.')
     group.add_argument('--merge-file', type=str, default=None,
...
...
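For context, a minimal standalone sketch of how the new flag parses next to the existing tokenizer options. The parser below is a toy stand-in, not Megatron's real argument builder, and the help text for --extra-vocab-size is reworded here for readability:

import argparse

# Toy parser mirroring only the tokenizer options shown in the hunk above.
parser = argparse.ArgumentParser()
group = parser.add_argument_group(title='tokenizer')
group.add_argument('--vocab-size', type=int, default=None,
                   help='Size of vocab before EOD or padding.')
group.add_argument('--extra-vocab-size', type=int, default=0,
                   help='Extra vocab entries on top of the base tokenizer vocab.')
group.add_argument('--vocab-file', type=str, default=None,
                   help='Path to the vocab file.')

args = parser.parse_args(['--extra-vocab-size', '2'])
print(args.extra_vocab_size)  # 2; defaults to 0 when the flag is omitted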
megatron/training/theoretical_memory_usage.py
...
...
@@ -65,8 +65,11 @@ def compute_weight_and_optimizer_memory(args, verbose=False):
     )
     # params of mtp embedding and mtp output layer
+    num_parameters_in_mtp_embedding_or_output = args.num_nextn_predict_layers * args.hidden_size * args.padded_vocab_size
     if not args.share_mtp_embedding_and_output_weight:
-        num_parameters_in_mtp_layers += 2 * args.num_nextn_predict_layers * args.hidden_size * args.padded_vocab_size
+        num_parameters_in_mtp_layers += 2 * num_parameters_in_mtp_embedding_or_output
+    elif args.pipeline_model_parallel_size > 1:
+        num_parameters_in_mtp_layers += num_parameters_in_mtp_embedding_or_output
     num_total_parameters = num_parameters_in_transformer_layers + num_parameters_in_embedding_layers + num_parameters_in_mtp_layers
     if verbose:
...
...
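To make the accounting concrete, a small sketch that plugs hypothetical DeepSeek-V3-like shapes into the formula above; the numbers are illustrative only and are not taken from this repository's configs:

# Hypothetical shapes, chosen only to illustrate the formula above.
num_nextn_predict_layers = 1        # one MTP module
hidden_size = 7168
padded_vocab_size = 129280

# Parameters of one set of MTP embedding *or* output weights.
mtp_embedding_or_output = num_nextn_predict_layers * hidden_size * padded_vocab_size
print(mtp_embedding_or_output)      # 926,679,040 (~0.93B)

# Untied case: both an embedding matrix and an output matrix are counted.
print(2 * mtp_embedding_or_output)  # 1,853,358,080 (~1.85B)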
megatron/training/tokenizer/tokenizer.py
...
...
@@ -98,6 +98,9 @@ def build_tokenizer(args, **kwargs):
             args.special_tokens,
             args.image_tag_type,
         )
+    elif args.tokenizer_type == 'DeepSeekV2Tokenizer':
+        tokenizer = _DeepSeekV2Tokenizer(args.tokenizer_model, args.extra_vocab_size)
+        args.padded_vocab_size = tokenizer.vocab_size
     else:
         raise NotImplementedError('{} tokenizer is not '
                                   'implemented.'.format(args.tokenizer_type))
...
...
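Written out standalone, the new branch amounts to the sketch below. The path and argument values are placeholders, _DeepSeekV2Tokenizer is the class added further down in this file, and the import assumes this fork is installed; in practice the values come from --tokenizer-type, --tokenizer-model, and the new --extra-vocab-size flag:

from types import SimpleNamespace

# After this commit, the class lives in the tokenizer module of this fork.
from megatron.training.tokenizer.tokenizer import _DeepSeekV2Tokenizer

# Hypothetical arguments; normally produced by Megatron's argument parser.
args = SimpleNamespace(
    tokenizer_type='DeepSeekV2Tokenizer',
    tokenizer_model='/path/to/deepseek-v2-tokenizer',  # placeholder path
    extra_vocab_size=2,
)

if args.tokenizer_type == 'DeepSeekV2Tokenizer':
    tokenizer = _DeepSeekV2Tokenizer(args.tokenizer_model, args.extra_vocab_size)
    # The branch records the tokenizer's vocab size directly as the padded vocab size.
    args.padded_vocab_size = tokenizer.vocab_size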
@@ -917,3 +920,68 @@ class _NullTokenizer(MegatronTokenizer):
     @property
     def additional_special_tokens_ids(self):
         return None
+
+
+class _DeepSeekV2Tokenizer(MegatronTokenizer):
+    def __init__(self, tokenizer_path, extra_vocab_size):
+        super().__init__(tokenizer_path)
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, padding_side="right", trust_remote_code=True)
+        self.extra_vocab_size = extra_vocab_size
+
+        if self.tokenizer.chat_template is None:
+            self.tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
+        try:
+            test_conversation = [{'role': 'user', 'content': 'hello world'}]
+            self.apply_chat_template(test_conversation)
+        except Exception:
+            # the default chat_template is invalid, assume user will not do SFT
+            self.tokenizer.chat_template = None
+
+    def __call__(self, text, return_tensors=None, padding=None, max_length=None, truncation=None, add_special_tokens=None):
+        return self.tokenizer(
+            text,
+            return_tensors=return_tensors,
+            padding=padding,
+            max_length=max_length,
+            truncation=truncation,
+            add_special_tokens=add_special_tokens,
+        )
+
+    def apply_chat_template(self, conversations, tokenize: bool = True, **kwargs):
+        return self.tokenizer.apply_chat_template(conversations, tokenize=tokenize, **kwargs)
+
+    @property
+    def vocab_size(self):
+        return len(self.tokenizer) + self.extra_vocab_size - 2
+
+    @property
+    def vocab(self):
+        return self.tokenizer.encoder
+
+    @property
+    def inv_vocab(self):
+        return self.tokenizer.decoder
+
+    def tokenize(self, text):
+        return self.tokenizer.encode(text)
+
+    def detokenize(self, token_ids):
+        return self.tokenizer.decode(token_ids)
+
+    @property
+    def eod(self):
+        return self.tokenizer.eos_token_id
+
+    @property
+    def eos_token(self):
+        return self.tokenizer.eos_token
+
+    @property
+    def pad_token_id(self):
+        return self.tokenizer.pad_token_id
+
+    @property
+    def eos_token_id(self):
+        return self.tokenizer.eos_token_id
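A short usage sketch of the new class, assuming transformers is installed and a DeepSeek-V2 tokenizer is available at a local path; the path, sample messages, and expected outputs are placeholders rather than values taken from this commit:

# Hypothetical usage of _DeepSeekV2Tokenizer as defined above.
tok = _DeepSeekV2Tokenizer('/path/to/deepseek-v2-tokenizer', extra_vocab_size=2)

ids = tok.tokenize('hello world')   # token ids via the HF tokenizer's encode()
text = tok.detokenize(ids)          # decoded string via the HF tokenizer's decode()

# Chat formatting uses the DeepSeek-style template set in __init__ when the
# underlying tokenizer ships without one.
prompt = tok.apply_chat_template(
    [{'role': 'user', 'content': 'hello world'}],
    tokenize=False,
    add_generation_prompt=True,
)

# build_tokenizer records this value as args.padded_vocab_size.
print(tok.vocab_size)  # len(tokenizer) + extra_vocab_size - 2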