Commit c099d843 authored by dongcl

Add DeepSeek tokenizer

parent de64c444
Pipeline #2466 passed
@@ -1836,6 +1836,8 @@ def _add_tokenizer_args(parser):
     group = parser.add_argument_group(title='tokenizer')
     group.add_argument('--vocab-size', type=int, default=None,
                        help='Size of vocab before EOD or padding.')
+    group.add_argument('--extra-vocab-size', type=int, default=0,
+                       help='Number of extra entries to add on top of the tokenizer vocab size.')
     group.add_argument('--vocab-file', type=str, default=None,
                        help='Path to the vocab file.')
     group.add_argument('--merge-file', type=str, default=None,
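A minimal, self-contained sketch of how the new flag parses; the parser setup and the value 16 are illustrative stand-ins, not part of the commit:

import argparse

# Hypothetical stand-in for Megatron's argument setup, only to show the new flag.
parser = argparse.ArgumentParser()
group = parser.add_argument_group(title='tokenizer')
group.add_argument('--extra-vocab-size', type=int, default=0,
                   help='Number of extra entries to add on top of the tokenizer vocab size.')

args = parser.parse_args(['--extra-vocab-size', '16'])
assert args.extra_vocab_size == 16  # defaults to 0 when the flag is omitted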
@@ -65,8 +65,11 @@ def compute_weight_and_optimizer_memory(args, verbose=False):
     )
+    # params of mtp embedding and mtp output layer
+    num_parameters_in_mtp_embedding_or_output = args.num_nextn_predict_layers * args.hidden_size * args.padded_vocab_size
     if not args.share_mtp_embedding_and_output_weight:
-        num_parameters_in_mtp_layers += 2 * args.num_nextn_predict_layers * args.hidden_size * args.padded_vocab_size
+        num_parameters_in_mtp_layers += 2 * num_parameters_in_mtp_embedding_or_output
+    elif args.pipeline_model_parallel_size > 1:
+        num_parameters_in_mtp_layers += num_parameters_in_mtp_embedding_or_output
     num_total_parameters = num_parameters_in_transformer_layers + num_parameters_in_embedding_layers + num_parameters_in_mtp_layers
     if verbose:
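A short worked example of the accounting above, with made-up hyperparameters (the real values come from the training args):

# Illustrative numbers only.
num_nextn_predict_layers = 1
hidden_size = 4096
padded_vocab_size = 102400

num_parameters_in_mtp_embedding_or_output = (
    num_nextn_predict_layers * hidden_size * padded_vocab_size
)  # 419,430,400 parameters per matrix

share_mtp_embedding_and_output_weight = False
pipeline_model_parallel_size = 2

extra = 0
if not share_mtp_embedding_and_output_weight:
    # untied: separate MTP embedding and MTP output matrices, counted twice
    extra = 2 * num_parameters_in_mtp_embedding_or_output
elif pipeline_model_parallel_size > 1:
    # tied weights with pipeline parallelism: one copy is counted
    extra = num_parameters_in_mtp_embedding_or_output

print(f"{extra:,}")  # 838,860,800 with untied weights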
@@ -98,6 +98,9 @@ def build_tokenizer(args, **kwargs):
             args.special_tokens,
             args.image_tag_type,
         )
+    elif args.tokenizer_type == 'DeepSeekV2Tokenizer':
+        tokenizer = _DeepSeekV2Tokenizer(args.tokenizer_model, args.extra_vocab_size)
+        args.padded_vocab_size = tokenizer.vocab_size
     else:
         raise NotImplementedError('{} tokenizer is not ' 'implemented.'.format(args.tokenizer_type))
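In this branch, padded_vocab_size is taken directly from the wrapper's vocab_size property defined below, i.e. len(tokenizer) + extra_vocab_size - 2. A tiny arithmetic sketch with made-up numbers:

# Hypothetical sizes, only to show how the pieces combine.
base_vocab = 100015        # len(tokenizer) for some DeepSeek-V2 checkpoint (made up)
extra_vocab_size = 2387    # value passed via --extra-vocab-size (made up)

padded_vocab_size = base_vocab + extra_vocab_size - 2
print(padded_vocab_size)   # 102400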
@@ -917,3 +920,68 @@ class _NullTokenizer(MegatronTokenizer):
     @property
     def additional_special_tokens_ids(self):
         return None
+
+
+class _DeepSeekV2Tokenizer(MegatronTokenizer):
+    """Wrapper around a Hugging Face DeepSeek-V2 tokenizer loaded via AutoTokenizer."""
+
+    def __init__(self, tokenizer_path, extra_vocab_size):
+        super().__init__(tokenizer_path)
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            tokenizer_path,
+            padding_side="right",
+            trust_remote_code=True
+        )
+        self.extra_vocab_size = extra_vocab_size
+
+        if self.tokenizer.chat_template is None:
+            self.tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
+
+        try:
+            test_conversation = [
+                {'role': 'user', 'content': 'hello world'}
+            ]
+            self.apply_chat_template(test_conversation)
+        except Exception:
+            # The default chat_template is invalid; assume the user will not do SFT.
+            self.tokenizer.chat_template = None
+
+    def __call__(self, text, return_tensors=None,
+                 padding=None, max_length=None, truncation=None, add_special_tokens=None):
+        return self.tokenizer(text, return_tensors=return_tensors, padding=padding,
+                              max_length=max_length, truncation=truncation,
+                              add_special_tokens=add_special_tokens)
+
+    def apply_chat_template(self, conversations, tokenize: bool = True, **kwargs):
+        return self.tokenizer.apply_chat_template(conversations, tokenize=tokenize, **kwargs)
+
+    @property
+    def vocab_size(self):
+        return len(self.tokenizer) + self.extra_vocab_size - 2
+
+    @property
+    def vocab(self):
+        # Hugging Face tokenizers expose the token-to-id mapping via get_vocab().
+        return self.tokenizer.get_vocab()
+
+    @property
+    def inv_vocab(self):
+        return {token_id: token for token, token_id in self.tokenizer.get_vocab().items()}
+
+    def tokenize(self, text):
+        return self.tokenizer.encode(text)
+
+    def detokenize(self, token_ids):
+        return self.tokenizer.decode(token_ids)
+
+    @property
+    def eod(self):
+        return self.tokenizer.eos_token_id
+
+    @property
+    def eos_token(self):
+        return self.tokenizer.eos_token
+
+    @property
+    def pad_token_id(self):
+        return self.tokenizer.pad_token_id
+
+    @property
+    def eos_token_id(self):
+        return self.tokenizer.eos_token_id
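A minimal usage sketch for the new wrapper, assuming transformers is installed and that /path/to/deepseek-v2-tokenizer is a local directory holding the DeepSeek-V2 tokenizer files; the path and values are illustrative, not part of the commit:

# Illustrative only: exercises the wrapper's main entry points from within the
# tokenizer module where _DeepSeekV2Tokenizer is defined.
tokenizer = _DeepSeekV2Tokenizer('/path/to/deepseek-v2-tokenizer', extra_vocab_size=0)

ids = tokenizer.tokenize('hello world')        # list of token ids
text = tokenizer.detokenize(ids)               # decoded string
print(tokenizer.vocab_size, tokenizer.eod)     # reported vocab size and eos/eod id

# Chat-style formatting goes through apply_chat_template; the constructor falls
# back to the built-in template when the checkpoint ships without one.
prompt = tokenizer.apply_chat_template(
    [{'role': 'user', 'content': 'hello world'}],
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)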