# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
"""SFT tokenizer."""

from typing import Dict, List, Union

import numpy as np

from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer
from megatron.training.datasets.sft_dataset import IGNORE_INDEX
from megatron.training.tokenizer.multimodal_tokenizer import PromptConfig

nemotron_h_aligned_custom_template = """{% for message in messages %}{% if message['role'] == 'system' %}{{ 'System\n' + message['content'].strip() + '\n' }}{% elif message['role'] == 'user' %}{{ 'User\n' + message['content'].strip() + '\n' + 'Assistant\n' }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() + '\n' }}{% endif %}{% endfor %}"""


class SFTTokenizer(MegatronTokenizer):
    """SFT Tokenizer."""

    def __init__(
        self,
        tokenizer_path: str,
        prompt_format: str,
    ):
        """
        Note: Currently, only HuggingFaceTokenizer is supported as the underlying text tokenizer.

        Args:
            tokenizer_path (str): Underlying tokenizer path.
            prompt_format (str): Prompt format for the tokenizer.
        """
        super().__init__(tokenizer_path, prompt_format=prompt_format)

        try:
            import transformers
        except ImportError:
            raise ImportError(
                "SFTTokenizer currently requires the transformers library to be installed"
            )

        # Currently, only HuggingFace tokenizers are supported.
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path=tokenizer_path,
        )
        self._vocab_size = len(tokenizer)
        self._tokenizer = tokenizer

        if prompt_format == "nemotron-h-aligned":
            self._prompt_config = PromptConfig(
                assistant_prefix_len=0,
                pad_token_id=tokenizer.convert_tokens_to_ids(""),
                custom_chat_template=nemotron_h_aligned_custom_template,
                has_bos=False,
                has_system_role=True,
            )
        else:
            raise NotImplementedError(f"unknown SFT prompt format: {prompt_format}")

        self._prompt_format = prompt_format

    def tokenize_conversation(
        self, conversation: List[Dict], return_target: bool, add_generation_prompt: bool
    ):
        """Convert a conversation to tokens.

        Args:
            conversation (List[Dict]): Sequence of system/user/assistant messages.
                Must be in the following format:
                [
                    {"role": "system", "content": "something"},
                    {"role": "user", "content": "something1"},
                    {"role": "assistant", "content": "something2"},
                ]
            return_target (bool): Return target tokens with system/user turns
                (and any assistant prefix) masked.
            add_generation_prompt (bool): Add assistant prefix to the end.
        """
        # Skip system message if the tokenizer doesn't have a system role.
        if not self._prompt_config.has_system_role and conversation[0]["role"] == "system":
            conversation = conversation[1:]

        tokens = self._tokenizer.apply_chat_template(
            conversation,
            tokenize=True,
            add_generation_prompt=add_generation_prompt,
            return_assistant_tokens_mask=False,
            return_tensors="np",
            chat_template=self._prompt_config.custom_chat_template,
        )[0]

        if not return_target:
            return tokens

        target = tokens.copy()

        # Mask system and user tokens in the target.
        idx = 0
        for turn_idx, turn in enumerate(conversation):
            if turn["role"].lower() == "assistant" and len(turn["content"]) == 0:
                raise ValueError(f"empty assistant turn in conversation: {conversation}.")
            if turn["role"].lower() == "assistant":
                assert conversation[turn_idx - 1]["role"].lower() == "user"

            # Re-tokenize each turn on its own to find its span within the full sequence.
            turn_tokens = self._tokenizer.apply_chat_template(
                [turn], tokenize=True, chat_template=self._prompt_config.custom_chat_template
            )

            # There should be only one BOS at the very beginning.
            # After the first turn, skip the BOS token.
            if self._prompt_config.has_bos and turn_idx > 0:
                turn_tokens = turn_tokens[1:]

            turn_len = len(turn_tokens)

            role = turn["role"].lower()
            if role in ("system", "user"):
                target[idx : idx + turn_len] = IGNORE_INDEX
            elif role == "assistant":
                if self._prompt_config.assistant_prefix_len > 0:
                    target[idx : idx + self._prompt_config.assistant_prefix_len] = IGNORE_INDEX
            else:
                raise ValueError(f"Wrong role value: {role}")

            assert np.allclose(
                tokens[idx : idx + turn_len], turn_tokens
            ), f"expected turn tokens to match tokens in conversation {conversation}"

            idx += turn_len

        assert idx == len(tokens), f"mismatch in target masking the conversation {conversation}"

        return tokens, target

    def tokenize(self, text: Union[str, List[Dict]]):
        """Tokenize conversation or string input."""
        if isinstance(text, list):
            # This code path is currently used by the inference code.
            return self.tokenize_conversation(
                text, return_target=False, add_generation_prompt=True
            ).tolist()

        return self._encode(text)

    def _encode(self, text: str):
        """Tokenize text input without applying the chat template."""
        return self._tokenizer.encode(text)

    def convert_tokens_to_ids(self, tokens: List[str]):
        """Convert tokens to IDs."""
        return self._tokenizer.convert_tokens_to_ids(tokens)

    def detokenize(self, tokens: List[int]):
        """Detokenize tokens."""
        return self._tokenizer.decode(tokens)

    def get_special_tokens(self):
        """Get special tokens."""
        return self._tokenizer.get_added_vocab()

    @property
    def pad(self):
        """Pad token ID."""
        return self._prompt_config.pad_token_id

    @property
    def eod(self):
        """End of sentence token ID."""
        return self._tokenizer.eos_token_id

    @property
    def vocab(self):
        """Vocab."""
        raise NotImplementedError("not used")

    @property
    def inv_vocab(self):
        """Inverse vocab."""
        raise NotImplementedError("not used")

    @property
    def vocab_size(self):
        """Vocabulary size."""
        return self._vocab_size
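

# Minimal usage sketch: "path/to/hf_tokenizer" is a placeholder, not a real
# checkpoint; point it at a HuggingFace tokenizer directory or hub name that
# is compatible with the nemotron-h-aligned prompt format.
if __name__ == "__main__":
    tok = SFTTokenizer("path/to/hf_tokenizer", prompt_format="nemotron-h-aligned")
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is 2 + 2?"},
        {"role": "assistant", "content": "4"},
    ]
    tokens, target = tok.tokenize_conversation(
        conversation, return_target=True, add_generation_prompt=False
    )
    # System and user spans of `target` are IGNORE_INDEX, so only assistant
    # tokens contribute to the SFT loss.
    print(tok.detokenize(tokens.tolist()))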