"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "c70c88a26822902c9b16714940077d3e3ec349a8"
Unverified commit a4dd53d8 authored by Arthur, committed by GitHub

Update-llama-code (#25826)



* some bug fixes

* updates

* Update code_llama.md
Co-authored-by: Omar Sanseviero <osanseviero@users.noreply.github.com>

* Add co-author
Co-authored-by: pcuenca <pedro@latenitesoft.com>

* add a test

* fixup

* nits

* some updates

* fix-copies

* address comments

* nits

* nits

* fix docstring

* Apply suggestions from code review
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* update

* add int for https://huggingface.co/spaces/hf-accelerate/model-memory-usage



---------
Co-authored-by: Omar Sanseviero <osanseviero@users.noreply.github.com>
Co-authored-by: pcuenca <pedro@latenitesoft.com>
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
parent 3587769c
@@ -49,6 +49,8 @@ Here is a sample usage
 python src/transformers/models/llama/convert_llama_weights_to_hf.py \
     --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path
 ```
+Note that executing the script requires enough CPU RAM to host the whole model in float16 precision (even if the biggest versions
+come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM).
 - After conversion, the model and tokenizer can be loaded via:
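The context line above is cut off before the loading snippet it introduces. For orientation only, a minimal sketch of that step, assuming the conversion script wrote the checkpoint to `/output/path` (these are not lines from the commit; the class names follow the usual transformers pattern):

```python
from transformers import CodeLlamaTokenizer, LlamaForCausalLM

# Load the checkpoint produced by convert_llama_weights_to_hf.py.
tokenizer = CodeLlamaTokenizer.from_pretrained("/output/path")
model = LlamaForCausalLM.from_pretrained("/output/path")
```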
@@ -90,8 +92,8 @@ If you only want the infilled part:
 >>> generator = pipeline("text-generation",model="codellama/CodeLlama-7b-hf",torch_dtype=torch.float16, device_map="auto")
 >>> generator('def remove_non_ascii(s: str) -> str:\n    """ <FILL_ME>\n    return result', max_new_tokens = 128, return_type = 1)
 ```
-Note that executing the script requires enough CPU RAM to host the whole model in float16 precision (even if the biggest versions
-come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). For the 75B model, it's thus 145GB of RAM needed.
+Under the hood, the tokenizer [automatically splits by `<FILL_ME>`](https://huggingface.co/docs/transformers/main/model_doc/code_llama#transformers.CodeLlamaTokenizer.fill_token) to create a formatted input string that follows [the original training pattern](https://github.com/facebookresearch/codellama/blob/cb51c14ec761370ba2e2bc351374a79265d0465e/llama/generation.py#L402). This is more robust than preparing the pattern yourself: it avoids pitfalls, such as token glueing, that are very hard to debug. To see how much CPU and GPU memory you need for this model or others, try [this calculator](https://huggingface.co/spaces/hf-accelerate/model-memory-usage) which can help determine that value.

 - The LLaMA tokenizer is a BPE model based on [sentencepiece](https://github.com/google/sentencepiece). One quirk of sentencepiece is that when decoding a sequence, if the first token is the start of the word (e.g. "Banana"), the tokenizer does not prepend the prefix space to the string.
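The paragraph added in this hunk explains that the tokenizer splits the prompt on `<FILL_ME>` and rebuilds the original infilling pattern. A minimal sketch of relying on that behaviour with `generate`, assuming the `codellama/CodeLlama-7b-hf` checkpoint and enough memory (standard transformers calls, not lines from this commit):

```python
import torch
from transformers import AutoModelForCausalLM, CodeLlamaTokenizer

tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")
model = AutoModelForCausalLM.from_pretrained(
    "codellama/CodeLlama-7b-hf", torch_dtype=torch.float16, device_map="auto"
)

# The prompt contains <FILL_ME>; the tokenizer splits it into prefix/suffix
# and emits the <PRE>/<SUF>/<MID> infilling pattern under the hood.
prompt = 'def remove_non_ascii(s: str) -> str:\n    """ <FILL_ME>\n    return result'
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

generated = model.generate(**inputs, max_new_tokens=128)
# Keep only the newly generated tokens, i.e. the infilled middle part.
filling = tokenizer.decode(generated[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True)
print(prompt.replace("<FILL_ME>", filling))
```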
......
@@ -64,6 +64,10 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
     Construct a CodeLlama tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as
     there is no padding token in the original model.

+    The default configuration match that of
+    [codellama/CodeLlama-7b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf/blob/main/tokenizer_config.json)
+    which supports prompt infilling.
+
     Args:
         vocab_file (`str`):
             Path to the vocabulary file.

@@ -80,8 +84,6 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
         unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (`str`, *optional*, defaults to `"<pad>"`):
-            The token used for padding, for example when batching sequences of different lengths.
         prefix_token (`str`, *optional*, defaults to `"▁<PRE>"`):
             Prefix token used for infilling.
         suffix_token (`str`, *optional*, defaults to `"▁<SUF>"`):

@@ -111,7 +113,8 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
               - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
                 BPE-dropout.
+        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
+            Whether or not the default system prompt for Llama should be used.
     """

     vocab_files_names = VOCAB_FILES_NAMES

@@ -125,7 +128,6 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
         unk_token="<unk>",
         bos_token="<s>",
         eos_token="</s>",
-        pad_token=None,
         prefix_token="▁<PRE>",
         middle_token="▁<MID>",
         suffix_token="▁<SUF>",

@@ -136,6 +138,8 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
         add_bos_token=True,
         add_eos_token=False,
         clean_up_tokenization_spaces=False,
+        additional_special_tokens=None,
+        use_default_system_prompt=False,
         **kwargs,
     ):
         requires_backends(self, "protobuf")

@@ -143,16 +147,17 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
         bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
         eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
         unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
-        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+        self.use_default_system_prompt = use_default_system_prompt

         # mark tokens special to skip them
-        additional_special_tokens = kwargs.pop("additional_special_tokens", [])
-        additional_special_tokens += [prefix_token, middle_token, suffix_token, eot_token]
+        additional_special_tokens = additional_special_tokens or []
+        for token in [prefix_token, middle_token, suffix_token, eot_token]:
+            additional_special_tokens += [token] if token is not None else []

         super().__init__(
             bos_token=bos_token,
             eos_token=eos_token,
             unk_token=unk_token,
-            pad_token=pad_token,
             add_bos_token=add_bos_token,
             add_eos_token=add_eos_token,
             prefix_token=prefix_token,

@@ -164,6 +169,7 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
             suffix_first=suffix_first,
             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
             additional_special_tokens=additional_special_tokens,
+            use_default_system_prompt=use_default_system_prompt,
             **kwargs,
         )
         self.vocab_file = vocab_file

@@ -239,6 +245,7 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
         """Returns vocab size"""
         return self.sp_model.get_piece_size()

+    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.get_vocab
     def get_vocab(self):
         """Returns vocab as a dict"""
         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}

@@ -247,7 +254,7 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
     def tokenize(self, prefix, suffix=None, suffix_first=False, **kwargs) -> List[int]:
         # add a prefix space to `prefix`
-        if self.fill_token in prefix and suffix is None:
+        if self.fill_token is not None and self.fill_token in prefix and suffix is None:
             prefix, suffix = prefix.split(self.fill_token)

         if len(prefix) > 0:

@@ -263,9 +270,9 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
         if None in (self.prefix_id, self.middle_id, self.suffix_id):
             raise ValueError(
-                "Then input includes a `prefix` and a `suffix` used for the infilling task,"
-                " the `prefix_id, middle_id, suffix_id` must all be initialized. Current"
-                f" values : {self.prefix_id, self.middle_id, self.suffix_id}"
+                "The input either includes a `prefix` and a `suffix` used for the infilling task,"
+                f" or can be split on the {self.fill_token} token, creating a suffix and prefix,"
+                " but the model does not support `infilling`."
             )
         suffix_tokens = self._tokenize(suffix)  # make sure CodeLlama sp model does not mess up

@@ -293,10 +300,12 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
         # 2. Remove self.unk_token from ['<','unk','>', '▁Hey']
         return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens

+    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer._convert_token_to_id
     def _convert_token_to_id(self, token):
         """Converts a token (str) in an id using the vocab."""
         return self.sp_model.piece_to_id(token)

+    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer._convert_id_to_token
     def _convert_id_to_token(self, index):
         """Converts an index (integer) in a token (str) using the vocab."""
         token = self.sp_model.IdToPiece(index)

@@ -320,6 +329,7 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
         out_string += self.sp_model.decode(current_sub_tokens)
         return out_string

+    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.save_vocabulary
     def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
         """
         Save the vocabulary and special tokens file to a directory.

@@ -347,6 +357,7 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
         return (out_vocab_file,)

+    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.build_inputs_with_special_tokens
     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         bos_token_id = [self.bos_token_id] if self.add_bos_token else []
         eos_token_id = [self.eos_token_id] if self.add_eos_token else []

@@ -358,6 +369,7 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
         return output

+    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.get_special_tokens_mask
     def get_special_tokens_mask(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
     ) -> List[int]:

@@ -395,6 +407,7 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
             + eos_token_id
         )

+    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.create_token_type_ids_from_sequences
     def create_token_type_ids_from_sequences(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
     ) -> List[int]:

@@ -443,7 +456,7 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
         >>> from transformers import Conversation

         >>> Conversation(
-        ...     "<<SYS>>\n Only answer with emojis, and charades\n<</SYS>>\n\nHow can I build a house in 10 septs?"
+        ...     "<<SYS>>\n Complete the functions without any documentation\n<</SYS>>\n\n `def remove_non_ascii(s: str) -> str:`"
         ... ) # doctest: +IGNORE_RESULT
         ```

         Args:

@@ -453,8 +466,12 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
             `List[int]`:
                 Input ids for the conversation.
         """
-        if len(conversation.past_user_inputs) > 0:
-            if not conversation.past_user_inputs[0].startswith(B_SYS) or E_SYS not in conversation.past_user_inputs[0]:
-                conversation.past_user_inputs[0] = (
-                    B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
-                )
+        if self.use_default_system_prompt:
+            if len(conversation.past_user_inputs) > 0:
+                if (
+                    not conversation.past_user_inputs[0].startswith(B_SYS)
+                    or E_SYS not in conversation.past_user_inputs[0]
+                ):
+                    conversation.past_user_inputs[0] = (
+                        B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
+                    )
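A behavioural note on the `tokenize` changes above: if the infilling special tokens were disabled at construction time, a prompt containing `<FILL_ME>` now fails with the clearer error message introduced here (this is also what the new test below exercises). A small sketch, with a hypothetical local sentencepiece file standing in for a real vocabulary:

```python
from transformers import CodeLlamaTokenizer

# Hypothetical path to a Llama-style sentencepiece vocabulary file.
tokenizer = CodeLlamaTokenizer(
    "tokenizer.model", prefix_token=None, middle_token=None, suffix_token=None
)

try:
    tokenizer.tokenize('def f():\n    """ <FILL_ME>\n    return x')
except ValueError as err:
    # prefix_id/middle_id/suffix_id are None, so infilling is unsupported.
    print(err)
```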
......
@@ -73,7 +73,9 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
     This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
+    refer to this superclass for more information regarding those methods. The default configuration match that of
+    [codellama/CodeLlama-7b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf/blob/main/tokenizer_config.json)
+    which supports prompt infilling.

     Args:
         vocab_file (`str`):

@@ -104,6 +106,10 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
             The token used to split the input between the prefix and suffix.
         suffix_first (`bool`, *optional*, default to `False`):
             Whether the input prompt and suffix should be formatted with the suffix first.
+        additional_special_tokens (`List[str]`, *optional*):
+            Additional special tokens used by the tokenizer.
+        use_default_system_prompt (`bool`, *optional*, defaults to `True`):
+            Whether or not the default system prompt for Llama should be used.
     """

     vocab_files_names = VOCAB_FILES_NAMES

@@ -124,13 +130,18 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
         suffix_token="▁<SUF>",
         eot_token="▁<EOT>",
         fill_token="<FILL_ME>",
+        additional_special_tokens=None,
         add_bos_token=True,
         add_eos_token=False,
+        use_default_system_prompt=False,
         **kwargs,
     ):
         # mark tokens special to skip them
-        additional_special_tokens = kwargs.pop("additional_special_tokens", [])
-        additional_special_tokens += [prefix_token, middle_token, suffix_token, eot_token]
+        additional_special_tokens = additional_special_tokens or []
+        for token in [prefix_token, middle_token, suffix_token, eot_token]:
+            additional_special_tokens += [token] if token is not None else []
+        self.use_default_system_prompt = use_default_system_prompt
+
         super().__init__(
             vocab_file=vocab_file,
             tokenizer_file=tokenizer_file,

@@ -144,6 +155,7 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
             suffix_token=suffix_token,
             eot_token=eot_token,
             fill_token=fill_token,
+            use_default_system_prompt=use_default_system_prompt,
             **kwargs,
         )
         self._add_bos_token = add_bos_token

@@ -162,6 +174,7 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
     def can_save_slow_tokenizer(self) -> bool:
         return os.path.isfile(self.vocab_file) if self.vocab_file else False

+    # Copied from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast.update_post_processor
     def update_post_processor(self):
         """
         Updates the underlying post processor with the current `bos_token` and `eos_token`.

@@ -300,6 +313,7 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
         self.set_infilling_processor(True)
         return tokens

+    # Copied from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast.save_vocabulary
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not self.can_save_slow_tokenizer:
             raise ValueError(

@@ -343,12 +357,12 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
         Returns:
             `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
+        # TODO process the ids for fast? Or update the template processing for infilling task when using `tokenize_infilling`
         if token_ids_1 is None:
-            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
-        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
+            return self.bos_token_id + token_ids_0 + self.eos_token_id
+        return self.bos_token_id + token_ids_0 + token_ids_1 + self.eos_token_id

-    def _build_conversation_input_ids(self, conversation: "Conversation"):
+    # Copied from transformers.models.code_llama.tokenization_code_llama.CodeLlamaTokenizer._build_conversation_input_ids
+    def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
         r"""Builds the input ids for a conversation.
         This is the format used in the provided examples. System prompts should be manually added at the beginning of
         the conversation. If no system prompt is given, the `DEFAULT_SYSTEM_PROMPT` will be used.

@@ -363,7 +377,7 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
         >>> from transformers import Conversation

         >>> Conversation(
-        ...     "<<SYS>>\n Only answer with emojis, and charades\n<</SYS>>\n\nHow can I build a house in 10 septs?"
+        ...     "<<SYS>>\n Complete the functions without any documentation\n<</SYS>>\n\n `def remove_non_ascii(s: str) -> str:`"
         ... ) # doctest: +IGNORE_RESULT
         ```

         Args:

@@ -373,8 +387,12 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
             `List[int]`:
                 Input ids for the conversation.
         """
-        if len(conversation.past_user_inputs) > 0:
-            if not conversation.past_user_inputs[0].startswith(B_SYS) or E_SYS not in conversation.past_user_inputs[0]:
-                conversation.past_user_inputs[0] = (
-                    B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
-                )
+        if self.use_default_system_prompt:
+            if len(conversation.past_user_inputs) > 0:
+                if (
+                    not conversation.past_user_inputs[0].startswith(B_SYS)
+                    or E_SYS not in conversation.past_user_inputs[0]
+                ):
+                    conversation.past_user_inputs[0] = (
+                        B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
+                    )

@@ -392,7 +410,7 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
                 "The model only supports 'user' and 'assistant' roles, starting with user and alternating (u/a/u/a/u...)"
             )

-        dialog_tokens = []
+        dialog_tokens: List[int] = []
         dialog_tokens += sum(
             [
                 [self.bos_token_id]
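The new `use_default_system_prompt` flag gates the automatic injection of `DEFAULT_SYSTEM_PROMPT` in `_build_conversation_input_ids`; with the default of `False`, a system prompt has to be written into the first message, as the updated docstring example shows. A sketch of that usage, assuming the Instruct checkpoint (not lines from this commit, and `_build_conversation_input_ids` is normally called by the conversational pipeline rather than directly):

```python
from transformers import Conversation, CodeLlamaTokenizerFast

tokenizer = CodeLlamaTokenizerFast.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")

# With use_default_system_prompt=False (the new default), the user message is
# tokenized as-is; pass an explicit <<SYS>> block if a system prompt is wanted.
conversation = Conversation(
    "<<SYS>>\n Complete the functions without any documentation\n<</SYS>>\n\n `def remove_non_ascii(s: str) -> str:`"
)
input_ids = tokenizer._build_conversation_input_ids(conversation)
print(len(input_ids))
```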
......
@@ -65,6 +65,11 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         tokenizer.pad_token = tokenizer.eos_token
         tokenizer.save_pretrained(self.tmpdirname)

+    def test_no_infilling_init(self):
+        tokenizer = CodeLlamaTokenizer(SAMPLE_VOCAB, prefix_token=None, keep_accents=True)
+        with self.assertRaises(ValueError):
+            tokenizer.tokenize("This is <FILL_ME> prefix")
+
     def test_full_tokenizer(self):
         tokenizer = CodeLlamaTokenizer(SAMPLE_VOCAB, keep_accents=True)

@@ -587,8 +592,8 @@ split,
 end
 """,
         ]
-        tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")
-        tokenizer_fast = CodeLlamaTokenizerFast.from_pretrained("codellama/CodeLlama-7b-hf")
+        tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")
+        tokenizer_fast = CodeLlamaTokenizerFast.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")

         formatted_prompt = tokenizer.tokenize(PROMPTS[0])
         self.assertEqual(formatted_prompt, tokenizer_fast.tokenize(PROMPTS[0]))
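Because the tokenizer constructors no longer set up a `pad_token` (the `pad_token` argument and its docstring were removed above), code that batches with padding has to assign one explicitly, as the test setup above already does with `tokenizer.pad_token = tokenizer.eos_token`. A minimal sketch of that pattern (checkpoint name as in the tests; not part of this commit):

```python
from transformers import CodeLlamaTokenizerFast

tokenizer = CodeLlamaTokenizerFast.from_pretrained("codellama/CodeLlama-7b-hf")

# No pad token is defined by default; reuse EOS for padding when batching prompts.
tokenizer.pad_token = tokenizer.eos_token
batch = tokenizer(["def add(a, b):", "print('hello')"], padding=True, return_tensors="pt")
print(batch["input_ids"].shape)
```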
......