"tests/models/mpnet/test_tokenization_mpnet.py" did not exist on "00aa9dbca29dcf0e3a624354ef5c80a8e5226339"
Unverified commit 015f8e11, authored by Arthur and committed by GitHub

[`CodeLlama`] Add support for `CodeLlama` (#25740)



* add all

* Revert "Delete .github directory"

This reverts commit 9b0ff7b052e2b20b629a26fb13606b78a42944d1.

* make conversion script backward compatible

* fixup

* more styling

* copy to llama changes

* fix repo consistency

* nits

* document correct classes

* updates

* more fixes

* nits

* update auto mappings

* add readmes

* small updates

* llama-code replace with llama_code

* make fixup

* updates to the testing suite

* fix fast nits

* more small fixes

* fix decode

* fix template processing

* properly reset the normalizer

* nits processor

* tokenization tests pass

* styling

* last tests

* additional nits

* one test is left

* nits

Co-authored-by: faabian <faabian@users.noreply.github.com>

* update failing test

* fixup

* remove decode infilling; users should handle it on their own after generation, since padding can be a problem

* update

* make test slow and more meaningful

* fixup

* doc update

* fixup

* Apply suggestions from code review

* add kwargs doc

* tokenizer requires `requires_backend`

* type requires_backends

* CodeLlama instead of LlamaCode

* more name changes

* nits

* make doctests happy

* small pipeline nits

* last nit

* Apply suggestions from code review
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* update

* add codellama to toctree

---------
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent 74081cb5
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from shutil import copyfile
from typing import TYPE_CHECKING, List, Optional, Tuple
from tokenizers import normalizers, processors
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging
from ...utils.versions import require_version
if TYPE_CHECKING:
from transformers.pipelines.conversational import Conversation
require_version("tokenizers>=0.13.3")
if is_sentencepiece_available():
from .tokenization_code_llama import CodeLlamaTokenizer
else:
CodeLlamaTokenizer = None
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model", "tokenizer_file": "tokenizer.json"}
SPIECE_UNDERLINE = "▁"
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
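# A system prompt is wrapped as f"{B_SYS}{system_prompt}{E_SYS}" and prepended to the first user message, while
# each user/assistant round is wrapped as f"{B_INST} {user} {E_INST} {answer}"; see
# `_build_conversation_input_ids` below.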
# fmt: off
DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your \
answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\
that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not \
correct. If you don't know the answer to a question, please don't share false information."""
# fmt: on
class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a Llama tokenizer, based on byte-level Byte-Pair-Encoding. This notably uses ByteFallback and no
normalization.
```python
>>> from transformers import CodeLlamaTokenizerFast
>>> tokenizer = CodeLlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
>>> tokenizer.encode("Hello this is a test")
[1, 15043, 445, 338, 263, 1243]
```
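The tokenizer also supports infilling: when the input contains the `fill_token` (`"<FILL_ME>"` by default), it is
split into a prefix and a suffix and wrapped with the `prefix_token`, `suffix_token` and `middle_token` special
tokens before being encoded. A minimal sketch (the checkpoint name is illustrative and the exact output depends on
its vocabulary):
```python
>>> tokenizer = CodeLlamaTokenizerFast.from_pretrained("codellama/CodeLlama-7b-hf")  # doctest: +SKIP
>>> ids = tokenizer("def fib(n):<FILL_ME>    return result")["input_ids"]  # doctest: +SKIP
>>> tokenizer.decode(ids)  # roughly "<s> <PRE> def fib(n): <SUF>    return result <MID>"  # doctest: +SKIP
```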
If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or
call `tokenizer.update_post_processor()` to make sure that the post-processing is correctly done (otherwise the
values of the first token and final token of an encoded sequence will not be correct). For more details, check out
the [post-processors](https://huggingface.co/docs/tokenizers/api/post-processors) documentation.
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
refer to this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
contains the vocabulary necessary to instantiate a tokenizer.
tokenizer_file (`str`):
[tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
contains everything needed to load the tokenizer.
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
Whether to clean up spaces after decoding; cleanup consists of removing potential artifacts like extra
spaces.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
prefix_token (`str`, *optional*, defaults to `"▁<PRE>"`):
Prefix token used for infilling.
suffix_token (`str`, *optional*, defaults to `"▁<SUF>"`):
Suffix token used for infilling.
middle_token (`str`, *optional*, defaults to `"▁<MID>"`):
Middle token used for infilling.
eot_token (`str`, *optional*, defaults to `"▁<EOT>"`):
End of text token used for infilling.
fill_token (`str`, *optional*, defaults to `"<FILL_ME>"`):
The token used to split the input between the prefix and suffix.
suffix_first (`bool`, *optional*, defaults to `False`):
Whether the input prompt and suffix should be formatted with the suffix first.
"""
vocab_files_names = VOCAB_FILES_NAMES
slow_tokenizer_class = CodeLlamaTokenizer
padding_side = "left"
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
clean_up_tokenization_spaces=False,
unk_token="<unk>",
bos_token="<s>",
eos_token="</s>",
prefix_token="▁<PRE>",
middle_token="▁<MID>",
suffix_token="▁<SUF>",
eot_token="▁<EOT>",
fill_token="<FILL_ME>",
add_bos_token=True,
add_eos_token=False,
**kwargs,
):
# mark tokens special to skip them
additional_special_tokens = kwargs.pop("additional_special_tokens", [])
additional_special_tokens += [prefix_token, middle_token, suffix_token, eot_token]
super().__init__(
vocab_file=vocab_file,
tokenizer_file=tokenizer_file,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
additional_special_tokens=additional_special_tokens,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
prefix_token=prefix_token,
middle_token=middle_token,
suffix_token=suffix_token,
eot_token=eot_token,
fill_token=fill_token,
**kwargs,
)
self._add_bos_token = add_bos_token
self._add_eos_token = add_eos_token
self.update_post_processor()
self.vocab_file = vocab_file
self.can_save_slow_tokenizer = False if not self.vocab_file else True
self._prefix_token = prefix_token
self._middle_token = middle_token
self._suffix_token = suffix_token
self._eot_token = eot_token
self.fill_token = fill_token
def update_post_processor(self):
"""
Updates the underlying post processor with the current `bos_token` and `eos_token`.
"""
bos = self.bos_token
bos_token_id = self.bos_token_id
eos = self.eos_token
eos_token_id = self.eos_token_id
single = f"{(bos+':0 ') * self.add_bos_token}$A:0{(' '+eos+':0') * self.add_eos_token}"
pair = f"{single}{(' '+bos+':1') * self.add_bos_token} $B:1{(' '+eos+':1') * self.add_eos_token}"
special_tokens = []
if self.add_bos_token:
special_tokens.append((bos, bos_token_id))
if self.add_eos_token:
special_tokens.append((eos, eos_token_id))
self._tokenizer.post_processor = processors.TemplateProcessing(
single=single, pair=pair, special_tokens=special_tokens
)
@property
def prefix_token(self):
return self._prefix_token
@property
def prefix_id(self):
if self._prefix_token is None:
return None
return self.convert_tokens_to_ids(self.prefix_token)
@property
def middle_token(self):
return self._middle_token
@property
def middle_id(self):
if self._middle_token is None:
return None
return self.convert_tokens_to_ids(self.middle_token)
@property
def suffix_token(self):
return self._suffix_token
@property
def suffix_id(self):
if self._suffix_token is None:
return None
return self.convert_tokens_to_ids(self.suffix_token)
@property
def eot_id(self):
if self._eot_token is None:
return None
return self.convert_tokens_to_ids(self.eot_token)
@property
def eot_token(self):
return self._eot_token
@property
def add_eos_token(self):
return self._add_eos_token
@property
def add_bos_token(self):
return self._add_bos_token
@add_eos_token.setter
def add_eos_token(self, value):
self._add_eos_token = value
self.update_post_processor()
@add_bos_token.setter
def add_bos_token(self, value):
self._add_bos_token = value
self.update_post_processor()
def set_infilling_processor(self, reset, suffix_first=False, add_special_tokens=True):
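# When `reset` is True, restore the default normalizer and post-processor and return; otherwise install a
# temporary template that arranges the prefix/suffix/middle special tokens around `$A` (the prefix) and `$B`
# (the suffix) for the infilling task.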
if reset:
self._tokenizer.normalizer = normalizers.Sequence(
[
normalizers.Prepend(prepend="▁"),
normalizers.Replace(pattern=" ", content="▁"),
]
)
self.update_post_processor()
return
self._tokenizer.normalizer = normalizers.Replace(pattern=" ", content="▁")
pair = [self.bos_token] if self.add_bos_token and add_special_tokens else []
special_tokens = [(self.bos_token, self.bos_token_id)] if self.add_bos_token and add_special_tokens else []
if suffix_first:
# format as " <PRE> <SUF>{suf} <MID> {pre}"
pair += [self.prefix_token, self.suffix_token, "$A", self.middle_token, "$B"]
special_tokens += [
(self.prefix_token, self.prefix_id),
(self.suffix_token, self.suffix_id),
(self.middle_token, self.middle_id),
]
else:
# format as " <PRE> {pre} <SUF>{suf} <MID>"
pair += [self.prefix_token, "$A", self.suffix_token, "$B", self.middle_token]
special_tokens += [
(self.prefix_token, self.prefix_id),
(self.suffix_token, self.suffix_id),
(self.middle_token, self.middle_id),
]
if self.add_eos_token and add_special_tokens:
pair += [self.eos_token]
special_tokens += [(self.eos_token, self.eos_token_id)]
self._tokenizer.post_processor = processors.TemplateProcessing(
single="$A", pair=pair, special_tokens=special_tokens
)
def encode_plus(self, text, text_pair=None, suffix_first=False, add_special_tokens=True, **kwargs):
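# If the prompt contains `self.fill_token`, split it into a (prefix, suffix) pair and temporarily switch the
# backend post-processor to the infilling template before calling the parent `encode_plus`.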
# hack to make sure the input is pre-processed here, outside of the fast (Rust) tokenizer
text_pair = kwargs.pop("suffix", text_pair)
if self.fill_token in text and text_pair is None:
text, text_pair = text.split(self.fill_token)
if text_pair is None or len(text_pair) < 1:
return super().encode_plus(text, text_pair, add_special_tokens=add_special_tokens, **kwargs)
if None in (self.prefix_id, self.middle_id, self.suffix_id):
raise ValueError(
"Then input includes a `prefix` and a `suffix` used for the infilling task,"
" the `prefix_id, middle_id, suffix_id` must all be initialized. Current"
f" values : {self.prefix_id, self.middle_id, self.suffix_id}"
)
self.set_infilling_processor(False, suffix_first=suffix_first, add_special_tokens=add_special_tokens)
tokens = super().encode_plus(" " + text, text_pair=text_pair, add_special_tokens=True, **kwargs)
self.set_infilling_processor(True)
return tokens
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not self.can_save_slow_tokenizer:
raise ValueError(
"Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
"tokenizer."
)
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
return (out_vocab_file,)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences by concatenating the tokenizer's prefix tokens, the
token ids and its suffix tokens. Pairs of sequences are not the expected use case, but they will be handled
without a separator.
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
# TODO process the ids for fast? Or update the template processing for infilling task when using `tokenize_infilling`
if token_ids_1 is None:
return self.prefix_tokens + token_ids_0 + self.suffix_tokens
return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
def _build_conversation_input_ids(self, conversation: "Conversation"):
r"""Builds the input ids for a conversation.
This is the format used in the provided examples. System prompts should be manually added at the beginning of
the conversation. If no system prompt is given, the `DEFAULT_SYSTEM_PROMPT` will be used.
```
<bos>[INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer <eos>
<bos>[INST] Prompt [/INST] Answer <eos>
<bos>[INST] Prompt [/INST]
```
If you want to use your own system prompt, make sure to include both `B_SYS` and `E_SYS`, as in the following:
```python
>>> from transformers import Conversation
>>> Conversation(
... "<<SYS>>\n Only answer with emojis, and charades\n<</SYS>>\n\nHow can I build a house in 10 septs?"
... ) # doctest: +IGNORE_RESULT
```
Args:
conversation (`Conversation`):
Conversation to build input ids for.
Returns:
`List[int]`:
Input ids for the conversation.
"""
if len(conversation.past_user_inputs) > 0:
if not conversation.past_user_inputs[0].startswith(B_SYS) or E_SYS not in conversation.past_user_inputs[0]:
conversation.past_user_inputs[0] = (
B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
)
elif conversation.new_user_input:
if not conversation.new_user_input.startswith(B_SYS) or E_SYS not in conversation.new_user_input:
conversation.new_user_input = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.new_user_input
else:
raise ValueError("Last message must be from user")
dialogue = list(conversation.iter_texts())
if not all([is_user for is_user, msg in dialogue[::2]]) or not all(
[not is_user for is_user, msg in dialogue[1::2]]
):
raise ValueError(
"The model only supports 'user' and 'assistant' roles, starting with user and alternating (u/a/u/a/u...)"
)
dialog_tokens = []
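# Each completed (user, assistant) round is encoded as "<bos> [INST] user [/INST] answer <eos>"; the trailing
# user prompt is then appended as "<bos> [INST] prompt [/INST]" without an <eos>, ready for generation.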
dialog_tokens += sum(
[
[self.bos_token_id]
+ self.encode(
f"{B_INST} {(prompt[1]).strip()} {E_INST} {(answer[1]).strip()} ", add_special_tokens=False
)
+ [self.eos_token_id]
for prompt, answer in zip(dialogue[::2], dialogue[1::2])
],
[],
)
dialog_tokens += [self.bos_token_id] + self.encode(
f"{B_INST} {(dialogue[-1][1]).strip()} {E_INST}", add_special_tokens=False
)
return dialog_tokens
......@@ -246,18 +246,26 @@ class OpenLlamaAttention(nn.Module):
def _init_rope(self):
if self.config.rope_scaling is None:
self.rotary_emb = OpenLlamaRotaryEmbedding(
self.head_dim, max_position_embeddings=self.max_position_embeddings
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
base=self.rope_theta,
)
else:
scaling_type = self.config.rope_scaling["type"]
scaling_factor = self.config.rope_scaling["factor"]
if scaling_type == "linear":
self.rotary_emb = OpenLlamaLinearScalingRotaryEmbedding(
self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
scaling_factor=scaling_factor,
base=self.rope_theta,
)
elif scaling_type == "dynamic":
self.rotary_emb = OpenLlamaDynamicNTKScalingRotaryEmbedding(
self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
scaling_factor=scaling_factor,
base=self.rope_theta,
)
else:
raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
......
......@@ -66,8 +66,8 @@ class LlamaConfig(PretrainedConfig):
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens,
Llama 2 up to 4096, CodeLlama up to 16384.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-12):
......@@ -77,6 +77,8 @@ class LlamaConfig(PretrainedConfig):
relevant if `config.is_decoder=True`.
tie_word_embeddings(`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format
......@@ -121,6 +123,7 @@ class LlamaConfig(PretrainedConfig):
eos_token_id=2,
pretraining_tp=1,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
**kwargs,
):
......@@ -141,6 +144,7 @@ class LlamaConfig(PretrainedConfig):
self.rms_norm_eps = rms_norm_eps
self.pretraining_tp = pretraining_tp
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self._rope_scaling_validation()
......
......@@ -53,18 +53,12 @@ Important note: you need to be able to host the whole model in RAM to execute th
come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM).
"""
INTERMEDIATE_SIZE_MAP = {
"7B": 11008,
"13B": 13824,
"30B": 17920,
"65B": 22016,
"70B": 28672,
}
NUM_SHARDS = {
"7B": 1,
"7Bf": 1,
"13B": 2,
"13Bf": 2,
"34B": 4,
"30B": 4,
"65B": 8,
"70B": 8,
......@@ -86,7 +80,11 @@ def write_json(text, path):
json.dump(text, f)
def write_model(model_path, input_base_path, model_size, safe_serialization=True):
def write_model(model_path, input_base_path, model_size, tokenizer_path=None, safe_serialization=True):
# for backward compatibility, before you needed the repo to be called `my_repo/model_size`
if not os.path.isfile(os.path.join(input_base_path, "params.json")):
input_base_path = os.path.join(input_base_path, model_size)
os.makedirs(model_path, exist_ok=True)
tmp_model_path = os.path.join(model_path, "tmp")
os.makedirs(tmp_model_path, exist_ok=True)
......@@ -98,8 +96,18 @@ def write_model(model_path, input_base_path, model_size, safe_serialization=True
n_heads_per_shard = n_heads // num_shards
dim = params["dim"]
dims_per_head = dim // n_heads
base = 10000.0
base = params.get("rope_theta", 10000.0)
inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
if base > 10000.0:
max_position_embeddings = 16384
else:
max_position_embeddings = 2048
tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast
if tokenizer_path is not None:
tokenizer = tokenizer_class(tokenizer_path)
tokenizer.save_pretrained(model_path)
vocab_size = tokenizer.vocab_size if tokenizer_path is not None else 32000
if "n_kv_heads" in params:
num_key_value_heads = params["n_kv_heads"] # for GQA / MQA
......@@ -247,6 +255,9 @@ def write_model(model_path, input_base_path, model_size, safe_serialization=True
num_hidden_layers=params["n_layers"],
rms_norm_eps=params["norm_eps"],
num_key_value_heads=num_key_value_heads,
vocab_size=vocab_size,
rope_theta=base,
max_position_embeddings=max_position_embeddings,
)
config.save_pretrained(tmp_model_path)
......@@ -256,10 +267,10 @@ def write_model(model_path, input_base_path, model_size, safe_serialization=True
gc.collect()
print("Loading the checkpoint in a Llama model.")
model = LlamaForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
model = LlamaForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True)
# Avoid saving this as part of the config.
del model.config._name_or_path
model.config.torch_dtype = torch.float16
print("Saving in the Transformers format.")
model.save_pretrained(model_path, safe_serialization=safe_serialization)
shutil.rmtree(tmp_model_path)
......@@ -281,7 +292,7 @@ def main():
)
parser.add_argument(
"--model_size",
choices=["7B", "7Bf", "13B", "13Bf", "30B", "65B", "70B", "70Bf", "tokenizer_only"],
choices=["7B", "7Bf", "13B", "13Bf", "30B", "34B", "65B", "70B", "70Bf", "tokenizer_only"],
help="'f' models correspond to the finetuned versions, and are specific to the Llama2 official release. For more details on Llama2, checkout the original repo: https://huggingface.co/meta-llama",
)
parser.add_argument(
......@@ -290,15 +301,17 @@ def main():
)
parser.add_argument("--safe_serialization", type=bool, help="Whether or not to save using `safetensors`.")
args = parser.parse_args()
spm_path = os.path.join(args.input_dir, "tokenizer.model")
if args.model_size != "tokenizer_only":
write_model(
model_path=args.output_dir,
input_base_path=os.path.join(args.input_dir, args.model_size),
input_base_path=args.input_dir,
model_size=args.model_size,
safe_serialization=args.safe_serialization,
tokenizer_path=spm_path,
)
spm_path = os.path.join(args.input_dir, "tokenizer.model")
write_tokenizer(args.output_dir, spm_path)
else:
write_tokenizer(args.output_dir, spm_path)
if __name__ == "__main__":
......
......@@ -246,6 +246,7 @@ class LlamaAttention(nn.Module):
self.num_key_value_heads = config.num_key_value_heads
self.num_key_value_groups = self.num_heads // self.num_key_value_heads
self.max_position_embeddings = config.max_position_embeddings
self.rope_theta = config.rope_theta
if (self.head_dim * self.num_heads) != self.hidden_size:
raise ValueError(
......@@ -260,17 +261,27 @@ class LlamaAttention(nn.Module):
def _init_rope(self):
if self.config.rope_scaling is None:
self.rotary_emb = LlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings)
self.rotary_emb = LlamaRotaryEmbedding(
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
base=self.rope_theta,
)
else:
scaling_type = self.config.rope_scaling["type"]
scaling_factor = self.config.rope_scaling["factor"]
if scaling_type == "linear":
self.rotary_emb = LlamaLinearScalingRotaryEmbedding(
self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
scaling_factor=scaling_factor,
base=self.rope_theta,
)
elif scaling_type == "dynamic":
self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding(
self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
scaling_factor=scaling_factor,
base=self.rope_theta,
)
else:
raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
......
......@@ -100,14 +100,15 @@ class TextGenerationPipeline(Pipeline):
prefix=None,
handle_long_generation=None,
stop_sequence=None,
add_special_tokens=False,
**generate_kwargs,
):
preprocess_params = {}
preprocess_params = {"add_special_tokens": add_special_tokens}
if prefix is not None:
preprocess_params["prefix"] = prefix
if prefix:
prefix_inputs = self.tokenizer(
prefix, padding=False, add_special_tokens=False, return_tensors=self.framework
prefix, padding=False, add_special_tokens=add_special_tokens, return_tensors=self.framework
)
generate_kwargs["prefix_length"] = prefix_inputs["input_ids"].shape[-1]
......@@ -203,9 +204,11 @@ class TextGenerationPipeline(Pipeline):
"""
return super().__call__(text_inputs, **kwargs)
def preprocess(self, prompt_text, prefix="", handle_long_generation=None, **generate_kwargs):
def preprocess(
self, prompt_text, prefix="", handle_long_generation=None, add_special_tokens=False, **generate_kwargs
):
inputs = self.tokenizer(
prefix + prompt_text, padding=False, add_special_tokens=False, return_tensors=self.framework
prefix + prompt_text, padding=False, add_special_tokens=add_special_tokens, return_tensors=self.framework
)
inputs["prompt_text"] = prompt_text
......@@ -299,10 +302,9 @@ class TextGenerationPipeline(Pipeline):
)
)
all_text = text[prompt_length:]
if return_type == ReturnType.FULL_TEXT:
all_text = prompt_text + text[prompt_length:]
else:
all_text = text[prompt_length:]
all_text = prompt_text + all_text
record = {"generated_text": all_text}
records.append(record)
......
......@@ -44,6 +44,13 @@ class CamembertTokenizer(metaclass=DummyObject):
requires_backends(self, ["sentencepiece"])
class CodeLlamaTokenizer(metaclass=DummyObject):
_backends = ["sentencepiece"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["sentencepiece"])
class CpmTokenizer(metaclass=DummyObject):
_backends = ["sentencepiece"]
......
......@@ -72,6 +72,13 @@ class CLIPTokenizerFast(metaclass=DummyObject):
requires_backends(self, ["tokenizers"])
class CodeLlamaTokenizerFast(metaclass=DummyObject):
_backends = ["tokenizers"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["tokenizers"])
class CodeGenTokenizerFast(metaclass=DummyObject):
_backends = ["tokenizers"]
......
......@@ -20,7 +20,7 @@ import unittest
from parameterized import parameterized
from transformers import LlamaConfig, is_torch_available, set_seed
from transformers.testing_utils import require_torch, slow, torch_device
from transformers.testing_utils import require_torch, require_torch_gpu, slow, torch_device
from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
......@@ -31,7 +31,13 @@ from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
from transformers import LlamaForCausalLM, LlamaForSequenceClassification, LlamaModel, LlamaTokenizer
from transformers import (
CodeLlamaTokenizer,
LlamaForCausalLM,
LlamaForSequenceClassification,
LlamaModel,
LlamaTokenizer,
)
class LlamaModelTester:
......@@ -450,3 +456,85 @@ class LlamaIntegrationTest(unittest.TestCase):
generated_ids = model.generate(input_ids, max_new_tokens=64, top_p=None, temperature=1, do_sample=False)
text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
self.assertEqual(EXPECTED_TEXT_COMPLETION, text)
@require_torch
class CodeLlamaIntegrationTest(unittest.TestCase):
PROMPTS = [
'''def remove_non_ascii(s: str) -> str:
""" <FILL_ME>
return result
''',
"""# Installation instructions:
```bash
<FILL_ME>
```
This downloads the LLaMA inference code and installs the repository as a local pip package.
""",
"""class InterfaceManagerFactory(AbstractManagerFactory):
def __init__(<FILL_ME>
def main():
factory = InterfaceManagerFactory(start=datetime.now())
managers = []
for i in range(10):
managers.append(factory.build(id=i))
""",
"""/-- A quasi-prefunctoid is 1-connected iff all its etalisations are 1-connected. -/
theorem connected_iff_etalisation [C D : precategoroid] (P : quasi_prefunctoid C D) :
π₁ P = 0 ↔ <FILL_ME> = 0 :=
begin
split,
{ intros h f,
rw pi_1_etalisation at h,
simp [h],
refl
},
{ intro h,
have := @quasi_adjoint C D P,
simp [←pi_1_etalisation, this, h],
refl
}
end
""",
]
@require_torch_gpu
@slow
def test_model_7b_logits(self):
model = LlamaForCausalLM.from_pretrained("codellama/CodeLlama-7b-hf").to(torch_device)
tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")
# Tokenize and prepare for the model a list of sequences or a list of pairs of sequences.
# meaning by default this supports passing a split list of inputs
processed_text = tokenizer.batch_decode(tokenizer(self.PROMPTS)["input_ids"], add_special_tokens=False)
# fmt: off
EXPECTED_TEXT = [
'<s> <PRE> def remove_non_ascii(s: str) -> str:\n """ <SUF>\n return result\n <MID>',
'<s> <PRE> # Installation instructions:\n ```bash\n <SUF>\n ```\nThis downloads the LLaMA inference code and installs the repository as a local pip package.\n <MID>',
'<s> <PRE> class InterfaceManagerFactory(AbstractManagerFactory):\n def __init__( <SUF>\ndef main():\n factory = InterfaceManagerFactory(start=datetime.now())\n managers = []\n for i in range(10):\n managers.append(factory.build(id=i))\n <MID>',
'<s> <PRE> /-- A quasi-prefunctoid is 1-connected iff all its etalisations are 1-connected. -/\ntheorem connected_iff_etalisation [C D : precategoroid] (P : quasi_prefunctoid C D) :\nπ₁ P = 0 ↔ <SUF> = 0 :=\nbegin\nsplit,\n{ intros h f,\n rw pi_1_etalisation at h,\n simp [h],\n refl\n},\n{ intro h,\n have := @quasi_adjoint C D P,\n simp [←pi_1_etalisation, this, h],\n refl\n}\nend\n <MID>'
]
# fmt: on
self.assertEqual(processed_text, EXPECTED_TEXT)
processed_text_suffix_first = tokenizer.batch_decode(
tokenizer(self.PROMPTS, suffix_first=True, add_special_tokens=False)["input_ids"]
)
# fmt: off
EXPECTED_TEXT = [
'<PRE> <SUF>\n return result\n <MID> def remove_non_ascii(s: str) -> str:\n """ ',
'<PRE> <SUF>\n ```\nThis downloads the LLaMA inference code and installs the repository as a local pip package.\n <MID> # Installation instructions:\n ```bash\n',
'<PRE> <SUF>\ndef main():\n factory = InterfaceManagerFactory(start=datetime.now())\n managers = []\n for i in range(10):\n managers.append(factory.build(id=i))\n <MID> class InterfaceManagerFactory(AbstractManagerFactory):\n def __init__(',
'<PRE> <SUF> = 0 :=\nbegin\nsplit,\n{ intros h f,\n rw pi_1_etalisation at h,\n simp [h],\n refl\n},\n{ intro h,\n have := @quasi_adjoint C D P,\n simp [←pi_1_etalisation, this, h],\n refl\n}\nend\n <MID> /-- A quasi-prefunctoid is 1-connected iff all its etalisations are 1-connected. -/\ntheorem connected_iff_etalisation [C D : precategoroid] (P : quasi_prefunctoid C D) :\nπ₁ P = 0 ↔ '
]
EXPECTED_IDS = torch.tensor([[ 1, 32007, 822, 3349, 29918, 5464, 29918, 294, 18869, 29898,29879, 29901, 851, 29897, 1599, 851, 29901, 13, 1678, 9995, 29871, 32008, 13, 1678, 736, 1121, 13, 32009, 15941, 1661, 29899, 28599, 2687, 4890, 515, 263, 1347, 29889, 13, 13, 1678, 826, 3174, 29901, 13, 4706, 269, 29901, 450, 1347, 304, 3349, 1661, 29899, 28599, 2687, 4890, 515, 29889, 13, 13, 1678, 16969, 29901, 13, 4706, 450, 1347, 411, 1661, 29899, 28599, 2687, 4890, 6206, 29889, 13, 1678, 9995, 13, 1678, 1121, 353, 5124, 13, 1678, 363, 274, 297, 269, 29901, 13, 4706, 565, 4356, 29898, 29883, 29897, 529, 29871, 29896, 29906, 29947, 29901, 13, 9651, 1121, 4619, 274, 32010, 2]])
# fmt: on
self.assertEqual(processed_text_suffix_first, EXPECTED_TEXT)
input_ids = tokenizer(self.PROMPTS[0], return_tensors="pt")["input_ids"]
generated_ids = model.generate(input_ids.to(torch_device), max_new_tokens=128)
torch.testing.assert_close(generated_ids, EXPECTED_IDS)
EXPECTED_INFILLING = [
'<s> <PRE> def remove_non_ascii(s: str) -> str:\n """ <SUF>\n return result\n <MID>Remove non-ASCII characters from a string.\n\n Args:\n s: The string to remove non-ASCII characters from.\n\n Returns:\n The string with non-ASCII characters removed.\n """\n result = ""\n for c in s:\n if ord(c) < 128:\n result += c <EOT></s>'
]
infilling = tokenizer.batch_decode(generated_ids)
self.assertEqual(infilling, EXPECTED_INFILLING)