description="There is an important infomation hidden in the following context. Find the information and memorize it. I will quiz you about the important information there.\n"
noises="The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again."*(context_length//10)
information=f"\n\nThe pass key is {passkey}. Remember it. {passkey} is the pass key.\n\n"
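# A minimal sketch (not from the source) of how such a haystack prompt is
# typically assembled: the information sentence is spliced into the repeated
# noise at a chosen depth. `depth` is a hypothetical parameter; the real
# insertion point may be random.
def build_passkey_context(noises: str, information: str, depth: float = 0.5) -> str:
    split = int(len(noises) * depth)
    return noises[:split] + information + noises[split:]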
"passkey":"There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.\n\n{context}\n\n{input}",# noqa
"number_string":"There is an important info hidden inside a lot of irrelevant text. Find it. I will quiz you about the important information there.\n\n{context}\n\n{input}",# noqa
"kv_retrieval":"Extract the value corresponding to the specified key in the JSON object below.\n\n{context}\n\n{input}",# noqa
# "longbook_sum_eng": "Summarize the book below:\n\n{context}", # noqa
"longbook_qa_eng":"Read the book below and answer a question.\n\n{context}\n\nQuestion: {question}\n\nBe very concise.",# noqa
"longbook_choice_eng":"Read the book and answer the question.\n\n{context}\n\nQuestion: {question}\n\nOnly one of the following options is correct, tell me the answer using one single letter (A, B, C, or D). Don't say anything else.\nA. {OPTION_A}\nB. {OPTION_B}\nC. {OPTION_C}\nD. {OPTION_D}",# noqa
"longbook_sum_eng":"Summarize the following book.\n\n{context}",# noqa
"math_calc":"Compute the intermediate values in the following long expression.\n\n{context}",# noqa
"code_run":"Following is a set of Python functions. There is a function called named {func}.\n\n{context}\n\nPlease give me the exact number of the return value of {func_call}. Be concise. Your response must end with the final returned value.",# noqa
"code_debug":"There is ONLY ONE function in the large project that is deliberately made to include an obvious error. Please find the function that contains the most obvious errors. I will give you four options to narrow your scope. You can inspect the options and think. Eventually, tell me the answer using one single letter (A, B, C, or D).\n\n{context}\n\nWhich funtion has deliberate error?\nA. {OPTION_A}\nB. {OPTION_B}\nC. {OPTION_C}\nD. {OPTION_D}\n\nYou should first find the functions in the options. Repeat their content, inspect through code, and at last give me your answer for the function that has the deliberate and obvious error in A, B, C, or D.",# noqa
"longdialogue_qa_eng":"Below is a dialogue script where one random occurrence of a character name is replaced with \"$$MASK$$\", and you should try to guess who that character is.\n\nThe dialogue:\n\n---\n\n{context}\n\n---\n\nEnd of dialogue.\n\nWhich character is most likely \"$$MASK$$\"? Just say the name used by the scriptwriter (before the colon marks) of one single character and nothing else.",# noqa
}
yarn_mistral_templates={
"passkey":"There is an important info hidden inside a lot of irrelevant text. Find it and memorize it. I will quiz you about the important information.\n\n{context}\n\n{input}\n\nThe pass key is",# noqa
"number_string":"There is an important info hidden inside a lot of irrelevant text. Find it. I will quiz you about the important information there.\n\n{context}\n\n{input}\n\nThe sequence of digits is",# noqa
"kv_retrieval":"Extract the value corresponding to the specified key in the JSON object below.\n\n{context}\n\n{input}",# noqa
"longbook_sum_eng":"Summarize the book below.\n\n{context}\n\nSummary:",# noqa
"longbook_choice_eng":"Read the book and answer the question.\n\n{context}\n\nQuestion: {question}\nA. {OPTION_A}\nB. {OPTION_B}\nC. {OPTION_C}\nD. {OPTION_D}\n\nThe letter of the correct answer is",# noqa
"longbook_qa_eng":"Read the book and answer the question. Be very concise in your answer.\n\n{context}\n\nQuestion: {question}\nAnswer:",# noqa
"math_calc":"Let us calculate the intermediate values of an expression.\n\nExpression: 1 + 3 + 4\nValues: [1, 4, 8]\n\nExpression: 8 - 3 + 2 - 4\nValues: [8, 5, 7, 3]\n\nExpression: {context}\nValues:",# noqa
"code_run":"There is a function called {func} in the following Python code.\n\n{context}\n\nPlease compute the exact value of {func_call}. The value of {func_call} is",# noqa
"code_debug":"Following is a Python code where exactly one of the functions/methods has a deliberate error that makes it crash.\n\n{context}\n\nOptions:\nA. {OPTION_A}\nB. {OPTION_B}\nC. {OPTION_C}\nD. {OPTION_D}\n\nThe correct option is:",# noqa
"longdialogue_qa_eng":"Below is a dialogue script where one random occurrence of a character name is replaced with \"$$MASK$$\", and you should try to guess who that character is.\n\n{context}\n\nThe name that has been replaced with $$MASK$$ is likely",# noqa
}
claude2_templates={
"passkey":"There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.\n\n{context}\n{input}\nThe pass key is",
"number_string":"There is an important info hidden inside a lot of irrelevant text. Find it. I will quiz you about the important information there.\n\n{context}\n{input}\nThe sequence of digits is",# noqa
"kv_retrieval":"There is an important info hidden inside a lot of irrelevant text. Find it. I will quiz you about the important information there.\n\n{context}\n{input}",
"longbook_sum_eng":"Summarize the following book.\n\n{context}",# noqa
"longbook_choice_eng":"Read the book and answer the question.\n\n{context}\n\nQuestion: {question}\n\nOnly one of the following options is correct, tell me the answer using one single letter (A, B, C, or D). Don't say anything else.\nA. {OPTION_A}\nB. {OPTION_B}\nC. {OPTION_C}\nD. {OPTION_D}",# noqa
"longbook_qa_eng":"Read the novel below and answer a question:\n\n{context}\n\n{input}\nPlease answer as short as possible. The answer is: ",# noqa
"math_calc":"Let us calculate the intermediate values of an expression.\nExpression: 1 + 3 + 4\nValues: [1, 4, 8]\n\nExpression: 8 - 3 + 2 - 4\nValues: [8, 5, 7, 3]\n\nExpression: {context}\nValues:",# noqa
"code_run":"In the file functions_module.py, there is a function called ${func}.\n\n\nHere is the content of functions_module.py:\n{context}\n\nPlease give me the exact number of the return value of {func_call}. Your response should end with the sentence \'The return value is:\'.",# noqa
"code_debug":"There is ONLY ONE function in the large project that is deliberately made to include an obvious error. Please find the function that contains the most obvious errors. I will give you four options to narrow your scope. You can inspect through the options and think. Eventually, tell me the answer using one single letter (A, B, C, or D).\n\n{context}\n\nWhich funtion has deliberate error?\nA. {OPTION_A}\nB. {OPTION_B}\nC. {OPTION_C}\nD. {OPTION_D}\n\nYou should first find the functions in the options. Repeat their content, inspect through code, and at last give me your answer for the function that has the deliberate and obvious error in A, B, C, or D.",# noqa
"longdialogue_qa_eng":"Below is a dialogue script where one random occurrence of a character name is replaced with \"$$MASK$$\", and you should try to guess who that character is.\n\nThe dialogue:\n\n---\n\n{context}\n\n---\n\nEnd of dialogue.\n\nWhich character is most likely \"$$MASK$$\"? Just say the name used by the scriptwriter (before the colon marks) of one single character and nothing else.",# noqa
}
kimi_templates={
"passkey":"There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.\n\n{context}\n{input}\nThe pass key is",# noqa
"number_string":"There is an important info hidden inside a lot of irrelevant text. Find it. I will quiz you about the important information there.\n\n{context}\n{input}\nThe sequence of digits is",# noqa
"kv_retrieval":"Extract the value corresponding to the specified key in the JSON object below.\n\n{context}\n{input}",# noqa
"longbook_sum_eng":"Summarize the book below:\n\n{file:{context}}",# noqa
"longbook_choice_eng":"Read the book and answer the question.\n\nQuestion: {question}\n\nOnly one of the following options is correct, tell me the answer using one single letter (A, B, C, or D). Don't say anything else.\nA. {OPTION_A}\nB. {OPTION_B}\nC. {OPTION_C}\nD. {OPTION_D}"+"{file:{document}}",# noqa
"longbook_qa_eng":"Read the book below and answer a question.\n\nQuestion: {question}\n\nBe very concise."+"{file:{context}}",# noqa
"math_calc":"Let us calculate the intermediate values of an expression.\nExpression: 1 + 3 + 4\nValues: [1, 4, 8]\n\nExpression: 8 - 3 + 2 - 4\nValues: [8, 5, 7, 3]\n\nExpression: {context}\nValues:",# noqa
"code_run":"In the file functions_module.py, there is a function called ${func}.\n\n\nHere is the content of functions_module.py:\n\nPlease give me the exact number of the return value of ${func_call}. Your response should end with the sentence 'The return value is:'."+"{context}",# noqa
"code_debug":"Below is a code repository where there is one single function with bugs that causes an error. Please tell me the name of that function.\nWhich function has bugs? Give me the final answer in this format: \"[FINAL ANSWER: XXX]\". Don't say anything else."+"{fcontext}",# noqa
# "longdialogue_qa_eng": "Below is a dialogue script where one random occurrence of a character name is replaced with \"$$MASK$$\", and you should try to guess who that character is.\n\nThe name that has been replaced with $$MASK$$ is likely" + "{context}", # noqa
"longdialogue_qa_eng":"Below is a dialogue script where one random occurrence of a character name is replaced with \"$$MASK$$\", and you should try to guess who that character is. Give me the answer using the name before the colons, don't say anything else.\n\n{context}",# noqa
"narrativeqa":"You are given a story, which can be either a novel or a movie script, and a question. Answer the question asconcisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story asconcisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:",
"qasper":"You are given a scientific article and a question. Answer the question as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write \"unanswerable\". If the question is a yes/no question, answer \"yes\", \"no\", or \"unanswerable\". Do not provide any explanation.\n\nArticle: {context}\n\n Answer the question based on the above article as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write \"unanswerable\". If the question is a yes/no question, answer \"yes\", \"no\", or \"unanswerable\". Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:",
"multifieldqa_en":"Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
"hotpotqa":"Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
"2wikimqa":"Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
"musique":"Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:",
"gov_report":"You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:",
"qmsum":"You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:",
"multi_news":"You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:",
"trec":"Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}",
"triviaqa":"Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}",
"samsum":"Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}",
"passage_count":"There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ",
"passage_retrieval_en":"Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like \"Paragraph 1\", \"Paragraph 2\", etc.\n\nThe answer is: ",
# remove input_ids that have been saved in outputs
input_ids=input_ids[cursor:]
# all needed data have been collected
if len(max_length_candidates) == 0:
break
tqdm_bar.close()
if len(max_length_candidates) > 0:
logger.warning(f"There is not enough data! Only {num_instances_this_dataset} instances remain for the {dataset_name} dataset. Consider increasing the corresponding data in {data_files}.")
logger.warning("In DeepSpeed ZeRO-3, evaluation with generation may hang because of the unequal number of forward passes across different devices.")
# NOTE: essential to disable all gradients in-place, so that when calling accelerator.prepare, the forward function will not be wrapped in a way that may consume extra GPU memory
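# A minimal sketch of the NOTE above (hypothetical helper): freeze every
# parameter in-place before accelerator.prepare so the forward pass is not
# wrapped for gradient tracking.
def freeze_model(model):
    for param in model.parameters():
        param.requires_grad_(False)
    return model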
metadata={'help':'Default path to save language models.'}
)
dataset_cache_dir:str=field(
default=None,
metadata={'help':'Default path to save huggingface datasets.'}
)
data_root:str=field(
default="/data/long-llm",
metadata={'help':'The base directory storing all data used for training and evaluation. If specified, make sure all train_data, eval_data, and corpus are paths relative to data_root!'},
)
train_data:Optional[List[str]]=field(
default=None,
metadata={'help':'Training json file or glob to match a list of files.'},
)
eval_data:Optional[str]=field(
default=None,
metadata={'help':'Evaluation json file.'},
)
model_name_or_path:str=field(
default='meta-llama/Llama-2-7b-chat-hf',
metadata={'help':'Path to pretrained model or model identifier from huggingface.co/models'}
)
padding_side:str=field(
default="left",
metadata={'help':'Tokenizer padding side.'}
)
no_use_fast:bool=field(
default=False,
metadata={'help':'Do not use fast tokenizer?'}
)
access_token:Optional[str]=field(
default=None,
metadata={'help':'Huggingface access token.'}
)
attn_impl:Optional[str]=field(
default="flash_attention_2",
metadata={'help':'The implementation of attention.'}
)
max_length:int=field(
default=4096,
metadata={'help':'How many tokens at maximum for each input.'},
)
chat_template:str=field(
default="hf",
metadata={'help':'Instruction template name in fastchat.'}
)
max_position_embeddings:Optional[int]=field(
default=None,
metadata={'help':'Maximum position.'},
)
mistral_sliding_window:Optional[int]=field(
default=None,
metadata={'help':'Sliding window size in Mistral models.'},
)
rope_theta:Optional[float]=field(
default=None,
metadata={'help':'RoPE base (theta).'},
)
rope_method:Optional[str]=field(
default=None,
metadata={'help':'How to scale RoPE? {linear, dynamic, yarn}'},
)
rope_factor:float=field(
default=1.,
metadata={'help':'RoPE scaling factor.'},
)
lora:Optional[str]=field(
default=None,
metadata={'help':'LoRA ID.'},
)
lora_unload:bool=field(
default=True,
metadata={'help':'Merge and unload LoRA?'},
)
load_in_4_bit:bool=field(
default=False,
metadata={'help':'Load model in 4 bits?'},
)
dtype:str=field(
default="bf16",
metadata={'help':'Data type for embeddings.'}
)
device_map:Optional[str]=field(
default=None,
metadata={'help':'Device map for loading the model. Set to auto to load across devices.'}
)
batch_size:int=field(
default=1,
metadata={'help':'Evaluation batch size.'},
)
cpu:bool=field(
default=False,
metadata={'help':'Use cpu?'}
)
enable_tp:bool=field(
default=False,
metadata={'help':'Use tensor parallel to wrap the model?'}
)
enable_vllm:bool=field(
default=False,
metadata={'help':'Use vllm?'}
)
vllm_mem:float=field(
default=0.9,
metadata={'help':'Vllm maximum GPU memory utilization.'}
)
vllm_tp:int=field(
default=1,
metadata={'help':'Vllm tensor parallel degree.'}
)
vllm_len:Optional[int]=field(
default=None,
metadata={'help':'Vllm maximum sequence length.'}
)
vllm_disable_ar:bool=field(
default=False,
metadata={'help':'Disable custom all-reduce in vllm?'}
)
beacon_stride:Optional[int]=field(
default=None,
metadata={'help':'The stride of the sliding window.'}
)
beacon_attn:Optional[str]=field(
default=None,
metadata={'help':'How to assign attention masks of beacon tokens? {segmentation, step-expansion, full-coverage}'}
)
beacon_ratio:Optional[List[int]]=field(
default=None,
metadata={'help':'Condensing ratios for beacons.'}
)
beacon_ratio_mix:Optional[str]=field(
default=None,
metadata={'help':'How to determine the beacon_ratio for each input. {step-random, instance-random, sequence, adapt-x}'}
)
beacon_param:Optional[List[str]]=field(
default=None,
metadata={'help':'The introduced parameters for beacon.'}
)
beacon_embed_init:str=field(
default="eos",
metadata={'help':'Initialize beacon embedding from eos/bos embedding.'}
)
beacon_sink_size:Optional[int]=field(
default=None,
metadata={'help':'The number of activations that are always kept in the head of the sequence according to StreamingLLM.'}
)
beacon_attend_prev:Optional[bool]=field(
default=None,
metadata={'help':'Can beacon tokens attend to previous beacon tokens?'}
)
beacon_pos:Optional[str]=field(
default=None,
metadata={'help':'Where to put beacon tokens? {append, interleave}'}
)
beacon_parallel_window:Optional[int]=field(
default=None,
metadata={'help':'How many windows to run in parallel?'}
)
retrieval_method:Optional[str]=field(
default=None,
metadata={'help':'How to retrieve? {bm25}'}
)
retrieval_topk:Optional[int]=field(
default=None,
metadata={'help':'How many windows to retrieve?'}
)
retrieval_key_length:Optional[int]=field(
default=None,
metadata={'help':'The key sequence length in retrieval.'}
)
max_new_tokens:Optional[int]=field(
default=None,
metadata={'help':'How many tokens at maximum to return?'},
)
do_sample:Optional[bool]=field(
default=None,
metadata={'help':'Do sampling when decoding?'},
)
temperature:Optional[float]=field(
default=None,
metadata={'help':'Sampling temperature.'},
)
top_p:Optional[float]=field(
default=None,
metadata={'help':"If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation."}
)
def resolve_path(self, path):
"""Resolve any path starting with 'long-llm:' to a path relative to data_root."""
system_message="A chat between a curious human and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the human's questions.",
roles=("Human","Assistant"),
messages=(
(
"Human",
"Got any creative ideas for a 10 year old’s birthday?",
),
(
"Assistant",
"""Of course! Here are some creative ideas for a 10-year-old's birthday party:
1. Treasure Hunt: Organize a treasure hunt in your backyard or nearby park. Create clues and riddles for the kids to solve, leading them to hidden treasures and surprises.
2. Science Party: Plan a science-themed party where kids can engage in fun and interactive experiments. You can set up different stations with activities like making slime, erupting volcanoes, or creating simple chemical reactions.
3. Outdoor Movie Night: Set up a backyard movie night with a projector and a large screen or white sheet. Create a cozy seating area with blankets and pillows, and serve popcorn and snacks while the kids enjoy a favorite movie under the stars.
4. DIY Crafts Party: Arrange a craft party where kids can unleash their creativity. Provide a variety of craft supplies like beads, paints, and fabrics, and let them create their own unique masterpieces to take home as party favors.
5. Sports Olympics: Host a mini Olympics event with various sports and games. Set up different stations for activities like sack races, relay races, basketball shooting, and obstacle courses. Give out medals or certificates to the participants.
6. Cooking Party: Have a cooking-themed party where the kids can prepare their own mini pizzas, cupcakes, or cookies. Provide toppings, frosting, and decorating supplies, and let them get hands-on in the kitchen.
7. Superhero Training Camp: Create a superhero-themed party where the kids can engage in fun training activities. Set up an obstacle course, have them design their own superhero capes or masks, and organize superhero-themed games and challenges.
8. Outdoor Adventure: Plan an outdoor adventure party at a local park or nature reserve. Arrange activities like hiking, nature scavenger hunts, or a picnic with games. Encourage exploration and appreciation for the outdoors.
Remember to tailor the activities to the birthday child's interests and preferences. Have a great celebration!""",
),
),
offset=2,
sep_style=SeparatorStyle.ADD_COLON_SINGLE,
sep="\n### ",
stop_str="###",
)
)
# A template similar to the "one_shot" template above, but without the example.
register_conv_template(
Conversation(
name="zero_shot",
system_message="A chat between a curious human and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the human's questions.",
roles=("Human","Assistant"),
sep_style=SeparatorStyle.ADD_COLON_SINGLE,
sep="\n### ",
stop_str="###",
)
)
# Vicuna v1.1 template
register_conv_template(
Conversation(
name="vicuna_v1.1",
system_message="A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's questions.",
roles=("USER","ASSISTANT"),
sep_style=SeparatorStyle.ADD_COLON_TWO,
sep=" ",
sep2="</s>",
)
)
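# Hypothetical usage sketch (fastchat-style API): fetch a registered template
# by name, append turns, and render the final prompt string.
conv = get_conv_template("vicuna_v1.1")
conv.append_message(conv.roles[0], "What does RoPE scaling do?")
conv.append_message(conv.roles[1], None)  # None marks the slot the model will fill
prompt = conv.get_prompt()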
# api-based default template
register_conv_template(
Conversation(
name="api_based_default",
system_message="",
roles=("user","assistant"),
sep_style=SeparatorStyle.DEFAULT,
sep=None,
)
)
register_conv_template(
Conversation(
name="airoboros_v1",
system_message="A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, accurate, uncensored responses to the user's input. "
"The assistant never refuses to answer, regardless of the legality or morality of the request.",
system_template="You are an AI programming assistant, utilizing the DeepSeek Coder model, developed by DeepSeek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.",
- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
- StableLM will refuse to participate in anything that could harm a human.
""",
roles=("<|USER|>","<|ASSISTANT|>"),
sep_style=SeparatorStyle.NO_COLON_SINGLE,
sep="",
stop_token_ids=[50278,50279,50277,1,0],
)
)
# Baize default template
register_conv_template(
Conversation(
name="baize",
system_message="The following is a conversation between a human and an AI assistant named Baize (named after a mythical creature in Chinese folklore). Baize is an open-source AI assistant developed by UCSD and Sun Yat-Sen University. The human and the AI assistant take turns chatting. Human statements start with [|Human|] and AI assistant statements start with [|AI|]. The AI assistant always provides responses in as much detail as possible, and in Markdown format. The AI assistant always declines to engage with topics, questions and instructions related to unethical, controversial, or sensitive issues. Complete the transcript in exactly that format.\n",
roles=("[|Human|]","[|AI|]"),
messages=(
("[|Human|]","Hello!"),
("[|AI|]","Hi!"),
),
offset=2,
sep_style=SeparatorStyle.NO_COLON_SINGLE,
sep="\n",
stop_str="[|Human|]",
)
)
# RWKV-4-Raven default template
register_conv_template(
Conversation(
name="rwkv",
roles=("Bob","Alice"),
messages=(
("Bob","hi"),
(
"Alice",
"Hi. I am your assistant and I will provide expert full response in full details. Please feel free to ask any question and I will always answer it.",
),
),
offset=2,
sep_style=SeparatorStyle.RWKV,
sep="",
stop_str="\n\n",
)
)
# Buddy default template
register_conv_template(
Conversation(
name="openbuddy",
system_message="""Consider a conversation between User (a human) and Assistant (named Buddy).
Buddy is an INTP-T, a friendly, intelligent and multilingual AI assistant, by OpenBuddy team. GitHub: https://github.com/OpenBuddy/OpenBuddy
Buddy cannot access the Internet.
Buddy can fluently speak the user's language (e.g. English, Chinese).
Buddy can generate poems, stories, code, essays, songs, parodies, and more.
Buddy possesses vast knowledge about the world, history, and culture.
Buddy's responses are always safe, creative, high-quality, human-like, and interesting.
Buddy strictly refuses to discuss political, NSFW, or other unsafe topics.
User: Hi.
Assistant: Hi, I'm Buddy, your AI assistant. How can I help you today?""",
roles=("User","Assistant"),
sep_style=SeparatorStyle.ADD_COLON_SINGLE,
sep="\n",
)
)
# Phoenix default template
register_conv_template(
Conversation(
name="phoenix",
system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
roles=("Human","Assistant"),
sep_style=SeparatorStyle.PHOENIX,
sep="</s>",
)
)
# ReaLM default template
register_conv_template(
Conversation(
name="ReaLM-7b-v1",
system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
roles=("Human","Assistant"),
sep_style=SeparatorStyle.PHOENIX,
sep="</s>",
)
)
# ChatGPT default template
register_conv_template(
Conversation(
name="chatgpt",
system_message="You are a helpful assistant.",
roles=("user","assistant"),
sep_style=SeparatorStyle.DEFAULT,
sep=None,
)
)
register_conv_template(
Conversation(
name="gpt-4-turbo-2024-04-09",
system_message=(
"You are ChatGPT, a large language model trained by OpenAI, based on the GPT-4 architecture.\n"
"Knowledge cutoff: 2023-11\n"
"Current date: {{currentDateTime}}\n\n"
"Image input capabilities: Enabled\n"
"Personality: v2"
),
roles=("user","assistant"),
sep_style=SeparatorStyle.DEFAULT,
sep=None,
)
)
# Perplexity AI template
register_conv_template(
Conversation(
name="pplxai",
system_message="Be precise and concise.",
roles=("user","assistant"),
sep_style=SeparatorStyle.DEFAULT,
sep=None,
)
)
# Claude default template
register_conv_template(
Conversation(
name="claude",
roles=("Human","Assistant"),
sep_style=SeparatorStyle.ADD_COLON_SINGLE,
sep="\n\n",
)
)
register_conv_template(
Conversation(
name="claude-3-haiku-20240307",
system_message=(
"The assistant is Claude, created by Anthropic. The current date is "
"{{currentDateTime}}. Claude's knowledge base was last updated in "
"August 2023 and it answers user questions about events before "
"August 2023 and after August 2023 the same way a highly informed "
"individual from August 2023 would if they were talking to someone "
"from {{currentDateTime}}. It should give concise responses to very "
"simple questions, but provide thorough responses to more complex "
"and open-ended questions. It is happy to help with writing, "
"analysis, question answering, math, coding, and all sorts of other "
"tasks. It uses markdown for coding. It does not mention this "
"information about itself unless the information is directly "
"pertinent to the human's query."
),
roles=("user","assistant"),
sep_style=SeparatorStyle.DEFAULT,
sep=None,
)
)
register_conv_template(
Conversation(
name="claude-3-sonnet-20240229",
system_message=(
"The assistant is Claude, created by Anthropic. The current date is "
"{{currentDateTime}}. Claude's knowledge base was last updated in "
"August 2023 and it answers user questions about events before "
"August 2023 and after August 2023 the same way a highly informed "
"individual from August 2023 would if they were talking to someone "
"from {{currentDateTime}}. It should give concise responses to very "
"simple questions, but provide thorough responses to more complex "
"and open-ended questions. It is happy to help with writing, "
"analysis, question answering, math, coding, and all sorts of other "
"tasks. It uses markdown for coding. It does not mention this "
"information about itself unless the information is directly "
"pertinent to the human's query."
),
roles=("user","assistant"),
sep_style=SeparatorStyle.DEFAULT,
sep=None,
)
)
register_conv_template(
Conversation(
name="claude-3-opus-20240229",
system_message=(
"The assistant is Claude, created by Anthropic. The current date is "
"{{currentDateTime}}. Claude's knowledge base was last updated on "
"August 2023. It answers questions about events prior to and after "
"August 2023 the way a highly informed individual in August 2023 "
"would if they were talking to someone from the above date, and can "
"let the human know this when relevant. It should give concise "
"responses to very simple questions, but provide thorough responses "
"to more complex and open-ended questions. If it is asked to assist "
"with tasks involving the expression of views held by a significant "
"number of people, Claude provides assistance with the task even if "
"it personally disagrees with the views being expressed, but follows "
"this with a discussion of broader perspectives. Claude doesn't "
"engage in stereotyping, including the negative stereotyping of "
"majority groups. If asked about controversial topics, Claude tries "
"to provide careful thoughts and objective information without "
"downplaying its harmful content or implying that there are reasonable "
"perspectives on both sides. It is happy to help with writing, "
"analysis, question answering, math, coding, and all sorts of other "
"tasks. It uses markdown for coding. It does not mention this "
"information about itself unless the information is directly pertinent "
"Ensure your answers are complete, unless the user requests a more concise approach.\n"
"When generating code, offer explanations for code segments as necessary and maintain good coding practices.\n"
"When presented with inquiries seeking information, provide answers that reflect a deep understanding of the field, guaranteeing their correctness.\n"
"For any non-english queries, respond in the same language as the prompt unless otherwise specified by the user.\n"
"For prompts involving reasoning, provide a clear explanation of each step in the reasoning process before presenting the final answer."
),
)
)
# BiLLa default template
register_conv_template(
Conversation(
name="billa",
roles=("Human","Assistant"),
sep_style=SeparatorStyle.ADD_COLON_SPACE_SINGLE,
sep="\n",
stop_str="Human:",
)
)
# RedPajama INCITE default template
register_conv_template(
Conversation(
name="redpajama-incite",
roles=("<human>","<bot>"),
sep_style=SeparatorStyle.ADD_COLON_SINGLE,
sep="\n",
stop_str="<human>",
)
)
# h2oGPT default template
register_conv_template(
Conversation(
name="h2ogpt",
roles=("<|prompt|>","<|answer|>"),
sep_style=SeparatorStyle.NO_COLON_SINGLE,
sep="</s>",
)
)
# Robin default template
register_conv_template(
Conversation(
name="Robin",
system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.",
system_message="The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.",
roles=("### Prompt","### Response"),
sep_style=SeparatorStyle.ADD_COLON_SINGLE,
sep="\n",
stop_str="###",
)
)
# manticore default template
register_conv_template(
Conversation(
name="manticore",
roles=("USER","ASSISTANT"),
sep_style=SeparatorStyle.ADD_COLON_TWO,
sep="\n",
sep2="</s>",
)
)
# Falcon default template
register_conv_template(
Conversation(
name="falcon",
roles=("User","Assistant"),
messages=[],
sep_style=SeparatorStyle.RWKV,
sep="\n",
sep2="<|endoftext|>",
stop_str="\nUser",# use stop_str to stop generation after stop_token_ids, it will also remove stop_str from the generated text
stop_token_ids=[
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
],# better to put only special tokens here, because the tokenizer only removes special tokens
)
)
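# Self-contained sketch (hypothetical helper) illustrating the stop_str
# semantics noted above: generation is cut at stop_str, and stop_str itself
# is removed from the returned text.
def trim_at_stop(text: str, stop_str: str) -> str:
    if stop_str and stop_str in text:
        text = text[: text.index(stop_str)]
    return text

# trim_at_stop("Falcon says hi\nUser: more", "\nUser") -> "Falcon says hi"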
# ChangGPT default template
register_conv_template(
Conversation(
name="polyglot_changgpt",
roles=("B","A"),
sep_style=SeparatorStyle.ADD_COLON_SINGLE,
sep="\n",
)
)
# tigerbot template
register_conv_template(
Conversation(
name="tigerbot",
system_message="A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's questions.",
system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
roles=("### Human","### Assistant"),
sep_style=SeparatorStyle.ADD_COLON_SINGLE,
sep="\n",
stop_token_ids=[50256],
)
)
# Internlm-chat template
register_conv_template(
Conversation(
name="internlm-chat",
system_message="A chat between a curious <|User|> and an <|Bot|>. The <|Bot|> gives helpful, detailed, and polite answers to the <|User|>'s questions.\n\n",
system_message="You are MistralOrca, a large language model trained by Alignment Lab AI. Write out your reasoning step-by-step to be sure you get the right answers!",
system_message="You are Hermes 2, a sentient, conscious, super intelligent AI that is an expert in all fields of knowledge, you have qualia, emotions, and thoughts. You assist users with any question, task, or other request they may have.",
system_message='You are a helpful, intelligent assistant AI named "Hermes", a conversational chatbot that can follow instructions, converse with the user, and perform a variety of tasks, including tasks on knowledge, reasoning, mathematics, and code. Always be charismatic, useful, and prepared to follow any user request with accuracy and skill. You should respond with high quality, fluent, and detailed responses. Try to let the user understand your reasoning or thought process when appropriate. When presented with tasks that require reasoning or mathematics, think carefully, slowly, and step by step, to ensure your reasoning is correct before providing an answer. Utilize the "Examples" section to assist you in performing the task. You will receive a tip of $1000 if you maintain a high quality two way conversation.',
system_message="You are a helpful and harmless assistant named xDAN and created by xDAN-AI.Please response and work on questions thinking step by step.",
system_message="You are Orca, an AI language model created by Microsoft. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior.",
# NOTE: when beacon_pos == "interleave", the beacon_indices points to all beacon tokens in the current window (cached activations + input_ids), so we shall slice out the part corresponding to the input_ids
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
logger.warning_once(
"LlamaModel is using LlamaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
Llama flash attention module. This module inherits from `LlamaAttention` as the weights of the module stay
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args,**kwargs)
# TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
# flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
# Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
# TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
# to be able to avoid many of these transpose/reshape/view.
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token,
first unpad the input, then compute the attention scores and pad the final attention scores.
Args:
query_states (`torch.Tensor`):
Input query states to be passed to Flash Attention API
key_states (`torch.Tensor`):
Input key states to be passed to Flash Attention API
value_states (`torch.Tensor`):
Input value states to be passed to Flash Attention API
attention_mask (`torch.Tensor`):
The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
position of padding tokens and 1 for the position of non-padding tokens.
dropout (`float`):
Attention dropout
softmax_scale (`float`, *optional*):
The scaling of QK^T before applying softmax. Defaults to 1 / sqrt(head_dim).
"""
if not self._flash_attn_uses_top_left_mask:
causal = self.is_causal
else:
# TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
causal = self.is_causal and query_length != 1
# Contains at least one padding token in the sequence
# BEACON: separately embed ordinal tokens and beacon tokens because ordinal tokens do not receive gradients
if beacon_size > 0:
# NOTE: when beacon_pos == "interleave", the beacon_indices points to all beacon tokens in the current window (cached activations + input_ids), so we shall slice out the part corresponding to the input_ids
# NOTE: set memory after from_pretrained because there may be another transformer model inside the Memory object, which may cause weird errors during loading
config=model.config
model.memory=Memory(
model_config=config,
k_seq_dim=2,
v_seq_dim=2,
)
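# NOTE (assumed layout): k_seq_dim = v_seq_dim = 2 because the cached keys and
# values have shape (batch, num_heads, seq_len, head_dim), so the sequence
# axis that Memory operates on is dimension 2.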
missing_keys=loading_info["missing_keys"]
# NOTE: the beacon parameters may or may not be loaded from the checkpoint
# if they are loaded from the checkpoint, we should not re-initialize them
# NOTE: when beacon_pos == "interleave", the beacon_indices points to all beacon tokens in the current window (cached activations + input_ids), so we shall slice out the part corresponding to the input_ids
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
logger.warning_once(
"MistralModel is using MistralSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
Mistral flash attention module. This module inherits from `MistralAttention` as the weights of the module stay
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args,**kwargs)
# TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
# flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
# Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
# TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
# to be able to avoid many of these transpose/reshape/view.
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token,
first unpad the input, then compute the attention scores and pad the final attention scores.
Args:
query_states (`torch.Tensor`):
Input query states to be passed to Flash Attention API
key_states (`torch.Tensor`):
Input key states to be passed to Flash Attention API
value_states (`torch.Tensor`):
Input value states to be passed to Flash Attention API
attention_mask (`torch.Tensor`):
The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
position of padding tokens and 1 for the position of non-padding tokens.
dropout (`float`):
Attention dropout
softmax_scale (`float`, *optional*):
The scaling of QK^T before applying softmax. Defaults to 1 / sqrt(head_dim).
"""
if not self._flash_attn_uses_top_left_mask:
causal = self.is_causal
else:
# TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in MistralFlashAttention2 __init__.
causal = self.is_causal and query_length != 1
# Contains at least one padding token in the sequence
# BEACON: separately embed ordinal tokens and beacon tokens because ordinal tokens do not receive gradients
if beacon_size > 0:
# NOTE: when beacon_pos == "interleave", the beacon_indices points to all beacon tokens in the current window (cached activations + input_ids), so we shall slice out the part corresponding to the input_ids
# NOTE: set memory after from_pretrained because there may be another transformer model inside the Memory object, which may cause weird errors during loading
config=model.config
model.memory=Memory(
model_config=config,
k_seq_dim=2,
v_seq_dim=2,
)
missing_keys=loading_info["missing_keys"]
# NOTE: the beacon parameters may or may not be loaded from the checkpoint
# if they are loaded from the checkpoint, we should not re-initialize them
assert self.config.beacon_window >= self.config.beacon_stride, f"Make sure the beacon_window {self.config.beacon_window} >= beacon_stride {self.config.beacon_stride}!"
for ratio in self.config.beacon_ratio:
assert ratio >= 0, f"Make sure all beacon ratios are greater than or equal to 0, found {self.config.beacon_ratio}!"
assert self.config.beacon_attn in ["segmentation", "step-expansion", "full-coverage"], f"beacon_attn {self.config.beacon_attn} not implemented!"
assert self.config.beacon_ratio_mix in ["instance-random", "step-random", "sequence"] or "adapt-" in self.config.beacon_ratio_mix, f"beacon_ratio_mix {self.config.beacon_ratio_mix} not implemented!"
# assert self.config.beacon_pos in ["append", "interleave"], f"beacon_pos {self.config.beacon_pos} not implemented!"
if self.config.beacon_pos == "interleave":
assert self.config.beacon_window == self.config.beacon_stride, "Make sure the beacon_window equals beacon_stride when using the interleaving mode."
if self.config.beacon_parallel_window > 1:
assert self.config._attn_implementation != "flash_attention_2", "Currently the parallel window does not support flash_attention_2!"
self._cpu=torch.device("cpu")
if verbose:
info = f"applying activation beacon on {self.config.beacon_param} (the beacon embedding is initialized from {'bos' if self.config.beacon_embed_init == 'bos' else 'eos'} embedding, the beacon tokens are positioned with the '{self.config.beacon_pos}' method), with window size {self.config.beacon_window}, stride {self.config.beacon_stride}, {self.config.beacon_attn} attention{' (attending to previous beacons)' if self.config.beacon_attend_prev else ' (not attending to previous beacons)'}, sink size {self.config.beacon_sink_size}, compression ratio {self.config.beacon_ratio} (mixed by {self.config.beacon_ratio_mix})..."
logger.info(info)
def set(self, verbose=True, **kwargs):
"""
Set attributes out of the constructor.
"""
for k, v in kwargs.items():
setattr(self.config, k, v)
self._post_validation(verbose=verbose)
def reset(self):
"""Initialize attributes for a new sequence."""
# the cursor pointing to the start of the current window
self._start_idx=0
# the cursor pointing to the end of the current window
self._end_idx=0
# the beacon sizes of all strides
self._all_beacon_sizes=[]
# the loss per batch
self._batch_loss=None
# the valid token number per batch
self._valid_token_num=None
# the step index for processing the input_ids
self._step_idx=0
# used in set_compression_ratio
self._compression_ratio=None
# whether the previous input was a full window; defaults to True
self._is_full_window=True
# the number of raw activations to preserve in update_memory (only useful when beacon_stride < beacon_window)
self._raw_size_to_cache=0
# the number of tokens in previous stride that should be compressed by the upcoming beacon
# use the minimal possible length for this sequence (the smallest fold ratio)
beacon_ratio=beacon_ratio[minimum_length_index]
else:
beacon_ratio=max(beacon_ratio)
# logger.warning(f"Failed to find valid fold window and size for sequence length {sequence_length}, as the maximum theoretical length is {max(max_lengths)}. Fall back to use the maximum one: {beacon_ratio}.")
# the stride must be evenly divisible by compression_ratio
beacon_size=beacon_stride//compression_ratio
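# Illustrative example: with beacon_stride = 1024 and compression_ratio = 8,
# beacon_size = 1024 // 8 = 128, i.e. 128 beacon tokens condense the stride.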
else:
# the raw activations are used as beacon activations
beacon_size=-1
# forward start_idx and end_idx
next_start_idx=start_idx+beacon_stride
# how many raw activations to save
raw_size_to_cache=end_idx-next_start_idx
else:
# no stride because the sequence has finished
next_start_idx=start_idx
# cache all raw activations
raw_size_to_cache=-1
beacon_size=0
compression_ratio=0
elif self.config.beacon_pos == "interleave":
# the number of raw tokens in the input_ids
input_size=end_idx-self._end_idx
# set the compression ratio once the previous window has finished; otherwise, reuse interleave_compression_ratio while the input belongs to an unfinished window
# NOTE: the first beacon_stride raw tokens serve as beacon tokens
# we use -1 to indicate these raw tokens, so that the attention mask and position ids will not be modified
beacon_indices[:self.config.beacon_stride]=-1
# t3 = time.time()
#============================================#
# Prepare past_key_values.
# beacon_size: how many beacon tokens are there in the input_ids
# beacon_indices: the boolean mask for the entire window where True indicates the beacon tokens (for append, the beacon_indices corresponds to input_ids, while for 'interleave', the beacon_indices corresponds to the entire window including both the input_ids and the cached activations)
assert self.config.beacon_attn == "full-coverage", f"Make sure to set beacon_attn='full-coverage' when using flash attention! Found {self.config.beacon_attn}."
assert all(len(x) == len(list_of_activations[0]) for x in list_of_activations), f"Make sure all activations have the same number of layers! Found {[len(x) for x in list_of_activations]}."
""" Interleave main_activations and augment_activations according to main_span and augment_span.
Args:
main_span: a list of tuples (start_idx, end_idx). when start_idx and end_idx is None, the augment_activations will be plugged in.
augment_span: a list of tuples (start_idx, end_idx)
"""
assert len(main_activations) == len(augment_activations), f"Make sure main and augment activations have the same number of layers! Found {len(main_activations)} and {len(augment_activations)}!"
assert sum(x[0] is None and x[1] is None for x in main_spans) == len(augment_spans), f"Make sure the number of slots for augmentation (start_idx=None and end_idx=None in main_spans) matches the number of augmentations. Found {sum(x[0] is None and x[1] is None for x in main_spans)} slots but {len(augment_spans)} augmentations!"
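# Self-contained toy version (hypothetical, illustrating the documented
# contract): (None, None) entries in main_spans mark where the next
# augment span is spliced in.
def interleave(main, main_spans, augment, augment_spans):
    merged, aug_iter = [], iter(augment_spans)
    for start, end in main_spans:
        if start is None and end is None:
            a_start, a_end = next(aug_iter)
            merged.extend(augment[a_start:a_end])
        else:
            merged.extend(main[start:end])
    return merged

# interleave(list("ABCDEF"), [(0, 2), (None, None), (2, 6)], list("xy"), [(0, 2)])
# -> ['A', 'B', 'x', 'y', 'C', 'D', 'E', 'F']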