Commit f604d6c4 authored by Baber

add more ruler tasks

parent 556a79b9
... @@ -121,78 +121,3 @@ async def get_essays() -> Dict[str, str]:
def get_all_essays() -> Dict[str, str]:
    """Synchronous wrapper for get_essays()"""
    return asyncio.run(get_essays())
# @cache
# def get_essays():
# temp_folder_repo = "essay_repo"
# temp_folder_html = "essay_html"
# os.makedirs(temp_folder_repo, exist_ok=True)
# os.makedirs(temp_folder_html, exist_ok=True)
#
# h = html2text.HTML2Text()
# h.ignore_images = True
# h.ignore_tables = True
# h.escape_all = True
# h.reference_links = False
# h.mark_code = False
#
# url = "https://raw.githubusercontent.com/NVIDIA/RULER/main/scripts/data/synthetic/json/PaulGrahamEssays_URLs.txt"
# response = requests.get(url)
# response.raise_for_status()
#
# # The content is now in memory as a string
# content = response.text
#
# # If you want to process it line by line:
# urls = content.splitlines()
#
# for url in tqdm(urls):
# if ".html" in url:
# filename = url.split("/")[-1].replace(".html", ".txt")
# try:
# with urllib.request.urlopen(url) as website:
# content = website.read().decode("unicode_escape", "utf-8")
# soup = BeautifulSoup(content, "html.parser")
# specific_tag = soup.find("font")
# parsed = h.handle(str(specific_tag))
#
# with open(os.path.join(temp_folder_html, filename), "w") as file:
# file.write(parsed)
#
# except Exception as e:
# print(f"Fail download {filename}, ({e})")
#
# else:
# filename = url.split("/")[-1]
# try:
# with urllib.request.urlopen(url) as website:
# content = website.read().decode("utf-8")
#
# with open(os.path.join(temp_folder_repo, filename), "w") as file:
# file.write(content)
#
# except Exception as e:
# print(f"Fail download {filename}, ({e})")
#
# files_repo = sorted(glob.glob(os.path.join(temp_folder_repo, "*.txt")))
# files_html = sorted(glob.glob(os.path.join(temp_folder_html, "*.txt")))
# print(
# f"Download {len(files_repo)} essays from `https://github.com/gkamradt/LLMTest_NeedleInAHaystack/`"
# )
# print(f"Download {len(files_html)} essays from `http://www.paulgraham.com/`")
#
# text = ""
# for file in files_repo + files_html:
# with open(file, "r") as f:
# text += f.read()
#
# shutil.rmtree(temp_folder_repo)
# shutil.rmtree(temp_folder_html)
# return {"text": text}
#
# # with open('PaulGrahamEssays.json', 'w') as f:
# # json.dump({"text": text}, f)
# #
# # shutil.rmtree(temp_folder_repo)
# # shutil.rmtree(temp_folder_html)
... @@ -14,12 +14,12 @@ metric_list:
  - metric: "4096"
    aggregation: !function utils.aggregate_metrics
    higher_is_better: true
  - metric: "8192"
    aggregation: !function utils.aggregate_metrics
    higher_is_better: true
  - metric: "16384"
    aggregation: !function utils.aggregate_metrics
    higher_is_better: true
  # - metric: "32768"
  #   aggregation: !function utils.aggregate_metrics
  #   higher_is_better: true
...
task: niah_2
include: niah_1.yaml
download_dataset: !function utils.niah_single_2

task: niah_3
include: niah_1.yaml
download_dataset: !function utils.niah_single_3

task: niah_4
include: niah_1.yaml
download_dataset: !function utils.niah_multikey_1

task: niah_5
include: niah_1.yaml
download_dataset: !function utils.niah_multikey_2

task: niah_6
include: niah_1.yaml
download_dataset: !function utils.niah_multikey_3

task: niah_7
include: niah_1.yaml
download_dataset: !function utils.niah_multivalue

task: niah_7
include: niah_1.yaml
download_dataset: !function vt_utils.get_vt_dataset
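Each stub above overrides only `task` and the `download_dataset` hook and inherits everything else from niah_1.yaml via `include`. Judging from `get_vt_dataset` later in this commit, a `download_dataset` callable is expected to return a mapping from split name to a `datasets.Dataset`. The sketch below is a minimal illustration of that contract; the function name and record values are made up, and the field names simply mirror the records that vt_utils.py builds.

import datasets


def my_download_dataset(pretrained=None, **kwargs):
    # Hypothetical hook: one record per sample, using the same field names
    # ("index", "input", "outputs", "length", "max_length") as the
    # variable-tracking records built in vt_utils.py below.
    records = [
        {
            "index": 0,
            "input": "haystack text ... needle ... question",
            "outputs": ["needle"],
            "length": 4096,
            "max_length": 4096,
        }
    ]
    return {
        "test": datasets.Dataset.from_list(records, split=datasets.Split.TEST)
    }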
... @@ -21,7 +21,14 @@ NUM_SAMPLES = 500
REMOVE_NEWLINE_TAB = ""
STOP_WORDS = ""
RANDOM_SEED = 42
SEQ_LENGTHS = (
    # 131072,
    # 65536,
    # 32768,
    16384,
    8192,
    4096,
)
# # Define Needle/Haystack Format
NEEDLE = "One of the special magic {type_needle_v} for {key} is: {value}."
...
include: niah_1.yaml
task: ruler_qa
dataset_path: rajpurkar/squad_v2
dataset_name: main
process_docs: !function qa_utils.process_squad_docs
test_split: validation
download_dataset: ""
# import random
# from collections import defaultdict
#
# from datasets import tqdm
#
#
# # def process_qa_docs(df: datasets.Dataset) -> datasets.Dataset:
# # df = df.rename_column("question", "query")
# # df = df.rename_column("answers", "outputs")
# # df = df.map(lambda x: {"outputs": x["outputs"].get("text", [])})
# # return df
#
#
# def process_qa_docs(dataset):
# contexts = sorted(list(set(dataset["context"])))
# ctx_to_id = {c: i for i, c in enumerate(contexts)}
#
# title_contexts = defaultdict(list)
# for ex in dataset:
# title_contexts[ex["title"]].append(ctx_to_id[ex["context"]])
#
# # Process function for map
# def process_example(example):
# if len(example["answers"]["text"]) == 0:
# return None
#
# ctx_id = ctx_to_id[example["context"]]
# return {
# "query": example["question"],
# "outputs": example["answers"]["text"],
# "context": [ctx_id],
# "more_context": [
# i for i in title_contexts[example["title"]] if i != ctx_id
# ],
# }
#
# return dataset.map(
# process_example,
# remove_columns=dataset.column_names,
# filter_fn=lambda x: x is not None,
# )
#
#
# def generate_input_output(index, num_docs, dataset, random_seed=42):
# # Get current example
# example = dataset[index]
# curr_q = example["query"]
# curr_a = example["outputs"]
# curr_docs = example["context"]
# curr_more = example["more_context"]
# all_contexts = example["all_contexts"]
#
# if num_docs < len(all_contexts):
# if (num_docs - len(curr_docs)) > len(curr_more):
# # Need additional random docs
# addition_docs = [
# i for i in range(len(all_contexts)) if i not in curr_docs + curr_more
# ]
# all_docs = (
# curr_docs
# + curr_more
# + random.sample(
# addition_docs, max(0, num_docs - len(curr_docs) - len(curr_more))
# )
# )
# else:
# # Can just use some of the more_context
# all_docs = curr_docs + random.sample(curr_more, num_docs - len(curr_docs))
#
# all_docs = [all_contexts[idx] for idx in all_docs]
# else:
# all_docs = all_contexts
#
# random.Random(random_seed).shuffle(all_docs)
#
# # Assuming DOCUMENT_PROMPT is something like "Document {i}: {document}"
# DOCUMENT_PROMPT = "Document {i}: {document}"
# context = "\n\n".join(
# [DOCUMENT_PROMPT.format(i=i + 1, document=d) for i, d in enumerate(all_docs)]
# )
#
# # Assuming template is something like "{context}\nQuestion: {query}"
# template = "{context}\nQuestion: {query}"
# input_text = template.format(context=context, query=curr_q)
#
# return input_text, curr_a
#
#
# def get_total_docs(example):
# return len(example["all_contexts"]) if "all_contexts" in example else 0
#
#
# def generate_samples(
# num_samples: int, max_seq_length: int, save_dir: str, incremental: int = 10
# ):
# write_jsons = []
# tokens_to_generate = tokens_to_generate
#
# # Find the perfect num_docs
# num_docs = incremental
#
# total_tokens = 0 # Track the total tokens generated for this example
# while total_tokens + tokens_to_generate < max_seq_length:
# input_text, answer = generate_input_output(0, num_docs)
# # Calculate the number of tokens in the example
# total_tokens = len(TOKENIZER(input_text + f" {answer}").input_ids)
# print(
# f"Max length {max_seq_length} | Current length {total_tokens + tokens_to_generate} | Docs: {num_docs}"
# )
# if total_tokens + tokens_to_generate > max_seq_length:
# num_docs -= incremental
# break
#
# num_docs += incremental
# if num_docs > len(DOCS):
# num_docs = len(DOCS)
# break
# print("Number of documents:", num_docs)
#
# # Generate samples
# for index in tqdm(range(num_samples)):
# used_docs = num_docs
# while True:
# try:
# input_text, answer = generate_input_output(
# index + pre_samples, used_docs
# )
# length = len(TOKENIZER(input_text).input_ids) + tokens_to_generate
# assert length <= max_seq_length, f"{length} exceeds max_seq_length."
# break
# except:
# if used_docs > incremental:
# used_docs -= incremental
#
# if remove_newline_tab:
# input_text = " ".join(
# input_text.replace("\n", " ").replace("\t", " ").strip().split()
# )
#
# formatted_output = {
# "index": index,
# "input": input_text,
# "outputs": answer,
# "length": length,
# }
# write_jsons.append(formatted_output)
#
# return write_jsons
# noqa
import itertools
import json
import os
import re
from functools import cache
from typing import Literal
import datasets
from transformers import AutoTokenizer
from lm_eval.tasks.ruler.essays import get_all_essays
from lm_eval.tasks.ruler.prepare import generate_samples
... @@ -25,8 +23,8 @@ SEQ_LENGTHS = (
    # 131072,
    # 65536,
    # 32768,
    16384,
    8192,
    4096,
)
...
include: niah_1.yaml
task: ruler_vt
download_dataset: !function vt_utils.get_vt_dataset
generation_kwargs:
  do_sample: false
  temperature: 0.0
  max_gen_toks: 128
  until: []
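These settings amount to greedy decoding with a 128-token generation budget and no extra stop strings. For intuition only, a rough Hugging Face equivalent is sketched below; mapping lm-eval's `max_gen_toks` onto `max_new_tokens` is an approximation of what the HF backend does, not the exact code path, and "gpt2" is just a small stand-in model.

from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tok("VAR AAAAA = 12345 . VAR BBBBB = VAR AAAAA .", return_tensors="pt")
out = model.generate(
    **inputs,
    do_sample=False,     # do_sample: false / temperature: 0.0 -> greedy
    max_new_tokens=128,  # counterpart of max_gen_toks: 128
)
print(tok.decode(out[0], skip_special_tokens=True))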
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# adapted from https://github.com/NVIDIA/RULER/blob/main/scripts/data/synthetic/variable_tracking.py
import itertools
import random
import string
from functools import cache
import datasets
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer
from lm_eval.tasks.ruler.prepare import SEQ_LENGTHS
TASKS = {
"variable_tracking": {
"tokens_to_generate": 30,
"template": """Memorize and track the chain(s) of variable assignment hidden in the following text.\n\n{context}\nQuestion: Find all variables that are assigned the value {query} in the text above.""",
"answer_prefix": """ Answer: According to the chain(s) of variable assignment in the text above, {num_v} variables are assgined the value {query}, they are: """,
},
}
TEMPLATE = (
    TASKS["variable_tracking"]["template"]
    + TASKS["variable_tracking"]["answer_prefix"]
)
@cache
def get_tokenizer(pretrained):
return AutoTokenizer.from_pretrained(pretrained, trust_remote_code=True)
def generate_chains(num_chains, num_hops, is_icl=False):
    """Build `num_chains` chains of variable assignments, each `num_hops + 1`
    variables long: the first variable in a chain is assigned a random number
    and every later variable is assigned the previous variable. Returns the
    variable names per chain and the assignment statements per chain."""
k = 5 if not is_icl else 3
num_hops = num_hops if not is_icl else min(10, num_hops)
vars_all = [
"".join(random.choices(string.ascii_uppercase, k=k)).upper()
for _ in range((num_hops + 1) * num_chains)
]
while len(set(vars_all)) < num_chains * (num_hops + 1):
vars_all.append("".join(random.choices(string.ascii_uppercase, k=k)).upper())
vars_ret = []
chains_ret = []
for i in range(0, len(vars_all), num_hops + 1):
this_vars = vars_all[i : i + num_hops + 1]
vars_ret.append(this_vars)
this_chain = [f"VAR {this_vars[0]} = {np.random.randint(10000, 99999)}"]
for j in range(num_hops):
this_chain.append(f"VAR {this_vars[j + 1]} = VAR {this_vars[j]} ")
chains_ret.append(this_chain)
return vars_ret, chains_ret
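# Illustration (not part of the task): with num_chains=1 and num_hops=4 a
# single chain looks roughly like
#   vars_ret   -> [["ABCDE", "FGHIJ", "KLMNO", "PQRST", "UVWXY"]]
#   chains_ret -> [["VAR ABCDE = 42017",
#                   "VAR FGHIJ = VAR ABCDE ",
#                   "VAR KLMNO = VAR FGHIJ ",
#                   "VAR PQRST = VAR KLMNO ",
#                   "VAR UVWXY = VAR PQRST "]]
# so every variable in the chain ultimately resolves to the same number; the
# exact names and value are random.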
def generate_input_output(num_noises, num_chains, num_hops, is_icl=False):
    """Interleave the assignment chains with `num_noises` repetitions of the
    noise sentence, then render the prompt from TEMPLATE. Returns the prompt
    and the first chain's variable names (the expected answer)."""
vars, chains = generate_chains(num_chains, num_hops, is_icl=is_icl)
noise = "The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again.\n"
# Create a list of the repeated noise
sentences = [noise] * num_noises
if len(sentences) <= len(chains[0]):
sentences = [
n + "." if len(n.strip()) > 0 else n
for n in [x for noise in sentences for x in noise.split(".")]
]
try:
assert len(sentences) > len(
chains[0]
), "Noises too short, unable to generate data"
    except AssertionError:
        # Not enough noise sentences to host the full chains: truncate them.
        print("Reducing chain length: not enough noise sentences")
        chains = [chain[: len(sentences) - 1] for chain in chains]
# sample random positions to insert variable assignment
for chain_i in chains:
# sample random positions (sorted) to insert variable assignment
positions = list(sorted(random.sample(range(len(sentences)), len(chain_i))))
for insert_pi, j in zip(positions, range(len(chain_i))):
sentences.insert(insert_pi + j, chain_i[j])
# Insert the passkey sentence at the random position
context = " ".join(sentences)
context = context.replace(". \n", ".\n")
template = TEMPLATE
if (
is_icl
and template
!= TASKS["variable_tracking"]["template"]
+ TASKS["variable_tracking"]["answer_prefix"]
):
# remove model template
cutoff = template.index(TASKS["variable_tracking"]["template"][:20])
cutoff_ans = template.index(TASKS["variable_tracking"]["answer_prefix"][:10])
template = (
" ".join(template[cutoff:cutoff_ans].split()[:-1]) + template[cutoff_ans:]
)
value = chains[0][0].split("=")[-1].strip()
input_text = template.format(context=context, query=value, num_v=num_hops + 1)
return input_text, vars[0]
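# Shape of one generated prompt (illustrative; names, the value and insert
# positions are random, and the noise sentence repeats far more often at
# real sequence lengths):
#   Memorize and track the chain(s) of variable assignment hidden in the
#   following text.
#
#   The grass is green. ... VAR ABCDE = 42017 ... VAR FGHIJ = VAR ABCDE ...
#   Question: Find all variables that are assigned the value 42017 in the
#   text above. Answer: According to the chain(s) of variable assignment in
#   the text above, 5 variables are ...
# The expected answer is the first chain's variable names, e.g.
# ["ABCDE", "FGHIJ", "KLMNO", "PQRST", "UVWXY"].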
def randomize_icl(icl_example):
    """Consistently rename the variables that form the ICL example's answer
    (everywhere they occur in the example) to fresh random names of the same
    length, so the demonstration does not leak a reusable answer."""
icl_tgt_cut = icl_example.index(TASKS["variable_tracking"]["answer_prefix"][-10:])
icl_tgt = icl_example[icl_tgt_cut + 10 :].strip().split()
for item in icl_tgt:
new_item = "".join(random.choices(string.ascii_uppercase, k=len(item))).upper()
icl_example = icl_example.replace(item, new_item)
return icl_example
def sys_vartrack_w_noise_random(
tokenizer,
num_samples: int,
max_seq_length: int,
incremental: int = 10,
num_chains: int = 1,
num_hops: int = 4,
add_fewshot: bool = True,
tokens_to_generate=30,
icl_example: dict = None,
remove_newline_tab=False,
):
    """Generate `num_samples` variable-tracking records whose prompts fit in
    `max_seq_length` tokens: grow the number of noise sentences in steps of
    `incremental` until the token budget is reached, then emit the samples,
    optionally prefixed with a randomized in-context example."""
    write_jsons = []
    # Find the largest num_noises that keeps the prompt under max_seq_length
    num_noises = incremental
TOKENIZER = tokenizer
total_tokens = 0 # Track the total tokens generated for this example
example_tokens = 0
if add_fewshot and (icl_example is not None):
icl_example_out = " ".join(icl_example["outputs"])
icl_example = icl_example["input"] + " " + icl_example_out + "\n\n"
example_tokens = len(TOKENIZER(icl_example).input_ids)
while total_tokens + tokens_to_generate + example_tokens < max_seq_length:
input_text, answer = generate_input_output(
num_noises, num_chains, num_hops, is_icl=add_fewshot & (icl_example is None)
)
# Calculate the number of tokens in the example
total_tokens = len(TOKENIZER(input_text + f" {answer}").input_ids)
print(
f"Max length {max_seq_length} | Current length {total_tokens + tokens_to_generate + example_tokens} | Noises: {num_noises}"
)
if total_tokens + tokens_to_generate + example_tokens > max_seq_length:
num_noises -= incremental
break
num_noises += incremental
print("Num noises:", num_noises)
# Generate samples
for index in tqdm(range(num_samples)):
used_noises = num_noises
while True:
try:
input_text, answer = generate_input_output(
used_noises,
num_chains,
num_hops,
is_icl=add_fewshot & (icl_example is None),
)
length = (
len(TOKENIZER(input_text).input_ids)
+ tokens_to_generate
+ example_tokens
)
assert length <= max_seq_length, f"{length} exceeds max_seq_length."
break
            except Exception:
                # Prompt exceeded max_seq_length: retry with fewer noise
                # sentences.
                if used_noises > incremental:
                    used_noises -= incremental
if add_fewshot and (icl_example is not None):
# insert icl_example between model template and input
cutoff = input_text.index(TASKS["variable_tracking"]["template"][:20])
input_text = (
input_text[:cutoff]
+ randomize_icl(icl_example)
+ "\n\n"
+ input_text[cutoff:]
)
if remove_newline_tab:
input_text = " ".join(
input_text.replace("\n", " ").replace("\t", " ").strip().split()
)
formatted_output = {
"index": index,
"input": input_text,
"outputs": answer,
"length": length,
"max_length": max_seq_length,
}
write_jsons.append(formatted_output)
return write_jsons
def get_dataset(pretrained, seq=None, **kwargs):
tokenizer = get_tokenizer(pretrained)
icl_example = sys_vartrack_w_noise_random(
tokenizer=tokenizer, num_samples=1, max_seq_length=500, incremental=5
)[0]
write_jsons = sys_vartrack_w_noise_random(
tokenizer=tokenizer,
num_samples=500,
max_seq_length=seq,
icl_example=icl_example,
)
return write_jsons
def get_vt_dataset(pretrained=None):
    """Entry point used by the ruler_vt YAML: build the variable-tracking
    samples for every length in SEQ_LENGTHS and return them as one test
    split."""
df = (get_dataset(pretrained, seq=seq) for seq in SEQ_LENGTHS)
return {
"test": datasets.Dataset.from_list(
list(itertools.chain.from_iterable(df)), split=datasets.Split.TEST
)
}
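# Quick local check (illustrative, not part of the harness pipeline, and slow
# since it generates 500 samples per sequence length): build the split with
# some tokenizer and peek at one record. "gpt2" is just a stand-in; the
# harness passes the evaluated model's name as `pretrained`.
if __name__ == "__main__":
    ds = get_vt_dataset(pretrained="gpt2")["test"]
    print(len(ds), ds[0]["length"], ds[0]["outputs"])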