Commit c3aaad59 authored by Baber's avatar Baber
Browse files

add niah

parent d684b9eb
...@@ -60,6 +60,7 @@ class TaskConfig(dict): ...@@ -60,6 +60,7 @@ class TaskConfig(dict):
# HF dataset options. # HF dataset options.
# which dataset to use, # which dataset to use,
# and what splits for what purpose # and what splits for what purpose
download_dataset: Optional[bool] = None
dataset_path: Optional[str] = None dataset_path: Optional[str] = None
dataset_name: Optional[str] = None dataset_name: Optional[str] = None
dataset_kwargs: Optional[dict] = None dataset_kwargs: Optional[dict] = None
...@@ -817,7 +818,10 @@ class ConfigurableTask(Task): ...@@ -817,7 +818,10 @@ class ConfigurableTask(Task):
) )
self._higher_is_better[metric_name] = is_higher_better(metric_name) self._higher_is_better[metric_name] = is_higher_better(metric_name)
self.download(self.config.dataset_kwargs) if self.config.download_dataset is None:
self.download(self.config.dataset_kwargs)
else:
self.dataset = self.config.download_dataset()
self._training_docs = None self._training_docs = None
self._fewshot_docs = None self._fewshot_docs = None
......
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
import glob
import os
import shutil
import urllib.request
from functools import cache
import html2text
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
@cache
def get_essays():
    """Download the Paul Graham essay corpus used as RULER haystack filler.

    Fetches the RULER URL list, downloads each essay (HTML pages are converted
    to plain text, raw .txt files are taken verbatim), concatenates everything
    and returns ``{"text": <full corpus>}``. The temp folders are removed when
    done. ``@cache`` ensures the download runs at most once per process.
    """
    temp_folder_repo = "essay_repo"
    temp_folder_html = "essay_html"
    os.makedirs(temp_folder_repo, exist_ok=True)
    os.makedirs(temp_folder_html, exist_ok=True)

    # HTML -> plain text converter configuration.
    h = html2text.HTML2Text()
    h.ignore_images = True
    h.ignore_tables = True
    h.escape_all = True
    h.reference_links = False
    h.mark_code = False

    url = "https://raw.githubusercontent.com/NVIDIA/RULER/main/scripts/data/synthetic/json/PaulGrahamEssays_URLs.txt"
    response = requests.get(url)
    response.raise_for_status()
    urls = response.text.splitlines()

    for url in tqdm(urls):
        if ".html" in url:
            filename = url.split("/")[-1].replace(".html", ".txt")
            try:
                with urllib.request.urlopen(url) as website:
                    # BUGFIX: bytes.decode() takes (encoding, errors); the
                    # original passed "utf-8" as the *errors* handler, which is
                    # not a registered handler and would raise LookupError on
                    # any malformed escape. Use a valid handler instead.
                    content = website.read().decode("unicode_escape", errors="replace")
                soup = BeautifulSoup(content, "html.parser")
                specific_tag = soup.find("font")
                parsed = h.handle(str(specific_tag))
                with open(
                    os.path.join(temp_folder_html, filename), "w", encoding="utf-8"
                ) as file:
                    file.write(parsed)
            except Exception as e:
                print(f"Fail download (unknown), ({e})")
        else:
            filename = url.split("/")[-1]
            try:
                with urllib.request.urlopen(url) as website:
                    content = website.read().decode("utf-8")
                with open(
                    os.path.join(temp_folder_repo, filename), "w", encoding="utf-8"
                ) as file:
                    file.write(content)
            except Exception as e:
                print(f"Fail download (unknown), ({e})")

    files_repo = sorted(glob.glob(os.path.join(temp_folder_repo, "*.txt")))
    files_html = sorted(glob.glob(os.path.join(temp_folder_html, "*.txt")))
    print(
        f"Download {len(files_repo)} essays from `https://github.com/gkamradt/LLMTest_NeedleInAHaystack/`"
    )
    print(f"Download {len(files_html)} essays from `http://www.paulgraham.com/`")

    # Join once instead of quadratic `text +=` concatenation.
    parts = []
    for file in files_repo + files_html:
        with open(file, "r", encoding="utf-8") as f:
            parts.append(f.read())
    text = "".join(parts)

    shutil.rmtree(temp_folder_repo)
    shutil.rmtree(temp_folder_html)
    return {"text": text}
import os
import random
import uuid
import numpy as np
import wonderwords
from nltk import sent_tokenize
from tqdm import tqdm
from transformers import AutoTokenizer
# Tokenizer used to measure prompt token counts; model name comes from the
# TOKENIZER env var. NOTE(review): if TOKENIZER is unset, from_pretrained(None)
# will fail at import time — confirm callers export it first.
TOKENIZER = AutoTokenizer.from_pretrained(os.environ.get("TOKENIZER"))
# Running count of generated samples whose first gold answer is missing from
# their own input text (incremented inside generate_samples).
COUNT = 0
NUM_SAMPLES = 500  # NOTE(review): appears unused in this module — confirm
REMOVE_NEWLINE_TAB = ""  # NOTE(review): appears unused in this module — confirm
STOP_WORDS = ""  # NOTE(review): appears unused in this module — confirm
RANDOM_SEED = 42  # default seed for the needle-order shuffle
# # Define Needle/Haystack Format
NEEDLE = "One of the special magic {type_needle_v} for {key} is: {value}."
# Words
# NOTE(review): these call wonderwords' *private* helper to load its bundled
# word lists — may break on a wonderwords upgrade.
nouns = wonderwords.random_word._get_words_from_text_file("nounlist.txt")
adjs = wonderwords.random_word._get_words_from_text_file("adjectivelist.txt")
verbs = wonderwords.random_word._get_words_from_text_file("verblist.txt")
# Key vocabulary: every "adjective-noun" compound, deduplicated and sorted.
words = [f"{adj}-{noun}" for adj in adjs for noun in nouns]
WORDS = sorted(list(set(words)))
# Positions
# 40 evenly spaced needle-insertion depths, as integer percentages 0..100.
DEPTHS = list(np.round(np.linspace(0, 100, num=40, endpoint=True)).astype(int))
def generate_random_number(num_digits=7):
    """Return a uniformly random *num_digits*-digit number as a string."""
    low = 10 ** (num_digits - 1)
    high = 10 ** num_digits - 1
    return str(random.randint(low, high))
def generate_random_word():
    """Pick one random adjective-noun compound from the precomputed WORDS list."""
    return random.choice(WORDS)
def generate_random_uuid():
    """Return a version-4 UUID string drawn from the (seedable) `random` state."""
    bits = random.getrandbits(128)
    return str(uuid.UUID(int=bits, version=4))
def generate_random(type_needle: str):
    """Generate one random needle key/value of the requested type.

    Supported types: "numbers", "words", "uuids".
    Raises NotImplementedError for anything else.
    """
    generators = {
        "numbers": generate_random_number,
        "words": generate_random_word,
        "uuids": generate_random_uuid,
    }
    generator = generators.get(type_needle)
    if generator is None:
        raise NotImplementedError(f"{type_needle} is not implemented.")
    return generator()
def generate_input_output(
    num_haystack: int,
    haystack: list[str] | str,
    *,
    type_haystack: str,
    num_needle_k: int,
    type_needle_k: str,
    num_needle_v: int,
    type_needle_v: str,
    template: str,
    num_needle_q: int = 1,
    random_seed: int = RANDOM_SEED,
):
    """Build a single needle-in-a-haystack prompt and its gold answers.

    Args:
        num_haystack: Amount of filler (words for "essay", sentence count otherwise).
        haystack: Word list ("essay") or a sentence / needle-template string.
        type_haystack: One of "essay", "repeat", "needle".
        num_needle_k: Number of distinct keys hidden in the context.
        type_needle_k: Key type ("numbers" | "words" | "uuids").
        num_needle_v: Number of values hidden per key.
        type_needle_v: Value type ("numbers" | "words" | "uuids").
        template: Prompt template with {type_needle_v}/{context}/{query} slots.
        num_needle_q: How many keys the question asks about.
        random_seed: Seed for the needle-order shuffle only; all other
            randomness uses the global `random` state.

    Returns:
        (input_text, answers): the formatted prompt and the list of gold values.
    """
    # One needle sentence per (key, value) pair.
    needle_template = "One of the special magic {type_needle_v} for {key} is: {value}."
    keys, values, needles = [], [], []
    for _ in range(num_needle_k):
        keys.append(generate_random(type_needle_k))
        value = []
        for _ in range(num_needle_v):
            value.append(generate_random(type_needle_v))
            needles.append(
                needle_template.format(
                    type_needle_v=type_needle_v,
                    key=keys[-1],
                    value=value[-1],
                )
            )
        values.append(value)

    # Dedicated RNG so needle order is reproducible for a given seed.
    random.Random(random_seed).shuffle(needles)

    # Context
    if type_haystack == "essay":
        assert isinstance(haystack, list)
        text = " ".join(haystack[:num_haystack])
        document_sents = sent_tokenize(text.strip())
        # Sentence indices where needles get spliced in, at random depths.
        insertion_positions = (
            [0]
            + sorted(
                int(len(document_sents) * (depth / 100))
                for depth in random.sample(DEPTHS, len(needles))
            )
            + [len(document_sents)]
        )
        document_sents_list = []
        for i in range(1, len(insertion_positions)):
            last_pos = insertion_positions[i - 1]
            next_pos = insertion_positions[i]
            document_sents_list.append(" ".join(document_sents[last_pos:next_pos]))
            if i - 1 < len(needles):
                document_sents_list.append(needles[i - 1])
        context = " ".join(document_sents_list)
    else:
        if type_haystack == "repeat":
            sentences = [haystack] * num_haystack
        elif type_haystack == "needle":
            # Distractor needles with random keys/values.
            sentences = [
                haystack.format(
                    type_needle_v=type_needle_v,
                    key=generate_random(type_needle_k),
                    value=generate_random(type_needle_v),
                )
                for _ in range(num_haystack)
            ]
        # Insert from the back so earlier insertion indices stay valid.
        indexes = sorted(random.sample(range(num_haystack), len(needles)), reverse=True)
        for index, element in zip(indexes, needles):
            sentences.insert(index, element)
        context = "\n".join(sentences)

    # Query and answers
    indices = random.sample(range(num_needle_k), num_needle_q)
    queries = [keys[i] for i in indices]
    answers = [a for i in indices for a in values[i]]
    query = (
        ", ".join(queries[:-1]) + ", and " + queries[-1]
        if len(queries) > 1
        else queries[0]
    )

    # Singular phrasing when exactly one answer is expected.
    # (Removed the original's no-op `template = template` /
    # `type_needle_v = type_needle_v` self-assignments.)
    if num_needle_q * num_needle_v == 1:
        template = template.replace("Some", "A")
        template = template.replace("are all", "is")
        template = template.replace("are", "is")
        template = template.replace("answers", "answer")
        type_needle_v = type_needle_v[:-1]  # remove "s"

    input_text = template.format(
        type_needle_v=type_needle_v,
        context=context,
        query=query,
    )
    return input_text, answers
def generate_samples(
    haystack,
    *,
    max_seq_length: int,
    type_haystack: str,
    type_needle_k: str,
    type_needle_v: str,
    template: str,
    num_samples: int = 500,
    tokens_to_generate: int = 128,
    num_needle_v: int = 1,
    num_needle_k: int = 1,
    num_needle_q=1,
    incremental: int = 500,
    remove_newline_tab: bool = False,
    random_seed: int = 42,
):
    """Generate `num_samples` NIAH examples fitting a `max_seq_length` token budget.

    Phase 1 grows the haystack until the prompt would overflow the budget;
    phase 2 draws samples at that size, shrinking per-draw when an individual
    draw overflows.

    Note: `incremental` is unconditionally overridden below based on
    `type_haystack` (pre-existing behavior; parameter kept for compatibility).

    Returns:
        List of dicts with keys: index, input, outputs, length, max_length.
    """
    global COUNT
    # The question must be answerable: at least as many hidden keys as queried.
    num_needle_k = max(num_needle_k, num_needle_q)
    write_jsons = []

    # Haystack growth step per haystack type.
    if type_haystack == "essay":
        incremental = 500
    elif type_haystack == "repeat":
        incremental = 25
    elif type_haystack == "needle":
        incremental = 25
    if type_haystack != "essay" and max_seq_length < 4096:
        incremental = 5

    # Phase 1: find the largest haystack size that still fits the budget.
    num_haystack = incremental
    total_tokens = 0  # Track the total tokens generated for the first example
    while total_tokens + tokens_to_generate < max_seq_length:
        input_text, answer = generate_input_output(
            num_haystack,
            haystack,
            type_haystack=type_haystack,
            num_needle_k=num_needle_k,
            type_needle_k=type_needle_k,
            num_needle_v=num_needle_v,
            type_needle_v=type_needle_v,
            template=template,
            num_needle_q=num_needle_q,
            random_seed=random_seed,
        )
        # Token count of prompt plus gold answers.
        total_tokens = len(TOKENIZER(input_text + " ".join(answer)).input_ids)
        if total_tokens + tokens_to_generate > max_seq_length:
            num_haystack -= incremental
            break
        if type_haystack == "essay" and num_haystack > len(haystack):
            num_haystack = len(haystack)
            break
        num_haystack += incremental
    print("Num haystack:", num_haystack)

    # Phase 2: generate the samples.
    for index in tqdm(range(num_samples)):
        used_haystack = num_haystack
        # Fallbacks in case every retry fails (see retry cap below); also
        # prevents NameError if the very first draw raises before assignment.
        input_text, answer, length = "", [], 0
        retries_at_min = 0
        while True:
            try:
                input_text, answer = generate_input_output(
                    used_haystack,
                    haystack,
                    type_haystack=type_haystack,
                    num_needle_k=num_needle_k,
                    type_needle_k=type_needle_k,
                    num_needle_v=num_needle_v,
                    type_needle_v=type_needle_v,
                    template=template,
                    num_needle_q=num_needle_q,
                    random_seed=random_seed,
                )
                length = len(TOKENIZER(input_text).input_ids) + tokens_to_generate
                assert length <= max_seq_length, f"{length} exceeds max_seq_length."
                break
            except Exception:
                # BUGFIX: the original bare `except:` retried forever once
                # used_haystack could no longer shrink; cap retries at the
                # minimum size so the loop always terminates.
                if used_haystack > incremental:
                    used_haystack -= incremental
                else:
                    retries_at_min += 1
                    if retries_at_min >= 100:
                        break
        if remove_newline_tab:
            input_text = " ".join(
                input_text.replace("\n", " ").replace("\t", " ").strip().split()
            )
        formatted_output = {
            "index": index,
            "input": input_text,
            "outputs": answer,
            "length": length,
            "max_length": max_seq_length,
        }
        # Count samples whose first gold answer never made it into the prompt.
        if formatted_output["outputs"] and (
            formatted_output["outputs"][0] not in formatted_output["input"]
        ):
            COUNT += 1
        write_jsons.append(formatted_output)
    print(COUNT)
    return write_jsons
# RULER needle-in-a-haystack (NIAH) task for lm-evaluation-harness.
tag:
  - ruler
task: niah_1
# Data is generated on the fly by the !function below; no HF dataset is used.
# NOTE(review): the task is named niah_1 but builds niah_single_2 data —
# confirm this pairing is intended.
dataset_path: ""
dataset_name: ""
output_type: generate_until
test_split: test
download_dataset: !function utils.niah_single_2
doc_to_text: "{{input}}"
doc_to_target: "{{outputs[0]}}" #" {{answer.split('### ')[-1].rstrip()}}"
process_results: !function utils.process_results
# One metric per evaluated context length; process_results records the score
# under the document's own length and a -1.0 sentinel under all others.
metric_list:
  - metric: "4096"
    aggregation: !function utils.aggregate_metrics
    higher_is_better: true
  - metric: "8192"
    aggregation: !function utils.aggregate_metrics
    higher_is_better: true
  - metric: "16384"
    aggregation: !function utils.aggregate_metrics
    higher_is_better: true
  - metric: "32768"
    aggregation: !function utils.aggregate_metrics
    higher_is_better: true
  - metric: "65536"
    aggregation: !function utils.aggregate_metrics
    higher_is_better: true
  - metric: "131072"
    aggregation: !function utils.aggregate_metrics
    higher_is_better: true
generation_kwargs:
  do_sample: true
  temperature: 1.0
  max_gen_toks: 128
  until: []
repeats: 1
metadata:
  version: 3.0
# noqa
import itertools
import json
import os
import re
from functools import partial
from typing import Literal
import datasets
from transformers import AutoTokenizer
from lm_eval.tasks.ruler.essays import get_essays
from lm_eval.tasks.ruler.prepare import generate_samples
# Tokenizer for measuring prompt lengths; model name comes from the TOKENIZER
# env var. NOTE(review): from_pretrained(None) fails if it is unset — confirm
# callers export it before importing this module.
TOKENIZER = AutoTokenizer.from_pretrained(os.environ.get("TOKENIZER"))
# Prompt template: context with hidden "magic" needles followed by the query.
TEMPLATE = """Some special magic {type_needle_v} are hidden within the following text. Make sure to memorize it. I will quiz you about the {type_needle_v} afterwards.\n{context}\nWhat are all the special magic {type_needle_v} for {query} mentioned in the provided text?"""
# Context lengths evaluated, longest first.
SEQ_LENGTHS = (
    131072,
    65536,
    32768,
    16384,
    8192,
    4096,
)
NUM_SAMPLES = 500  # NOTE(review): appears unused in this module — confirm
REMOVE_NEWLINE_TAB = ""  # NOTE(review): appears unused in this module — confirm
STOP_WORDS = ""  # NOTE(review): appears unused in this module — confirm
RANDOM_SEED = 42  # NOTE(review): appears unused in this module — confirm
def get_haystack(type_haystack: Literal["essay", "repeat", "needle"]):
    """Return the filler ("haystack") material for a NIAH variant.

    - "essay": list of whitespace-normalized words from the Paul Graham essays
    - "repeat": a fixed filler sentence (str)
    - "needle": the needle template itself (str), used to build distractors
    """
    needle_template = "One of the special magic {type_needle_v} for {key} is: {value}."
    if type_haystack == "repeat":
        return "The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again."
    if type_haystack == "needle":
        return needle_template
    if type_haystack == "essay":
        essay_text = get_essays()["text"]
        return re.sub(r"\s+", " ", essay_text).split(" ")
    raise NotImplementedError(f"{type_haystack} is not implemented.")
def flatten(df):
    """Merge an iterable of record lists into a single HF "test" split dict."""
    records = list(itertools.chain.from_iterable(df))
    test_split = datasets.Dataset.from_list(records, split=datasets.Split.TEST)
    return {"test": test_split}
def _build_task(type_haystack: str, **kwargs):
    """Generate one sample set per SEQ_LENGTHS entry and flatten into a test split.

    `type_haystack` selects both the haystack source and the generation mode;
    extra kwargs are forwarded to generate_samples (num_needle_k, num_needle_v,
    num_needle_q, ...).
    """
    hay = get_haystack(type_haystack=type_haystack)
    return flatten(
        generate_samples(
            hay,
            max_seq_length=seq,
            template=TEMPLATE,
            type_haystack=type_haystack,
            **kwargs,
        )
        for seq in SEQ_LENGTHS
    )


# The builders below were `name = lambda: ...` assignments (PEP 8 E731) with
# the same 8-line pattern repeated eight times; converted to plain functions
# sharing one helper. Names and zero-arg call signatures are unchanged, so the
# YAML `!function utils.<name>` bindings still resolve.
def niah_single_1():
    """Single needle: repeated-sentence haystack, word keys, number values."""
    return _build_task("repeat", type_needle_k="words", type_needle_v="numbers")


def niah_single_2():
    """Single needle: essay haystack, word keys, number values."""
    return _build_task("essay", type_needle_k="words", type_needle_v="numbers")


def niah_single_3():
    """Single needle: essay haystack, word keys, UUID values."""
    return _build_task("essay", type_needle_k="words", type_needle_v="uuids")


def niah_multikey_1():
    """Four keys hidden in an essay haystack."""
    return _build_task(
        "essay", type_needle_k="words", type_needle_v="numbers", num_needle_k=4
    )


def niah_multikey_2():
    """Needle-formatted haystack (distractor needles), word keys."""
    return _build_task("needle", type_needle_k="words", type_needle_v="numbers")


def niah_multikey_3():
    """Needle-formatted haystack with UUID keys and values."""
    return _build_task("needle", type_needle_k="uuids", type_needle_v="uuids")


def niah_multivalue():
    """Four values per key hidden in an essay haystack."""
    return _build_task(
        "essay", type_needle_k="words", type_needle_v="numbers", num_needle_v=4
    )


def niah_multiquery():
    """Four keys queried simultaneously in an essay haystack."""
    return _build_task(
        "essay", type_needle_k="words", type_needle_v="numbers", num_needle_q=4
    )
def postprocess_pred(predict_str: str):
    """Normalize a model prediction: trim it and map control chars to newlines."""
    control_chars = re.compile(r"[\x00-\x1f]")
    cleaned = control_chars.sub("\n", predict_str.strip())
    return cleaned.strip()
def process_results(doc, results):
    """Score one generation for lm-eval.

    Returns a dict with one entry per context length: the entry matching the
    document's own max_length gets the accuracy; all others keep the -1.0
    sentinel (to be ignored at aggregation time).
    """
    metrics = {str(length): -1.0 for length in SEQ_LENGTHS}
    pred = postprocess_pred(results[0])
    # BUGFIX: the original tested `pred in doc["input"]` — i.e. whether the
    # *prediction* occurs in the *prompt*, which is trivially 1.0 whenever the
    # model echoes any prompt substring. Correct direction: every gold answer
    # must occur in the prediction (RULER string-match scoring).
    acc = 1.0 if all(ans in pred for ans in doc["outputs"]) else 0.0
    input_len = doc["max_length"]
    # BUGFIX: SEQ_LENGTHS is sorted descending, so the original
    # `next(length for ... if input_len <= length)` always picked the largest
    # length (131072); use min() to credit the tightest matching bucket.
    metrics[str(min(length for length in SEQ_LENGTHS if input_len <= length))] = acc
    return metrics
def aggregate_metrics(metrics):
    """Average the per-document scores for one length metric, ignoring sentinels.

    lm-eval binds this per metric key in the YAML, so `metrics` is the list of
    values recorded under that single length; documents of other lengths
    contribute the -1.0 sentinel and must be excluded from the mean.

    BUGFIX: the original indexed each item with the *int* length even though
    process_results keys its dict with str(length), and it averaged the -1.0
    sentinels into every score. Returns -1.0 when no document of this length
    was evaluated.
    """
    scored = [value for value in metrics if value >= 0.0]
    return sum(scored) / len(scored) if scored else -1.0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment