Commit f604d6c4 authored by Baber

add more ruler tasks

parent 556a79b9
... @@ -121,78 +121,3 @@ async def get_essays() -> Dict[str, str]:
def get_all_essays() -> Dict[str, str]:
    """Synchronous wrapper for get_essays()"""
    return asyncio.run(get_essays())
# @cache
# def get_essays():
# temp_folder_repo = "essay_repo"
# temp_folder_html = "essay_html"
# os.makedirs(temp_folder_repo, exist_ok=True)
# os.makedirs(temp_folder_html, exist_ok=True)
#
# h = html2text.HTML2Text()
# h.ignore_images = True
# h.ignore_tables = True
# h.escape_all = True
# h.reference_links = False
# h.mark_code = False
#
# url = "https://raw.githubusercontent.com/NVIDIA/RULER/main/scripts/data/synthetic/json/PaulGrahamEssays_URLs.txt"
# response = requests.get(url)
# response.raise_for_status()
#
# # The content is now in memory as a string
# content = response.text
#
# # If you want to process it line by line:
# urls = content.splitlines()
#
# for url in tqdm(urls):
# if ".html" in url:
# filename = url.split("/")[-1].replace(".html", ".txt")
# try:
# with urllib.request.urlopen(url) as website:
# content = website.read().decode("unicode_escape", "utf-8")
# soup = BeautifulSoup(content, "html.parser")
# specific_tag = soup.find("font")
# parsed = h.handle(str(specific_tag))
#
# with open(os.path.join(temp_folder_html, filename), "w") as file:
# file.write(parsed)
#
# except Exception as e:
# print(f"Fail download {filename}, ({e})")
#
# else:
# filename = url.split("/")[-1]
# try:
# with urllib.request.urlopen(url) as website:
# content = website.read().decode("utf-8")
#
# with open(os.path.join(temp_folder_repo, filename), "w") as file:
# file.write(content)
#
# except Exception as e:
# print(f"Fail download {filename}, ({e})")
#
# files_repo = sorted(glob.glob(os.path.join(temp_folder_repo, "*.txt")))
# files_html = sorted(glob.glob(os.path.join(temp_folder_html, "*.txt")))
# print(
# f"Download {len(files_repo)} essays from `https://github.com/gkamradt/LLMTest_NeedleInAHaystack/`"
# )
# print(f"Download {len(files_html)} essays from `http://www.paulgraham.com/`")
#
# text = ""
# for file in files_repo + files_html:
# with open(file, "r") as f:
# text += f.read()
#
# shutil.rmtree(temp_folder_repo)
# shutil.rmtree(temp_folder_html)
# return {"text": text}
#
# # with open('PaulGrahamEssays.json', 'w') as f:
# # json.dump({"text": text}, f)
# #
# # shutil.rmtree(temp_folder_repo)
# # shutil.rmtree(temp_folder_html)
... @@ -14,12 +14,12 @@ metric_list:
  - metric: "4096"
    aggregation: !function utils.aggregate_metrics
    higher_is_better: true
  - metric: "8192"
    aggregation: !function utils.aggregate_metrics
    higher_is_better: true
  - metric: "16384"
    aggregation: !function utils.aggregate_metrics
    higher_is_better: true
  # - metric: "32768"
  #   aggregation: !function utils.aggregate_metrics
  #   higher_is_better: true
...
task: niah_2
include: niah_1.yaml
download_dataset: !function utils.niah_single_2

task: niah_3
include: niah_1.yaml
download_dataset: !function utils.niah_single_3

task: niah_4
include: niah_1.yaml
download_dataset: !function utils.niah_multikey_1

task: niah_5
include: niah_1.yaml
download_dataset: !function utils.niah_multikey_2

task: niah_6
include: niah_1.yaml
download_dataset: !function utils.niah_multikey_3

task: niah_7
include: niah_1.yaml
download_dataset: !function utils.niah_multivalue

task: niah_7
include: niah_1.yaml
download_dataset: !function vt_utils.get_vt_dataset
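Each stub above overrides only `task` and the `download_dataset` hook and inherits everything else from niah_1.yaml via `include`. Judging from `get_vt_dataset` later in this commit, a `download_dataset` callable is expected to return a mapping from split name to a `datasets.Dataset`. The sketch below is a minimal illustration of that contract; the function name and record values are made up, and the field names simply mirror the records that vt_utils.py builds.

import datasets


def my_download_dataset(pretrained=None, **kwargs):
    # Hypothetical hook: one record per sample, using the same field names
    # ("index", "input", "outputs", "length", "max_length") as the
    # variable-tracking records built in vt_utils.py below.
    records = [
        {
            "index": 0,
            "input": "haystack text ... needle ... question",
            "outputs": ["needle"],
            "length": 4096,
            "max_length": 4096,
        }
    ]
    return {
        "test": datasets.Dataset.from_list(records, split=datasets.Split.TEST)
    }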
... @@ -21,7 +21,14 @@ NUM_SAMPLES = 500
REMOVE_NEWLINE_TAB = ""
STOP_WORDS = ""
RANDOM_SEED = 42
SEQ_LENGTHS = (
    # 131072,
    # 65536,
    # 32768,
    16384,
    8192,
    4096,
)
# # Define Needle/Haystack Format
NEEDLE = "One of the special magic {type_needle_v} for {key} is: {value}."
...
include: niah_1.yaml
task: ruler_qa
dataset_path: rajpurkar/squad_v2
dataset_name: main
process_docs: !function qa_utils.process_squad_docs
test_split: validation
download_dataset: ""
# import random
# from collections import defaultdict
#
# from datasets import tqdm
#
#
# # def process_qa_docs(df: datasets.Dataset) -> datasets.Dataset:
# # df = df.rename_column("question", "query")
# # df = df.rename_column("answers", "outputs")
# # df = df.map(lambda x: {"outputs": x["outputs"].get("text", [])})
# # return df
#
#
# def process_qa_docs(dataset):
# contexts = sorted(list(set(dataset["context"])))
# ctx_to_id = {c: i for i, c in enumerate(contexts)}
#
# title_contexts = defaultdict(list)
# for ex in dataset:
# title_contexts[ex["title"]].append(ctx_to_id[ex["context"]])
#
# # Process function for map
# def process_example(example):
# if len(example["answers"]["text"]) == 0:
# return None
#
# ctx_id = ctx_to_id[example["context"]]
# return {
# "query": example["question"],
# "outputs": example["answers"]["text"],
# "context": [ctx_id],
# "more_context": [
# i for i in title_contexts[example["title"]] if i != ctx_id
# ],
# }
#
# return dataset.map(
# process_example,
# remove_columns=dataset.column_names,
# filter_fn=lambda x: x is not None,
# )
#
#
# def generate_input_output(index, num_docs, dataset, random_seed=42):
# # Get current example
# example = dataset[index]
# curr_q = example["query"]
# curr_a = example["outputs"]
# curr_docs = example["context"]
# curr_more = example["more_context"]
# all_contexts = example["all_contexts"]
#
# if num_docs < len(all_contexts):
# if (num_docs - len(curr_docs)) > len(curr_more):
# # Need additional random docs
# addition_docs = [
# i for i in range(len(all_contexts)) if i not in curr_docs + curr_more
# ]
# all_docs = (
# curr_docs
# + curr_more
# + random.sample(
# addition_docs, max(0, num_docs - len(curr_docs) - len(curr_more))
# )
# )
# else:
# # Can just use some of the more_context
# all_docs = curr_docs + random.sample(curr_more, num_docs - len(curr_docs))
#
# all_docs = [all_contexts[idx] for idx in all_docs]
# else:
# all_docs = all_contexts
#
# random.Random(random_seed).shuffle(all_docs)
#
# # Assuming DOCUMENT_PROMPT is something like "Document {i}: {document}"
# DOCUMENT_PROMPT = "Document {i}: {document}"
# context = "\n\n".join(
# [DOCUMENT_PROMPT.format(i=i + 1, document=d) for i, d in enumerate(all_docs)]
# )
#
# # Assuming template is something like "{context}\nQuestion: {query}"
# template = "{context}\nQuestion: {query}"
# input_text = template.format(context=context, query=curr_q)
#
# return input_text, curr_a
#
#
# def get_total_docs(example):
# return len(example["all_contexts"]) if "all_contexts" in example else 0
#
#
# def generate_samples(
# num_samples: int, max_seq_length: int, save_dir: str, incremental: int = 10
# ):
# write_jsons = []
# tokens_to_generate = tokens_to_generate
#
# # Find the perfect num_docs
# num_docs = incremental
#
# total_tokens = 0 # Track the total tokens generated for this example
# while total_tokens + tokens_to_generate < max_seq_length:
# input_text, answer = generate_input_output(0, num_docs)
# # Calculate the number of tokens in the example
# total_tokens = len(TOKENIZER(input_text + f" {answer}").input_ids)
# print(
# f"Max length {max_seq_length} | Current length {total_tokens + tokens_to_generate} | Docs: {num_docs}"
# )
# if total_tokens + tokens_to_generate > max_seq_length:
# num_docs -= incremental
# break
#
# num_docs += incremental
# if num_docs > len(DOCS):
# num_docs = len(DOCS)
# break
# print("Number of documents:", num_docs)
#
# # Generate samples
# for index in tqdm(range(num_samples)):
# used_docs = num_docs
# while True:
# try:
# input_text, answer = generate_input_output(
# index + pre_samples, used_docs
# )
# length = len(TOKENIZER(input_text).input_ids) + tokens_to_generate
# assert length <= max_seq_length, f"{length} exceeds max_seq_length."
# break
# except:
# if used_docs > incremental:
# used_docs -= incremental
#
# if remove_newline_tab:
# input_text = " ".join(
# input_text.replace("\n", " ").replace("\t", " ").strip().split()
# )
#
# formatted_output = {
# "index": index,
# "input": input_text,
# "outputs": answer,
# "length": length,
# }
# write_jsons.append(formatted_output)
#
# return write_jsons
# noqa
import itertools
import json
import os
import re
from functools import cache
from typing import Literal
import datasets
from transformers import AutoTokenizer
from lm_eval.tasks.ruler.essays import get_all_essays
from lm_eval.tasks.ruler.prepare import generate_samples
... @@ -25,8 +23,8 @@ SEQ_LENGTHS = (
    # 131072,
    # 65536,
    # 32768,
    16384,
    8192,
    4096,
)
...
include: niah_1.yaml
task: ruler_vt
download_dataset: !function vt_utils.get_vt_dataset
generation_kwargs:
  do_sample: false
  temperature: 0.0
  max_gen_toks: 128
  until: []
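These settings amount to greedy decoding with a 128-token generation budget and no extra stop strings. For intuition only, a rough Hugging Face equivalent is sketched below; mapping lm-eval's `max_gen_toks` onto `max_new_tokens` is an approximation of what the HF backend does, not the exact code path, and "gpt2" is just a small stand-in model.

from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tok("VAR AAAAA = 12345 . VAR BBBBB = VAR AAAAA .", return_tensors="pt")
out = model.generate(
    **inputs,
    do_sample=False,     # do_sample: false / temperature: 0.0 -> greedy
    max_new_tokens=128,  # counterpart of max_gen_toks: 128
)
print(tok.decode(out[0], skip_special_tokens=True))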
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# adapted from https://github.com/NVIDIA/RULER/blob/main/scripts/data/synthetic/variable_tracking.py
import itertools
import random
import string
from functools import cache
import datasets
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer
from lm_eval.tasks.ruler.prepare import SEQ_LENGTHS
TASKS = {
"variable_tracking": {
"tokens_to_generate": 30,
"template": """Memorize and track the chain(s) of variable assignment hidden in the following text.\n\n{context}\nQuestion: Find all variables that are assigned the value {query} in the text above.""",
"answer_prefix": """ Answer: According to the chain(s) of variable assignment in the text above, {num_v} variables are assgined the value {query}, they are: """,
},
}
TEMPLATE = (
    TASKS["variable_tracking"]["template"]
    + TASKS["variable_tracking"]["answer_prefix"]
)
@cache
def get_tokenizer(pretrained):
return AutoTokenizer.from_pretrained(pretrained, trust_remote_code=True)
def generate_chains(num_chains, num_hops, is_icl=False):
    """Build `num_chains` chains of variable assignments, each `num_hops + 1`
    variables long: the first variable in a chain is assigned a random number
    and every later variable is assigned the previous variable. Returns the
    variable names per chain and the assignment statements per chain."""
k = 5 if not is_icl else 3
num_hops = num_hops if not is_icl else min(10, num_hops)
vars_all = [
"".join(random.choices(string.ascii_uppercase, k=k)).upper()
for _ in range((num_hops + 1) * num_chains)
]
while len(set(vars_all)) < num_chains * (num_hops + 1):
vars_all.append("".join(random.choices(string.ascii_uppercase, k=k)).upper())
vars_ret = []
chains_ret = []
for i in range(0, len(vars_all), num_hops + 1):
this_vars = vars_all[i : i + num_hops + 1]
vars_ret.append(this_vars)
this_chain = [f"VAR {this_vars[0]} = {np.random.randint(10000, 99999)}"]
for j in range(num_hops):
this_chain.append(f"VAR {this_vars[j + 1]} = VAR {this_vars[j]} ")
chains_ret.append(this_chain)
return vars_ret, chains_ret
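# Illustration (not part of the task): with num_chains=1 and num_hops=4 a
# single chain looks roughly like
#   vars_ret   -> [["ABCDE", "FGHIJ", "KLMNO", "PQRST", "UVWXY"]]
#   chains_ret -> [["VAR ABCDE = 42017",
#                   "VAR FGHIJ = VAR ABCDE ",
#                   "VAR KLMNO = VAR FGHIJ ",
#                   "VAR PQRST = VAR KLMNO ",
#                   "VAR UVWXY = VAR PQRST "]]
# so every variable in the chain ultimately resolves to the same number; the
# exact names and value are random.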
def generate_input_output(num_noises, num_chains, num_hops, is_icl=False):
    """Interleave the assignment chains with `num_noises` repetitions of the
    noise sentence, then render the prompt from TEMPLATE. Returns the prompt
    and the first chain's variable names (the expected answer)."""
vars, chains = generate_chains(num_chains, num_hops, is_icl=is_icl)
noise = "The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again.\n"
# Create a list of the repeated noise
sentences = [noise] * num_noises
if len(sentences) <= len(chains[0]):
sentences = [
n + "." if len(n.strip()) > 0 else n
for n in [x for noise in sentences for x in noise.split(".")]
]
try:
assert len(sentences) > len(
chains[0]
), "Noises too short, unable to generate data"
    except AssertionError:
        # Not enough noise sentences to host the full chains: truncate them.
        print("Reducing chain length: not enough noise sentences")
        chains = [chain[: len(sentences) - 1] for chain in chains]
# sample random positions to insert variable assignment
for chain_i in chains:
# sample random positions (sorted) to insert variable assignment
positions = list(sorted(random.sample(range(len(sentences)), len(chain_i))))
for insert_pi, j in zip(positions, range(len(chain_i))):
sentences.insert(insert_pi + j, chain_i[j])
# Insert the passkey sentence at the random position
context = " ".join(sentences)
context = context.replace(". \n", ".\n")
template = TEMPLATE
if (
is_icl
and template
!= TASKS["variable_tracking"]["template"]
+ TASKS["variable_tracking"]["answer_prefix"]
):
# remove model template
cutoff = template.index(TASKS["variable_tracking"]["template"][:20])
cutoff_ans = template.index(TASKS["variable_tracking"]["answer_prefix"][:10])
template = (
" ".join(template[cutoff:cutoff_ans].split()[:-1]) + template[cutoff_ans:]
)
value = chains[0][0].split("=")[-1].strip()
input_text = template.format(context=context, query=value, num_v=num_hops + 1)
return input_text, vars[0]
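# Shape of one generated prompt (illustrative; names, the value and insert
# positions are random, and the noise sentence repeats far more often at
# real sequence lengths):
#   Memorize and track the chain(s) of variable assignment hidden in the
#   following text.
#
#   The grass is green. ... VAR ABCDE = 42017 ... VAR FGHIJ = VAR ABCDE ...
#   Question: Find all variables that are assigned the value 42017 in the
#   text above. Answer: According to the chain(s) of variable assignment in
#   the text above, 5 variables are ...
# The expected answer is the first chain's variable names, e.g.
# ["ABCDE", "FGHIJ", "KLMNO", "PQRST", "UVWXY"].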
def randomize_icl(icl_example):
    """Consistently rename the variables that form the ICL example's answer
    (everywhere they occur in the example) to fresh random names of the same
    length, so the demonstration does not leak a reusable answer."""
icl_tgt_cut = icl_example.index(TASKS["variable_tracking"]["answer_prefix"][-10:])
icl_tgt = icl_example[icl_tgt_cut + 10 :].strip().split()
for item in icl_tgt:
new_item = "".join(random.choices(string.ascii_uppercase, k=len(item))).upper()
icl_example = icl_example.replace(item, new_item)
return icl_example
def sys_vartrack_w_noise_random(
tokenizer,
num_samples: int,
max_seq_length: int,
incremental: int = 10,
num_chains: int = 1,
num_hops: int = 4,
add_fewshot: bool = True,
tokens_to_generate=30,
icl_example: dict = None,
remove_newline_tab=False,
):
    """Generate `num_samples` variable-tracking records whose prompts fit in
    `max_seq_length` tokens: grow the number of noise sentences in steps of
    `incremental` until the token budget is reached, then emit the samples,
    optionally prefixed with a randomized in-context example."""
    write_jsons = []
    # Find the largest num_noises that keeps the prompt under max_seq_length
    num_noises = incremental
TOKENIZER = tokenizer
total_tokens = 0 # Track the total tokens generated for this example
example_tokens = 0
if add_fewshot and (icl_example is not None):
icl_example_out = " ".join(icl_example["outputs"])
icl_example = icl_example["input"] + " " + icl_example_out + "\n\n"
example_tokens = len(TOKENIZER(icl_example).input_ids)
while total_tokens + tokens_to_generate + example_tokens < max_seq_length:
input_text, answer = generate_input_output(
num_noises, num_chains, num_hops, is_icl=add_fewshot & (icl_example is None)
)
# Calculate the number of tokens in the example
total_tokens = len(TOKENIZER(input_text + f" {answer}").input_ids)
print(
f"Max length {max_seq_length} | Current length {total_tokens + tokens_to_generate + example_tokens} | Noises: {num_noises}"
)
if total_tokens + tokens_to_generate + example_tokens > max_seq_length:
num_noises -= incremental
break
num_noises += incremental
print("Num noises:", num_noises)
# Generate samples
for index in tqdm(range(num_samples)):
used_noises = num_noises
while True:
try:
input_text, answer = generate_input_output(
used_noises,
num_chains,
num_hops,
is_icl=add_fewshot & (icl_example is None),
)
length = (
len(TOKENIZER(input_text).input_ids)
+ tokens_to_generate
+ example_tokens
)
assert length <= max_seq_length, f"{length} exceeds max_seq_length."
break
            except Exception:
                # Prompt exceeded max_seq_length: retry with fewer noise
                # sentences.
                if used_noises > incremental:
                    used_noises -= incremental
if add_fewshot and (icl_example is not None):
# insert icl_example between model template and input
cutoff = input_text.index(TASKS["variable_tracking"]["template"][:20])
input_text = (
input_text[:cutoff]
+ randomize_icl(icl_example)
+ "\n\n"
+ input_text[cutoff:]
)
if remove_newline_tab:
input_text = " ".join(
input_text.replace("\n", " ").replace("\t", " ").strip().split()
)
formatted_output = {
"index": index,
"input": input_text,
"outputs": answer,
"length": length,
"max_length": max_seq_length,
}
write_jsons.append(formatted_output)
return write_jsons
def get_dataset(pretrained, seq=None, **kwargs):
tokenizer = get_tokenizer(pretrained)
icl_example = sys_vartrack_w_noise_random(
tokenizer=tokenizer, num_samples=1, max_seq_length=500, incremental=5
)[0]
write_jsons = sys_vartrack_w_noise_random(
tokenizer=tokenizer,
num_samples=500,
max_seq_length=seq,
icl_example=icl_example,
)
return write_jsons
def get_vt_dataset(pretrained=None):
    """Entry point used by the ruler_vt YAML: build the variable-tracking
    samples for every length in SEQ_LENGTHS and return them as one test
    split."""
df = (get_dataset(pretrained, seq=seq) for seq in SEQ_LENGTHS)
return {
"test": datasets.Dataset.from_list(
list(itertools.chain.from_iterable(df)), split=datasets.Split.TEST
)
}
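# Quick local check (illustrative, not part of the harness pipeline, and slow
# since it generates 500 samples per sequence length): build the split with
# some tokenizer and peek at one record. "gpt2" is just a stand-in; the
# harness passes the evaluated model's name as `pretrained`.
if __name__ == "__main__":
    ds = get_vt_dataset(pretrained="gpt2")["test"]
    print(len(ds), ds[0]["length"], ds[0]["outputs"])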