Merge branch 'main' into longcxt

7f22572a · Baber · 5e2979d2 · f724be69 · 7f22572a · 7f22572a
Commit 7f22572a authored Jan 19, 2025 by Baber
19 changed files
--- a/lm_eval/tasks/catalan_bench/flores_ca/create_yamls_flores_ca.py
+++ b/lm_eval/tasks/catalan_bench/flores_ca/create_yamls_flores_ca.py
@@ -259,7 +259,7 @@ def doc_to_text(src: str, tgt: str) -> str:
    src_name, tgt_name = map(code_to_language_name, [src, tgt])
    return f"""\
-{src_name} sentence: {jinja_var('sentence_' + src)}
+{src_name} sentence: {jinja_var("sentence_" + src)}
 {tgt_name} sentence:"""

--- a/lm_eval/tasks/csatqa/utils.py
+++ b/lm_eval/tasks/csatqa/utils.py
@@ -7,7 +7,7 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
 ### Context: {doc["context"]}
 ### Question: {doc["question"]}
 ### Options:
-(1) {doc['option#1']}\n(2) {doc["option#2"]}\n(3) {doc["option#3"]}\n(4) {doc['option#4']}\n(5) {doc['option#5']}
+(1) {doc["option#1"]}\n(2) {doc["option#2"]}\n(3) {doc["option#3"]}\n(4) {doc["option#4"]}\n(5) {doc["option#5"]}
 ### Answer: 주어진 문제의 정답은"""
        out_doc = {

--- a/lm_eval/tasks/galician_bench/flores_gl/create_yamls_flores_gl.py
+++ b/lm_eval/tasks/galician_bench/flores_gl/create_yamls_flores_gl.py
@@ -258,7 +258,7 @@ def doc_to_text(src: str, tgt: str) -> str:
    src_name, tgt_name = map(code_to_language_name, [src, tgt])
    return f"""\
-{src_name} sentence: {jinja_var('sentence_' + src)}
+{src_name} sentence: {jinja_var("sentence_" + src)}
 {tgt_name} sentence:"""

--- a/lm_eval/tasks/ifeval/instructions.py
+++ b/lm_eval/tasks/ifeval/instructions.py
@@ -722,7 +722,7 @@ class RephraseChecker(Instruction):
        if not self.is_change(value):
            raise ValueError(
-                f"value {value} does not contain " "changes in the form of *change me*."
+                f"value {value} does not contain changes in the form of *change me*."
            )
        response_without_changes = self.strip_changes(value)

--- a/lm_eval/tasks/ifeval/instructions_util.py
+++ b/lm_eval/tasks/ifeval/instructions_util.py
@@ -35,10 +35,11 @@ RANK = os.environ.get("LOCAL_RANK", "0")
 def download_nltk_resources():
    """Download 'punkt' if not already installed"""
-    assert (
+    assert (nltk_version := parse_version(version("nltk"))) >= parse_version(
-        (nltk_version := parse_version(version("nltk")))
+        NLTK_MIN_VERSION
-        >= parse_version(NLTK_MIN_VERSION)
+    ), (
-    ), f"`nltk` version {nltk_version} is not >= {NLTK_MIN_VERSION}. Please update `nltk` before proceeding--older versions are vulnerable to a remote code execution vulnerability."
+        f"`nltk` version {nltk_version} is not >= {NLTK_MIN_VERSION}. Please update `nltk` before proceeding--older versions are vulnerable to a remote code execution vulnerability."
+    )
    try:
        nltk.data.find("tokenizers/punkt_tab")

--- a/lm_eval/tasks/japanese_leaderboard/ja_leaderboard_mgsm.py
+++ b/lm_eval/tasks/japanese_leaderboard/ja_leaderboard_mgsm.py
@@ -23,9 +23,9 @@ def _extract_answer(completion):
 def process_results(doc, results):
-    assert (
+    assert len(results) == 1, (
-        len(results) == 1
+        f"results should be a list with 1 str element, but is {results}"
-    ), f"results should be a list with 1 str element, but is {results}"
+    )
    completion = results[0]
    extracted_answer = _extract_answer(completion)

--- a/lm_eval/tasks/leaderboard/ifeval/instructions.py
+++ b/lm_eval/tasks/leaderboard/ifeval/instructions.py
@@ -722,7 +722,7 @@ class RephraseChecker(Instruction):
        if not self.is_change(value):
            raise ValueError(
-                f"value {value} does not contain " "changes in the form of *change me*."
+                f"value {value} does not contain changes in the form of *change me*."
            )
        response_without_changes = self.strip_changes(value)

--- a/lm_eval/tasks/leaderboard/ifeval/instructions_util.py
+++ b/lm_eval/tasks/leaderboard/ifeval/instructions_util.py
@@ -34,9 +34,9 @@ NLTK_MIN_VERSION = "3.9.1"
 def download_nltk_resources():
    """Download 'punkt' if not already installed"""
    nltk_version = pkg_resources.get_distribution("nltk").version
-    assert (
+    assert version.parse(nltk_version) >= version.parse(NLTK_MIN_VERSION), (
-        version.parse(nltk_version) >= version.parse(NLTK_MIN_VERSION)
+        f"`nltk` version {nltk_version} is not >= {NLTK_MIN_VERSION}. Please update `nltk` before proceeding--older versions are vulnerable to a remote code execution vulnerability."
-    ), f"`nltk` version {nltk_version} is not >= {NLTK_MIN_VERSION}. Please update `nltk` before proceeding--older versions are vulnerable to a remote code execution vulnerability."
+    )
    try:
        nltk.data.find("tokenizers/punkt_tab")

--- a/lm_eval/tasks/leaderboard/musr/utils.py
+++ b/lm_eval/tasks/leaderboard/musr/utils.py
@@ -8,7 +8,7 @@ def doc_to_choice(doc):
    return ast.literal_eval(doc["choices"])
-DOC_TO_TEXT = "{narrative}\n\n" "{question}\n\n" "{choices}\n" "Answer:"
+DOC_TO_TEXT = "{narrative}\n\n{question}\n\n{choices}\nAnswer:"
 def doc_to_text(doc):
@@ -17,7 +17,7 @@ def doc_to_text(doc):
    """
    choices = ""
    for i, choice in enumerate(ast.literal_eval(doc["choices"])):
-        choices += f"{i+1} - {choice}\n"
+        choices += f"{i + 1} - {choice}\n"
    text = DOC_TO_TEXT.format(
        narrative=doc["narrative"], question=doc["question"], choices=choices

--- a/lm_eval/tasks/lingoly/utils.py
+++ b/lm_eval/tasks/lingoly/utils.py
@@ -14,13 +14,13 @@ def load_questionsheet(qsheet: dict, no_context: bool = False):
            all_subquestions += "\n"
    if no_context:
-        prompt = f"""{qsheet['preamble']}
+        prompt = f"""{qsheet["preamble"]}
                 {all_subquestions}
                 """
    else:
-        prompt = f"""{qsheet['preamble']}
+        prompt = f"""{qsheet["preamble"]}
-                 {qsheet['context']}
+                 {qsheet["context"]}
                 {all_subquestions}
                 """

--- a/lm_eval/tasks/portuguese_bench/flores_pt/create_yamls_flores_pt.py
+++ b/lm_eval/tasks/portuguese_bench/flores_pt/create_yamls_flores_pt.py
@@ -258,7 +258,7 @@ def doc_to_text(src: str, tgt: str) -> str:
    src_name, tgt_name = map(code_to_language_name, [src, tgt])
    return f"""\
-{src_name} sentence: {jinja_var('sentence_' + src)}
+{src_name} sentence: {jinja_var("sentence_" + src)}
 {tgt_name} sentence:"""

--- a/lm_eval/tasks/score/non_greedy_summarizer.py
+++ b/lm_eval/tasks/score/non_greedy_summarizer.py
@@ -127,9 +127,9 @@ def main():
    for seed in range(1, N_SEEDS + 1):
        # Checking if directories exist
        seed_log_dir = os.path.join(args.log_dir, f"seed_{seed}")
-        assert os.path.exists(
+        assert os.path.exists(seed_log_dir), (
-            seed_log_dir
+            f"No logs found for seed={seed}. No directory found at {seed_log_dir}"
-        ), f"No logs found for seed={seed}. No directory found at {seed_log_dir}"
+        )
        subtasks = None
        if args.dataset == "agieval":
            agieval_subtasks = [

--- a/lm_eval/tasks/spanish_bench/flores_es/create_yamls_flores_es.py
+++ b/lm_eval/tasks/spanish_bench/flores_es/create_yamls_flores_es.py
@@ -258,7 +258,7 @@ def doc_to_text(src: str, tgt: str) -> str:
    src_name, tgt_name = map(code_to_language_name, [src, tgt])
    return f"""\
-{src_name} sentence: {jinja_var('sentence_' + src)}
+{src_name} sentence: {jinja_var("sentence_" + src)}
 {tgt_name} sentence:"""

--- a/lm_eval/tasks/squadv2/task.py
+++ b/lm_eval/tasks/squadv2/task.py
@@ -58,9 +58,9 @@ class SQuAD2(ConfigurableTask):
        super().__init__(config={"metadata": {"version": self.VERSION}})
    # HF changed squad on us so we have to make sure we aren't running the old one
-    assert version.parse(datasets.__version__) >= version.parse(
+    assert version.parse(datasets.__version__) >= version.parse("1.11.0"), (
-        "1.11.0"
+        "datasets v1.11.0 or later required for SQuAD"
-    ), "datasets v1.11.0 or later required for SQuAD"
+    )
    def has_training_docs(self):
        return True

--- a/lm_eval/tasks/tmlu/default/_generate_configs.py
+++ b/lm_eval/tasks/tmlu/default/_generate_configs.py
@@ -14,7 +14,8 @@ categories = {
    "STEM": [
        "biology",
        "chemistry",
-        "mathematics" "physics",
+        "mathematics",
+        "physics",
        "earth science",
    ],
    "humanities": ["Chinese", "history", "Tour", "law"],

--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
@@ -48,9 +48,9 @@ def escaped_split(text, sep_char, maxsplit=-1):
    is not specified or less than 0, then there is no limit on the
    number of splits (all possible splits are made).
    """
-    assert (
+    assert len(sep_char) == 1, (
-        len(sep_char) == 1
+        "separation string must be a single character for escaped splitting"
-    ), "separation string must be a single character for escaped splitting"
+    )
    if maxsplit == 0:
        return text

--- a/scripts/model_comparator.py
+++ b/scripts/model_comparator.py
@@ -17,7 +17,7 @@ eval_logger = utils.eval_logger
 def memory_stats():
    eval_logger.info(
-        f"Memory allocated: {torch.cuda.memory_allocated() / 1024 ** 2}, reserved: {torch.cuda.memory_reserved() // 1024 ** 2}"
+        f"Memory allocated: {torch.cuda.memory_allocated() / 1024**2}, reserved: {torch.cuda.memory_reserved() // 1024**2}"
    )

--- a/scripts/zeno_visualize.py
+++ b/scripts/zeno_visualize.py
@@ -66,9 +66,9 @@ def main():
                f"All models must have the same tasks. {model} has tasks: {model_tasks} but have already recorded tasks: {old_tasks}. Taking intersection {tasks}"
            )
-    assert (
+    assert len(tasks) > 0, (
-        len(tasks) > 0
+        "Must provide at least one task in common amongst models to compare."
-    ), "Must provide at least one task in common amongst models to compare."
+    )
    for task in tasks:
        # Upload data for all models

--- a/tests/test_tasks.py
+++ b/tests/test_tasks.py
@@ -87,7 +87,9 @@ class TestNewTasks:
                    (x[-1].isspace() is False if len(x) > 0 else True)
                    if target_delimiter.isspace()
                    else True
-                ), "doc_to_text ends in a whitespace and target delimiter also a whitespace"
+                ), (
+                    "doc_to_text ends in a whitespace and target delimiter also a whitespace"
+                )
        else:
            pass