Merge branch 'main' into tasklist

# Conflicts: # pyproject.toml

Merge branch 'main' into tasklist
# Conflicts: # pyproject.toml
b58e5556 · Baber · 6e1866f5 · 4f8195f1 · b58e5556 · b58e5556
Commit b58e5556 authored Jul 27, 2025 by Baber
20 changed files
--- a/lm_eval/tasks/egymmlu/egymmlu_physics.yaml
+++ b/lm_eval/tasks/egymmlu/egymmlu_physics.yaml
+"dataset_name": "physics"
+"include": "_default_egymmlu_template_yaml"
+"tag":
+- "egymmlu_stem_tasks"
+- "egymmlu_ar_mmlu_tasks"
+"task": "egymmlu_physics"
+"task_alias": "physics"
--- a/lm_eval/tasks/egymmlu/egymmlu_political_science.yaml
+++ b/lm_eval/tasks/egymmlu/egymmlu_political_science.yaml
+"dataset_name": "political_science"
+"include": "_default_egymmlu_template_yaml"
+"tag":
+- "egymmlu_social_sciences_tasks"
+- "egymmlu_ar_mmlu_tasks"
+"task": "egymmlu_political_science"
+"task_alias": "political science"
--- a/lm_eval/tasks/egymmlu/egymmlu_professional_law.yaml
+++ b/lm_eval/tasks/egymmlu/egymmlu_professional_law.yaml
+"dataset_name": "professional_law"
+"include": "_default_egymmlu_template_yaml"
+"tag":
+- "egymmlu_humanities_tasks"
+- "egymmlu_mmlu_tasks"
+"task": "egymmlu_professional_law"
+"task_alias": "professional law"
--- a/lm_eval/tasks/egymmlu/egymmlu_professional_psychology.yaml
+++ b/lm_eval/tasks/egymmlu/egymmlu_professional_psychology.yaml
+"dataset_name": "professional_psychology"
+"include": "_default_egymmlu_template_yaml"
+"tag":
+- "egymmlu_social_sciences_tasks"
+- "egymmlu_mmlu_tasks"
+"task": "egymmlu_professional_psychology"
+"task_alias": "professional psychology"
--- a/lm_eval/tasks/egymmlu/egymmlu_public_relations.yaml
+++ b/lm_eval/tasks/egymmlu/egymmlu_public_relations.yaml
+"dataset_name": "public_relations"
+"include": "_default_egymmlu_template_yaml"
+"tag":
+- "egymmlu_social_sciences_tasks"
+- "egymmlu_mmlu_tasks"
+"task": "egymmlu_public_relations"
+"task_alias": "public relations"
--- a/lm_eval/tasks/egymmlu/egymmlu_security_studies.yaml
+++ b/lm_eval/tasks/egymmlu/egymmlu_security_studies.yaml
+"dataset_name": "security_studies"
+"include": "_default_egymmlu_template_yaml"
+"tag":
+- "egymmlu_social_sciences_tasks"
+- "egymmlu_mmlu_tasks"
+"task": "egymmlu_security_studies"
+"task_alias": "security studies"
--- a/lm_eval/tasks/egymmlu/egymmlu_social_science.yaml
+++ b/lm_eval/tasks/egymmlu/egymmlu_social_science.yaml
+"dataset_name": "social_science"
+"include": "_default_egymmlu_template_yaml"
+"tag":
+- "egymmlu_social_sciences_tasks"
+- "egymmlu_ar_mmlu_tasks"
+"task": "egymmlu_social_science"
+"task_alias": "social science"
--- a/lm_eval/tasks/egymmlu/egymmlu_sociology.yaml
+++ b/lm_eval/tasks/egymmlu/egymmlu_sociology.yaml
+"dataset_name": "sociology"
+"include": "_default_egymmlu_template_yaml"
+"tag":
+- "egymmlu_social_sciences_tasks"
+- "egymmlu_mmlu_tasks"
+"task": "egymmlu_sociology"
+"task_alias": "sociology"
--- a/lm_eval/tasks/egymmlu/egymmlu_world_religions.yaml
+++ b/lm_eval/tasks/egymmlu/egymmlu_world_religions.yaml
+"dataset_name": "world_religions"
+"include": "_default_egymmlu_template_yaml"
+"tag":
+- "egymmlu_humanities_tasks"
+- "egymmlu_mmlu_tasks"
+"task": "egymmlu_world_religions"
+"task_alias": "world religions"
--- a/lm_eval/tasks/egymmlu/utils.py
+++ b/lm_eval/tasks/egymmlu/utils.py
+# Prompt template with three positional "{}" slots. doc_to_text fills them,
+# in order, with the subject, the question (optionally preceded by its
+# context) and the newline-joined labelled options.
+# NOTE(review): the Arabic text appears to read roughly "This is a
+# multiple-choice question (with its answer) on {} ... The answer:" —
+# confirm the wording with a native speaker before relying on this gloss.
+PROMPT = "ده سؤال متعدد الاختيار (مع إجابته) على {}\n\n{}\n{}\n الجواب:"
+# Option labels, looked up by choice position. doc_to_text uses the full
+# label (letter plus period); doc_to_choice uses only the first character.
+# Only five labels are defined, so a sixth choice would raise IndexError.
+alpha = ["A.", "B.", "C.", "D.", "E."]
+def doc_to_text(doc):
+    """Render one document as the prompt string.
+
+    Reads the ``egy_subject``, ``context``, ``question`` and ``choices``
+    entries of ``doc``. When ``context`` is not the empty string it is
+    placed before the question, separated by a blank line. Every choice is
+    prefixed with the ``alpha`` label at the same position, the labelled
+    choices are joined with newlines, and the subject, question and options
+    are substituted into ``PROMPT``.
+
+    Raises ``IndexError`` if ``doc["choices"]`` has more entries than
+    ``alpha`` has labels.
+    """
+    topic = doc["egy_subject"]  # subject_egyptian
+    if doc["context"] == "":
+        stem = doc["question"]
+    else:
+        stem = f"{doc['context']}\n\n{doc['question']}"
+    labelled = [
+        f"{alpha[idx]} {choice}" for idx, choice in enumerate(doc["choices"])
+    ]
+    return PROMPT.format(topic, stem, "\n".join(labelled))
+def doc_to_choice(doc):
+    """Return one answer letter per entry in ``doc["choices"]``.
+
+    Each letter is the first character of the ``alpha`` label at the same
+    position (the label without its trailing period). Raises ``IndexError``
+    if there are more choices than labels in ``alpha``.
+    """
+    return [alpha[idx][0] for idx, _ in enumerate(doc["choices"])]
--- a/lm_eval/tasks/galician_bench/flores_gl/_flores_common_yaml
+++ b/lm_eval/tasks/galician_bench/flores_gl/_flores_common_yaml
@@ -23,5 +23,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/glue/cola/default.yaml
+++ b/lm_eval/tasks/glue/cola/default.yaml
 tag: glue
 task: cola
-dataset_path: glue
+dataset_path: nyu-mll/glue
 dataset_name: cola
 output_type: multiple_choice
 training_split: train

--- a/lm_eval/tasks/glue/mnli/default.yaml
+++ b/lm_eval/tasks/glue/mnli/default.yaml
 tag: glue
 task: mnli
-dataset_path: glue
+dataset_path: nyu-mll/glue
 dataset_name: mnli
 output_type: multiple_choice
 training_split: train

--- a/lm_eval/tasks/glue/mrpc/default.yaml
+++ b/lm_eval/tasks/glue/mrpc/default.yaml
 tag: glue
 task: mrpc
-dataset_path: glue
+dataset_path: nyu-mll/glue
 dataset_name: mrpc
 output_type: multiple_choice
 training_split: train

--- a/lm_eval/tasks/glue/qnli/default.yaml
+++ b/lm_eval/tasks/glue/qnli/default.yaml
 tag: glue
 task: qnli
-dataset_path: glue
+dataset_path: nyu-mll/glue
 dataset_name: qnli
 output_type: multiple_choice
 training_split: train

--- a/lm_eval/tasks/glue/qqp/default.yaml
+++ b/lm_eval/tasks/glue/qqp/default.yaml
 tag: glue
 task: qqp
-dataset_path: glue
+dataset_path: nyu-mll/glue
 dataset_name: qqp
 output_type: multiple_choice
 training_split: train

--- a/lm_eval/tasks/glue/rte/default.yaml
+++ b/lm_eval/tasks/glue/rte/default.yaml
 tag: glue
 task: rte
-dataset_path: glue
+dataset_path: nyu-mll/glue
 dataset_name: rte
 output_type: multiple_choice
 training_split: train

--- a/lm_eval/tasks/glue/sst2/default.yaml
+++ b/lm_eval/tasks/glue/sst2/default.yaml
 tag: glue
 task: sst2
-dataset_path: glue
+dataset_path: nyu-mll/glue
 dataset_name: sst2
 output_type: multiple_choice
 training_split: train

--- a/lm_eval/tasks/glue/wnli/default.yaml
+++ b/lm_eval/tasks/glue/wnli/default.yaml
 tag: glue
 task: wnli
-dataset_path: glue
+dataset_path: nyu-mll/glue
 dataset_name: wnli
 output_type: multiple_choice
 training_split: train

--- a/lm_eval/tasks/groundcocoa/groundcocoa.yaml
+++ b/lm_eval/tasks/groundcocoa/groundcocoa.yaml
@@ -14,5 +14,4 @@ metric_list:
    aggregation: mean
    higher_is_better: true
 dataset_kwargs:
-  trust_remote_code: true
  streaming: true