diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 002a3a8734a05947dacbf1cbeabc6f925adc9287..b9a448642dbb5ef70339a324a6b58641942b2506 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -79,36 +79,36 @@ jobs: path: | test_logs/* - testmodels: - name: External LM Tests - runs-on: ubuntu-latest - timeout-minutes: 30 - steps: - - name: Checkout Code - uses: actions/checkout@v4 - - name: Set up Python 3.9 - uses: actions/setup-python@v5 - with: - python-version: 3.9 - cache: pip - cache-dependency-path: pyproject.toml - - # Cache HuggingFace cache directory for External LM tests - - name: Cache HuggingFace cache (External LM tests) - uses: actions/cache@v3 - id: cache-hf-lm - with: - path: ~/.cache/huggingface - key: ${{ runner.os }}-hf-cache-external-lm - restore-keys: | - ${{ runner.os }}-hf-cache-external-lm - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -e '.[dev,optimum,deepsparse,sparseml,api]' --extra-index-url https://download.pytorch.org/whl/cpu - pip install -U transformers peft accelerate - - - name: Test with pytest - run: python -m pytest tests/models --showlocals -s -vv - continue-on-error: true # Continue workflow even if tests fail +# testmodels: +# name: External LM Tests +# runs-on: ubuntu-latest +# timeout-minutes: 30 +# steps: +# - name: Checkout Code +# uses: actions/checkout@v4 +# - name: Set up Python 3.9 +# uses: actions/setup-python@v5 +# with: +# python-version: 3.9 +# cache: pip +# cache-dependency-path: pyproject.toml +# +# # Cache HuggingFace cache directory for External LM tests +# - name: Cache HuggingFace cache (External LM tests) +# uses: actions/cache@v3 +# id: cache-hf-lm +# with: +# path: ~/.cache/huggingface +# key: ${{ runner.os }}-hf-cache-external-lm +# restore-keys: | +# ${{ runner.os }}-hf-cache-external-lm +# +# - name: Install dependencies +# run: | +# python -m pip install --upgrade pip +# pip install -e 
'.[dev,optimum,deepsparse,sparseml,api]' --extra-index-url https://download.pytorch.org/whl/cpu +# pip install -U transformers peft accelerate +# +# - name: Test with pytest +# run: python -m pytest tests/models --showlocals -s -vv +# continue-on-error: true # Continue workflow even if tests fail diff --git a/.gitignore b/.gitignore index 56bb8038644e6e3f745cf99da87282922882c239..9ae167be97686d8e332e469e3d84708879860091 100644 --- a/.gitignore +++ b/.gitignore @@ -1,25 +1,47 @@ -env -*.pyc -output/ -data/ -lm_cache -.idea -build -dist -*.egg-info -venv +# macOS system files +.DS_Store + +# Virtual environments .venv/ +venv/ +ENV/ +env/ +*.env + +# Python bytecode and build artifacts +__pycache__/ +*.py[cod] +*.so +*.egg-info/ +build/ +dist/ + +# IDE & editor settings .vscode/ -temp -__pycache__ -.ipynb_checkpoints -temp -test_logs/ -# IPython +.idea/ + +# Jupyter +.ipynb_checkpoints/ profile_default/ ipython_config.py -# don't track (the default location of) the cached requests + +# Output and data +output/ +data/ +temp/ +test_logs/ + +# Caching lm_eval/caching/.cache -# don't track files created by wandb -wandb -examples/wandb +lm_cache/ + +# Logging +*.log +logs/ + +# wandb experiment tracking +wandb/ +examples/wandb/ + +# PyInstaller +*.spec diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1aecc758adaf13372960a13ed42861e002b9dbbb..af3f9f086976f5caf5046e623e9cf2d9f2785057 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,7 +29,7 @@ repos: - id: mixed-line-ending args: [--fix=lf] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.0 + rev: v0.11.10 hooks: # Run the linter. 
- id: ruff @@ -50,7 +50,7 @@ repos: rev: v0.9.29 hooks: - id: pymarkdown - exclude: ^lm_eval/tasks/ + exclude: ^(lm_eval/tasks/.*|docs/footguns\.md)$ args: [fix, -r] # - repo: https://github.com/pre-commit/mirrors-mypy # rev: v1.5.1 diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..93f181def4d46b013259fae732c03ce172127da8 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +recursive-include tests diff --git a/README.md b/README.md index b08165065d744ae59131e65d89f793bf8da73688..f325ae478dad8bba17e8a35d0ec940834e545f63 100644 --- a/README.md +++ b/README.md @@ -614,7 +614,7 @@ Extras dependencies can be installed via `pip install -e ".[NAME]"` ```text @misc{eval-harness, author = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy}, - title = {A framework for few-shot language model evaluation}, + title = {The Language Model Evaluation Harness}, month = 07, year = 2024, publisher = {Zenodo}, diff --git a/docs/footguns.md b/docs/footguns.md new file mode 100644 index 0000000000000000000000000000000000000000..3343c764e79d8d8023594a74b67366cdd825b450 --- /dev/null +++ b/docs/footguns.md @@ -0,0 +1,58 @@ +# Common Pitfalls and Troubleshooting Guide + +This document highlights common pitfalls and troubleshooting tips when using this library. We'll continue to add more tips as we discover them. + +## YAML Configuration Issues + +### Newline Characters in YAML (`\n`) + +**Problem:** When specifying newline characters in YAML, they may be interpreted incorrectly depending on how you format them. 
+ +```yaml +# ❌ WRONG: Single quotes don't process escape sequences +generation_kwargs: + until: ['\n'] # Gets parsed as the literal characters '\' and 'n' i.e "\\n" + +``` +```yaml +# ✅ RIGHT: Use double quotes for escape sequences +generation_kwargs: + until: ["\n"] # Gets parsed as an actual newline character + +``` + +**Solutions:** +- Use double quotes for strings containing escape sequences +- For multiline content, use YAML's block scalars (`|` or `>`) +- When generating YAML programmatically, be careful with how template engines handle escape sequences + +### Quoting in YAML + +**When to use different types of quotes:** + +- **No quotes**: Simple values (numbers, booleans, alphanumeric strings without special characters) + ```yaml + simple_value: plain text + number: 42 + + ``` + +- **Single quotes (')**: + - Preserves literal values + - Use when you need special characters to be treated literally + - Escape single quotes by doubling them: `'It''s working'` + ```yaml + literal_string: 'The newline character \n is not processed here' + path: 'C:\Users\name' # Backslashes preserved + + ``` + +- **Double quotes (")**: + - Processes escape sequences like `\n`, `\t`, etc. 
+ - Use for strings that need special characters interpreted + - Escape double quotes with backslash: `"He said \"Hello\""` + ```yaml + processed_string: "First line\nSecond line" # Creates actual newline + unicode: "Copyright symbol: \u00A9" # Unicode character + + ``` diff --git a/examples/lm-eval-overview.ipynb b/examples/lm-eval-overview.ipynb index 3a06e96ec6d6ad06570be8703a4297a94cc0c347..2a9053c1b501a7423af970f46bcba454a1bc56a4 100644 --- a/examples/lm-eval-overview.ipynb +++ b/examples/lm-eval-overview.ipynb @@ -79,48 +79,48 @@ " Switched to a new branch 'big-refactor'\n", " Branch 'big-refactor' set up to track remote branch 'big-refactor' from 'origin'.\n", " Resolved https://github.com/EleutherAI/lm-evaluation-harness.git to commit 42f486ee49b65926a444cb0620870a39a5b4b0a8\n", - " Installing build dependencies ... \u001B[?25l\u001B[?25hdone\n", - " Getting requirements to build wheel ... \u001B[?25l\u001B[?25hdone\n", - " Preparing metadata (pyproject.toml) ... \u001B[?25l\u001B[?25hdone\n", + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", "Collecting accelerate>=0.21.0 (from lm-eval==1.0.0)\n", " Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)\n", - "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m261.4/261.4 kB\u001B[0m \u001B[31m4.1 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", - "\u001B[?25hCollecting evaluate (from lm-eval==1.0.0)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m261.4/261.4 kB\u001b[0m \u001b[31m4.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting evaluate (from lm-eval==1.0.0)\n", " Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)\n", - "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m84.1/84.1 kB\u001B[0m \u001B[31m5.9 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", - "\u001B[?25hCollecting datasets>=2.0.0 (from lm-eval==1.0.0)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.1/84.1 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting datasets>=2.0.0 (from lm-eval==1.0.0)\n", " Downloading datasets-2.15.0-py3-none-any.whl (521 kB)\n", - "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m521.2/521.2 kB\u001B[0m \u001B[31m9.5 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", - "\u001B[?25hCollecting jsonlines (from lm-eval==1.0.0)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m521.2/521.2 kB\u001b[0m \u001b[31m9.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting jsonlines (from lm-eval==1.0.0)\n", " Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)\n", "Requirement already satisfied: numexpr in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (2.8.7)\n", "Collecting peft>=0.2.0 (from lm-eval==1.0.0)\n", " Downloading peft-0.6.2-py3-none-any.whl (174 kB)\n", - "\u001B[2K 
\u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m174.7/174.7 kB\u001B[0m \u001B[31m7.2 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", - "\u001B[?25hCollecting pybind11>=2.6.2 (from lm-eval==1.0.0)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m174.7/174.7 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting pybind11>=2.6.2 (from lm-eval==1.0.0)\n", " Downloading pybind11-2.11.1-py3-none-any.whl (227 kB)\n", - "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m227.7/227.7 kB\u001B[0m \u001B[31m12.9 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", - "\u001B[?25hCollecting pytablewriter (from lm-eval==1.0.0)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m227.7/227.7 kB\u001b[0m \u001b[31m12.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting pytablewriter (from lm-eval==1.0.0)\n", " Downloading pytablewriter-1.2.0-py3-none-any.whl (111 kB)\n", - "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m111.1/111.1 kB\u001B[0m \u001B[31m8.3 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", - "\u001B[?25hCollecting rouge-score>=0.0.4 (from lm-eval==1.0.0)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m111.1/111.1 kB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting rouge-score>=0.0.4 (from lm-eval==1.0.0)\n", " Downloading rouge_score-0.1.2.tar.gz (17 kB)\n", - " Preparing metadata (setup.py) ... \u001B[?25l\u001B[?25hdone\n", + " Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", "Collecting sacrebleu>=1.5.0 (from lm-eval==1.0.0)\n", " Downloading sacrebleu-2.3.2-py3-none-any.whl (119 kB)\n", - "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m119.7/119.7 kB\u001B[0m \u001B[31m8.7 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", - "\u001B[?25hRequirement already satisfied: scikit-learn>=0.24.1 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (1.2.2)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m119.7/119.7 kB\u001b[0m \u001b[31m8.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: scikit-learn>=0.24.1 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (1.2.2)\n", "Collecting sqlitedict (from lm-eval==1.0.0)\n", " Downloading sqlitedict-2.1.0.tar.gz (21 kB)\n", - " Preparing metadata (setup.py) ... \u001B[?25l\u001B[?25hdone\n", + " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", "Requirement already satisfied: torch>=1.8 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (2.1.0+cu118)\n", "Collecting tqdm-multiprocess (from lm-eval==1.0.0)\n", " Downloading tqdm_multiprocess-0.0.11-py3-none-any.whl (9.8 kB)\n", "Requirement already satisfied: transformers>=4.1 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (4.35.2)\n", "Collecting zstandard (from lm-eval==1.0.0)\n", " Downloading zstandard-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)\n", - "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m5.4/5.4 MB\u001B[0m \u001B[31m29.2 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", - "\u001B[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (1.23.5)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.4/5.4 MB\u001b[0m \u001b[31m29.2 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (1.23.5)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (23.2)\n", "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (5.9.5)\n", "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (6.0.1)\n", @@ -130,15 +130,15 @@ " Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n", "Collecting dill<0.3.8,>=0.3.0 (from datasets>=2.0.0->lm-eval==1.0.0)\n", " Downloading dill-0.3.7-py3-none-any.whl (115 kB)\n", - "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m115.3/115.3 kB\u001B[0m \u001B[31m14.4 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", - "\u001B[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (1.5.3)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m14.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (1.5.3)\n", "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (2.31.0)\n", "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (4.66.1)\n", "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (3.4.1)\n", "Collecting multiprocess (from datasets>=2.0.0->lm-eval==1.0.0)\n", " Downloading multiprocess-0.70.15-py310-none-any.whl (134 
kB)\n", - "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m134.8/134.8 kB\u001B[0m \u001B[31m19.9 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n", - "\u001B[?25hRequirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (2023.6.0)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m19.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (2023.6.0)\n", "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (3.8.6)\n", "Collecting responses<0.19 (from evaluate->lm-eval==1.0.0)\n", " Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n", @@ -193,13 +193,13 @@ "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->rouge-score>=0.0.4->lm-eval==1.0.0) (8.1.7)\n", "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.8->lm-eval==1.0.0) (1.3.0)\n", "Building wheels for collected packages: lm-eval, rouge-score, sqlitedict\n", - " Building wheel for lm-eval (pyproject.toml) ... \u001B[?25l\u001B[?25hdone\n", + " Building wheel for lm-eval (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for lm-eval: filename=lm_eval-1.0.0-py3-none-any.whl size=994254 sha256=88356155b19f2891981ecef948326ad6ce8ca40a6009378410ec20d0e225995a\n", " Stored in directory: /tmp/pip-ephem-wheel-cache-9v6ye7h3/wheels/17/01/26/599c0779e9858a70a73fa8a306699b5b9a868f820c225457b0\n", - " Building wheel for rouge-score (setup.py) ... \u001B[?25l\u001B[?25hdone\n", + " Building wheel for rouge-score (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", " Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=6bb0d44e4881972c43ce194e7cb65233d309758cb15f0dec54590d3d2efcfc36\n", " Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4\n", - " Building wheel for sqlitedict (setup.py) ... \u001B[?25l\u001B[?25hdone\n", + " Building wheel for sqlitedict (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for sqlitedict: filename=sqlitedict-2.1.0-py3-none-any.whl size=16863 sha256=5747f7dd73ddf3d8fbcebf51b5e4f718fabe1e94bccdf16d2f22a2e65ee7fdf4\n", " Stored in directory: /root/.cache/pip/wheels/79/d6/e7/304e0e6cb2221022c26d8161f7c23cd4f259a9e41e8bbcfabd\n", "Successfully built lm-eval rouge-score sqlitedict\n", @@ -886,348 +886,352 @@ }, "widgets": { "application/vnd.jupyter.widget-state+json": { - "46f521b73fd943c081c648fd873ebc0a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "48763b6233374554ae76035c0483066f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "4986a21eb560448fa79f4b25cde48951": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - 
"_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "6b2d90209ec14230b3d58a74ac9b83bf": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - 
"margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "7c5689bc13684db8a22681f41863dddd": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a1d3a8aa016544a78e8821c8f6199e06": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": 
"HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_f61ed33fad754146bdd2ac9db1ba1c48", - "IPY_MODEL_bfa0af6aeff344c6845e1080a878e92e", - "IPY_MODEL_fd1ad9e0367d4004aae853b91c3a7617" - ], - "layout": "IPY_MODEL_6b2d90209ec14230b3d58a74ac9b83bf" - } - }, - "a73f357065d34d7baf0453ae4a8d75e2": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "aed3acd2f2d74003b44079c333a0698e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - 
"bfa0af6aeff344c6845e1080a878e92e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_7c5689bc13684db8a22681f41863dddd", - "max": 5669, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_48763b6233374554ae76035c0483066f", - "value": 5669 - } - }, - "f61ed33fad754146bdd2ac9db1ba1c48": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_a73f357065d34d7baf0453ae4a8d75e2", - "placeholder": "​", - "style": "IPY_MODEL_46f521b73fd943c081c648fd873ebc0a", - "value": "Downloading builder script: 100%" + "state": { + "46f521b73fd943c081c648fd873ebc0a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "48763b6233374554ae76035c0483066f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", 
+ "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "4986a21eb560448fa79f4b25cde48951": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6b2d90209ec14230b3d58a74ac9b83bf": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": 
"LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7c5689bc13684db8a22681f41863dddd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, 
+ "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a1d3a8aa016544a78e8821c8f6199e06": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_f61ed33fad754146bdd2ac9db1ba1c48", + "IPY_MODEL_bfa0af6aeff344c6845e1080a878e92e", + "IPY_MODEL_fd1ad9e0367d4004aae853b91c3a7617" + ], + "layout": "IPY_MODEL_6b2d90209ec14230b3d58a74ac9b83bf" + } + }, + "a73f357065d34d7baf0453ae4a8d75e2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, 
+ "width": null + } + }, + "aed3acd2f2d74003b44079c333a0698e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bfa0af6aeff344c6845e1080a878e92e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7c5689bc13684db8a22681f41863dddd", + "max": 5669, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_48763b6233374554ae76035c0483066f", + "value": 5669 + } + }, + "f61ed33fad754146bdd2ac9db1ba1c48": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a73f357065d34d7baf0453ae4a8d75e2", + "placeholder": "​", + "style": "IPY_MODEL_46f521b73fd943c081c648fd873ebc0a", + "value": "Downloading builder script: 100%" + } + }, + "fd1ad9e0367d4004aae853b91c3a7617": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4986a21eb560448fa79f4b25cde48951", + "placeholder": "​", + "style": "IPY_MODEL_aed3acd2f2d74003b44079c333a0698e", + "value": " 5.67k/5.67k [00:00<00:00, 205kB/s]" + } } }, - "fd1ad9e0367d4004aae853b91c3a7617": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_4986a21eb560448fa79f4b25cde48951", - "placeholder": "​", - "style": "IPY_MODEL_aed3acd2f2d74003b44079c333a0698e", - "value": " 5.67k/5.67k [00:00<00:00, 205kB/s]" - } - } + "version_major": 2, + "version_minor": 0 } } }, diff --git a/lm_eval/__init__.py b/lm_eval/__init__.py index fece9162482d018c2969a9e67603096d0ad21713..c50ad3edd2cb1dea76048d416624a7c7db7c3209 100644 --- a/lm_eval/__init__.py +++ b/lm_eval/__init__.py @@ -4,4 +4,4 @@ import os from .evaluator import evaluate, simple_evaluate -__version__ = "0.4.8" +__version__ = "0.4.9" diff --git a/lm_eval/__main__.py b/lm_eval/__main__.py index baf68e7de80349b59f11f95cefb77a2c92e4368c..283f66e0f661ca5363f5d93f2519fe2714d45efc 100644 --- a/lm_eval/__main__.py +++ b/lm_eval/__main__.py @@ -164,7 +164,7 @@ def setup_parser() -> argparse.ArgumentParser: type=str, action=TrackExplicitAction, metavar="DIR|DIR/file.json", - help="The path 
to the output file where the result metrics will be saved. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.", + help="Path where result metrics will be saved. Can be either a directory or a .json file. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.", ) parser.add_argument( "--limit", diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index c1bc967af28fb28a4e6397cd85e373386bcb6866..ad334b48b0ed80575326a09f04298a296838842d 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -1481,7 +1481,10 @@ class ConfigurableTask(Task): # here mutual info refers to calculating # log(P(choice|ctx) / P(choice)) = log(P(choice|ctx)) - log(P(choice)) # in other words normalizing by subtracting the unconditional logprob of each choice. - aux_arguments = [("", f"{choice}") for choice in choices] + # TODO: should these be strided? will have to modify the processing in process_results if so + aux_arguments = [ + ("", f"{target_delimiter}{choice}") for choice in choices + ] arguments.extend(aux_arguments) @@ -1580,11 +1583,12 @@ class ConfigurableTask(Task): ): # then we are doing mutual info. 
# this stores the "dryrun" / unconditional answer loglikelihoods - lls_unconditional = lls[1::2] + # as we extend the args list with unconditional ("", continuation) pairs + lls_unconditional = lls[len(choices) :] if len(lls_unconditional) != len(choices): raise ValueError # and this stores our "regular" conditional loglikelihoods - lls = lls[::2] + lls = lls[: len(choices)] pred = np.argmax(lls) pred_norm = np.argmax(lls / completion_len) diff --git a/lm_eval/filters/extraction.py b/lm_eval/filters/extraction.py index 9c8d796b6099d89fb6b6e5b2e17444cfa66f1b06..22ca883a9d00b2156c6aedc5df7448879a03da65 100644 --- a/lm_eval/filters/extraction.py +++ b/lm_eval/filters/extraction.py @@ -54,6 +54,51 @@ class RegexFilter(Filter): return filtered filtered_resps = list(map(lambda x: filter_set(x), resps)) + return filtered_resps + + +@register_filter("regex_pos") +class POSFilter(Filter): + """ """ + + def __init__( + self, + regex_pattern: str = r"\['(.*?)'\]", + group_select=0, + fallback=None, + ) -> None: + """ + pass a string `regex` to run `re.compile(r"regex")` on. + `fallback` defines the output returned if no matches for the regex are located. 
+ """ + if fallback is None: + fallback = ["invalid"] + self.regex_pattern = regex_pattern + self.regex = re.compile(regex_pattern) + self.group_select = group_select + self.fallback = fallback + + def apply(self, resps, docs): + def extract_tagged_tokens(text): + # Extract tagged tokens list from text input using regex + tokens = re.findall(r"\('([^']*)', '([^']*)'\)", text) + return [(token, pos) for token, pos in tokens] + + def extract_pos_tags(result): + pos_tags = [] + if isinstance(result, str): + result = extract_tagged_tokens(result) + pos_tags.extend(pos for _, pos in result) + return pos_tags if pos_tags else self.fallback + + def filter_set(inst): + filtered = [] + for resp in inst: + match = extract_pos_tags(resp) + filtered.append(match) + return filtered + + filtered_resps = map(lambda x: filter_set(x), resps) return filtered_resps diff --git a/lm_eval/filters/transformation.py b/lm_eval/filters/transformation.py index 1a3592b6dd4811dcef39ff090dfa42e926613b5c..722c67403c8adbc499283a611df17eb1743307b8 100644 --- a/lm_eval/filters/transformation.py +++ b/lm_eval/filters/transformation.py @@ -1,3 +1,5 @@ +import re + from lm_eval.api.filter import Filter from lm_eval.api.registry import register_filter @@ -54,3 +56,67 @@ class MapFilter(Filter): return [self.mapping_dict.get(resp, self.default_value) for resp in inst] return [filter_set(resp) for resp in resps] + + +@register_filter("format_span") +class SPANFilter(Filter): + def __init__(self) -> None: + pass + + def apply(self, resps, docs): + def format_ner_text(text): + label_dict = { + "person": "PER", + "location": "LOC", + "organization": "ORG", + "counties": "LOC", + "places": "LOC", + "people": "PER", + "persons": "PER", + "company": "ORG", + "country": "LOC", + "continent": "LOC", + "time": "DATE", + "date": "DATE", + "per": "PER", + "loc": "LOC", + "org": "ORG", + } + text = text.lower() + for key, value in label_dict.items(): + text = text.replace(key, value) + + text = "$".join(i for i in 
text.split("$$")) + return text.rstrip("$$") + + def format_named_entities(text): + """ + Extract named entities from text and format them as 'label: value $$ label: value'. + Handles grouped entities (e.g., LOC: kenya, uganda) and excludes 'none' values. + """ + # Regular expression to match label: entities pattern + pattern = r"\b(PER|LOC|ORG|DATE):\s*([^$]+)" + # Normalize newline characters + text = text.replace("\n", "$").strip() + matches = re.findall(pattern, text) + + formatted_entities = [] + + for label, values in matches: + # Split multiple entities separated by commas and strip whitespace + entities = [value.strip() for value in values.split(",")] + + # Exclude 'none' entities + for entity in entities: + if entity.lower() != "none": + formatted_entities.append(f"{label.lower()}: {entity}") + + # Join entities with the desired separator + return " $ ".join(formatted_entities) + + def filter_set(inst): + return [ + format_named_entities(format_ner_text(resp.lower())) for resp in inst + ] + + return [filter_set(resp) for resp in resps] diff --git a/lm_eval/loggers/evaluation_tracker.py b/lm_eval/loggers/evaluation_tracker.py index ef56965d2ad19162811767f50cb4372abe0096b3..634a62577439e89d7df23f93511f44e327e7f38e 100644 --- a/lm_eval/loggers/evaluation_tracker.py +++ b/lm_eval/loggers/evaluation_tracker.py @@ -229,11 +229,21 @@ class EvaluationTracker: ) path = Path(self.output_path if self.output_path else Path.cwd()) - path = path.joinpath(self.general_config_tracker.model_name_sanitized) - path.mkdir(parents=True, exist_ok=True) - self.date_id = datetime.now().isoformat().replace(":", "-") - file_results_aggregated = path.joinpath(f"results_{self.date_id}.json") + if path.suffix == ".json": + path.parent.mkdir(parents=True, exist_ok=True) + file_results_aggregated = path.with_name( + f"{path.stem}_{self.date_id}.json" + ) + else: + path = path.joinpath( + self.general_config_tracker.model_name_sanitized + ) + path.mkdir(parents=True, exist_ok=True) + 
file_results_aggregated = path.joinpath( + f"results_{self.date_id}.json" + ) + file_results_aggregated.open("w", encoding="utf-8").write(dumped) if self.api and self.push_results_to_hub: @@ -250,12 +260,10 @@ class EvaluationTracker: ) self.api.upload_file( repo_id=repo_id, - path_or_fileobj=str( - path.joinpath(f"results_{self.date_id}.json") - ), + path_or_fileobj=str(file_results_aggregated), path_in_repo=os.path.join( self.general_config_tracker.model_name, - f"results_{self.date_id}.json", + file_results_aggregated.name, ), repo_type="dataset", commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}", @@ -290,7 +298,12 @@ class EvaluationTracker: eval_logger.info(f"Saving per-sample results for: {task_name}") path = Path(self.output_path if self.output_path else Path.cwd()) - path = path.joinpath(self.general_config_tracker.model_name_sanitized) + if path.suffix == ".json": + path = path.parent + else: + path = path.joinpath( + self.general_config_tracker.model_name_sanitized + ) path.mkdir(parents=True, exist_ok=True) file_results_samples = path.joinpath( diff --git a/lm_eval/models/__init__.py b/lm_eval/models/__init__.py index cf5515591cf85534f0670fa27d174c1c49f039d5..8582f0198821166f67155d29d6a358a4869cdb5a 100644 --- a/lm_eval/models/__init__.py +++ b/lm_eval/models/__init__.py @@ -16,6 +16,7 @@ from . 
import ( optimum_ipex, optimum_lm, sglang_causallms, + sglang_generate_API, textsynth, vllm_causallms, vllm_vlms, diff --git a/lm_eval/models/api_models.py b/lm_eval/models/api_models.py index 0142af94d930bcf9642e5d9c00b17409a30e51d3..23d122033ef2e8c7aaecc2bc29d9174612eb1a4c 100644 --- a/lm_eval/models/api_models.py +++ b/lm_eval/models/api_models.py @@ -6,6 +6,7 @@ import json import logging from functools import cached_property from typing import ( + TYPE_CHECKING, Any, Awaitable, Callable, @@ -30,7 +31,9 @@ except ModuleNotFoundError: pass +import base64 from importlib.util import find_spec +from io import BytesIO from lm_eval import utils from lm_eval.api.instance import Instance @@ -38,6 +41,10 @@ from lm_eval.api.model import TemplateLM from lm_eval.models.utils import Collator, chunks, configure_pad_token +if TYPE_CHECKING: + from PIL import Image + + eval_logger = logging.getLogger(__name__) LogLikelihoodInputs = Tuple[Tuple[str, str], List[int], List[int]] @@ -51,7 +58,52 @@ class JsonChatStr(NamedTuple): return self.prompt.encode(encoding) +def create_image_prompt( + imgs: list["Image.Image"], chat: dict, fmt: str = "PNG" +) -> dict: + """ + + Parameters + ---------- + img : list[PIL.Image.Image] + The list of images to encode to base64 + chat : dict + fmt : str, optional + Any format Pillow understands (e.g. "PNG", "JPEG"). + Defaults to "PNG". + + Returns + ------- + dict + """ + images = [] + for img in imgs: + buf = BytesIO() + img.save(buf, format=fmt) + img_b64 = base64.b64encode(buf.getvalue()).decode("utf-8") + img_dict = { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{img_b64}", "detail": "auto"}, + } + images.append(img_dict) + + # chat is in format of list[dict["role": "user"/"system", "content": str, "type": "text"],...] 
+ # with images, we need "content" to be a list of dicts with "type" and "text"/"image_url" + # currently we do not support few-shots so only one user message + # text content also has placeholders, which apparently is not necessary for API class (confirm) + + if isinstance(chat[-1]["content"], list): + chat[-1]["content"] = images + chat[-1]["content"] + else: + text_content = {"type": "text", "text": chat[-1]["content"]} + chat[-1]["content"] = images + [text_content] + chat[-1].pop("type") + return chat + + class TemplateAPI(TemplateLM): + MULTIMODAL = True + def __init__( self, model: str = None, @@ -83,6 +135,7 @@ class TemplateAPI(TemplateLM): eos_string: str = None, # timeout in seconds timeout: int = 300, + max_images: int = 1, **kwargs, ) -> None: super().__init__() @@ -129,6 +182,7 @@ class TemplateAPI(TemplateLM): self.verify_certificate = verify_certificate self._eos_string = eos_string self.timeout = int(timeout) + self.max_images = int(max_images) eval_logger.info(f"Using tokenizer {self.tokenizer_backend}") if self.tokenizer_backend is None: @@ -265,7 +319,12 @@ class TemplateAPI(TemplateLM): ) else: # bit of a hack. We'll load back before sending to the API - return JsonChatStr(json.dumps(chat_history, ensure_ascii=False)) + return JsonChatStr( + json.dumps( + [{**item, "type": "text"} for item in chat_history], + ensure_ascii=False, + ) + ) @cached_property def eot_token_id(self) -> Optional[int]: @@ -578,7 +637,28 @@ class TemplateAPI(TemplateLM): return -len(_requests[0]) # Let the API deal with tokenization - requests, all_gen_kwargs = zip(*(req.args for req in requests)) + if len(requests[0].args) > 2: + assert self.tokenizer is None, ( + "tokenizer is not supported for multimodal requests yet!" + ) + eval_logger.info( + f"Using max_images {self.max_images}. Set in the model args." 
+ ) + requests, all_gen_kwargs, auxiliary_args = zip( + *(req.args for req in requests) + ) + requests = tuple( + JsonChatStr( + json.dumps( + create_image_prompt( + y["visual"][: self.max_images], json.loads(x.prompt) + ) + ) + ) + for x, y in zip(requests, auxiliary_args) + ) + else: + requests, all_gen_kwargs = zip(*(req.args for req in requests)) if self.tokenized_requests: encodings_list = self.tok_encode( requests, add_special_tokens=self.add_bos_token @@ -597,6 +677,10 @@ class TemplateAPI(TemplateLM): chunked = re_ord.get_batched( n=self._batch_size if self._concurrent <= 1 else 0, batch_fn=None ) + if not self.tokenized_requests: + eval_logger.info( + "Tokenized requests are disabled. Context + generation length is not checked." + ) if self._concurrent <= 1: pbar = tqdm(desc="Requesting API", total=len(requests)) for chunk in chunked: @@ -615,10 +699,7 @@ class TemplateAPI(TemplateLM): eval_logger.warning( f"Some contexts exceeded (max length: ({self.max_length}) - max_gen_toks: ({max_gen_toks}). They were left truncated." ) - else: - eval_logger.info( - "Tokenized requests are disabled. Context + generation length is not checked." - ) + req = encodings_list if self.tokenized_requests else contexts outputs = retry( stop=stop_after_attempt(self.max_retries), @@ -664,10 +745,7 @@ class TemplateAPI(TemplateLM): eval_logger.warning( f"Some contexts exceeded (max length: ({self.max_length}) - max_gen_toks ({max_gen_toks}). They were left truncated." ) - else: - eval_logger.info( - "Tokenized requests are disabled. Context + generation length is not checked." 
- ) + req = encodings_list if self.tokenized_requests else contexts results = itertools.chain.from_iterable( asyncio.run( diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py index 073c0ba04e5d9416fbb889410391d439a2f08f27..8e5144a9714175cf4d270895dbf91a4075e0f17a 100644 --- a/lm_eval/models/hf_vlms.py +++ b/lm_eval/models/hf_vlms.py @@ -17,6 +17,7 @@ from lm_eval.models.utils import ( handle_stop_sequences, pad_and_concat, replace_placeholders, + resize_image, stop_sequences_criteria, ) @@ -45,10 +46,23 @@ class HFMultimodalLM(HFLM): # TODO: handle whitespace in image placeholder (replacement) max_images: Optional[int] = 999, convert_img_format=False, + # For image resizing min_pixels: Optional[int] = None, max_pixels: Optional[int] = None, + image_width: Optional[int] = None, + image_height: Optional[int] = None, + image_max_side: Optional[int] = None, **kwargs, ): + self.image_width = image_width + self.image_height = image_height + self.image_max_side = image_max_side + if self.image_max_side and (self.image_width or self.image_height): + raise ValueError( + "Ambiguous config for image resize: you can not specify both " + "image_max_side and (image_width or image_height)" + ) + # init pixels before calling tokenizer creation to avoid errors self.pixels = ({"min_pixels": min_pixels} if min_pixels else {}) | ( {"max_pixels": max_pixels} if max_pixels else {} @@ -385,6 +399,9 @@ class HFMultimodalLM(HFLM): return batched_imgs def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]: + if requests and len(requests[0].args) < 3: + # Fall back to non-multimodal generation. + return super().loglikelihood_rolling(requests=requests) raise NotImplementedError( "model type `hf-multimodal` does not support loglikelihood_rolling. 
Use 'hf' model type for text-only loglikelihood_rolling tasks ", "this is because we do not support measuring the loglikelihood a model assigns to an image.", @@ -393,6 +410,9 @@ class HFMultimodalLM(HFLM): def loglikelihood( self, requests: List[Instance], disable_tqdm: bool = False ) -> List[Tuple[float, bool]]: + if requests and len(requests[0].args) < 3: + # Fall back to non-multimodal generation. + return super().loglikelihood(requests=requests, disable_tqdm=disable_tqdm) raise NotImplementedError( "'loglikelihood' requests for model type `hf-multimodal` are not yet tested. This feature will be enabled when a loglikelihood-based multiple-choice VQA dataset is added!" ) @@ -419,9 +439,11 @@ class HFMultimodalLM(HFLM): ) ) - return self._loglikelihood_tokens(new_reqs, disable_tqdm=disable_tqdm) + return self._multimodal_loglikelihood_tokens( + new_reqs, disable_tqdm=disable_tqdm + ) - def _loglikelihood_tokens( + def _multimodal_loglikelihood_tokens( self, requests: List[ Tuple[Tuple[None, str, str], List[int], List[int], List[int]] @@ -610,7 +632,10 @@ class HFMultimodalLM(HFLM): def generate_until( self, requests: List[Instance], disable_tqdm: bool = False ) -> List[str]: - # TODO: back out to HFLM.generate_until() for all requests without aux_arguments (text-only reqs) + if requests and len(requests[0].args) < 3: + # Fall back to non-multimodal generation. 
+ return super().generate_until(requests=requests, disable_tqdm=disable_tqdm) + res = [] def _collate(x): @@ -646,7 +671,15 @@ class HFMultimodalLM(HFLM): for chunk in chunks: contexts, all_gen_kwargs, aux_arguments = zip(*chunk) - visuals = [arg["visual"] for arg in aux_arguments] + visuals = [ + [ + resize_image( + img, self.image_width, self.image_height, self.image_max_side + ) + for img in arg["visual"] + ] + for arg in aux_arguments + ] if not isinstance(contexts, list): contexts = list( diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py index 492923f06e670a3eb841c6f0f9c0fefa6591be85..a6231570ad9021237ac04b3a581a00d11b8bf69d 100644 --- a/lm_eval/models/huggingface.py +++ b/lm_eval/models/huggingface.py @@ -890,7 +890,10 @@ class HFLM(TemplateLM): input_ids=inps, attention_mask=attn_mask, labels=labels ).logits else: - assert self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM + assert self.AUTO_MODEL_CLASS in ( + transformers.AutoModelForCausalLM, + transformers.AutoModelForVision2Seq, + ) return self.model(inps).logits def _model_generate(self, context, max_length, stop, **generation_kwargs): @@ -1136,7 +1139,7 @@ class HFLM(TemplateLM): if self.backend == "causal": total_length = len(context_enc) + len(continuation_enc) if total_length > self.max_length + 1: - eval_logger.warn( + eval_logger.warning( f"Combined length of context ({len(context_enc)}) and continuation ({len(continuation_enc)}) " f"exceeds model's maximum length ({self.max_length}). " f"Truncating {total_length - self.max_length + 1} tokens from the left." @@ -1247,7 +1250,12 @@ class HFLM(TemplateLM): cont_toks = torch.tensor( cont_toks, dtype=torch.long, device=self.device ).unsqueeze(0) # [1, seq] - max_equal = (greedy_tokens == cont_toks).all() + # Use trailing slice [-cont_toks.shape[1]:] to handle variable length cont_len (but same ctx+cont[:-1]). + # i.e. continuations can be sliced at diff points. 
Collator ensures we have sufficient greedy_tokens + # by choosing key with longest cont if group_by="contexts". + max_equal = ( + greedy_tokens[:, -cont_toks.shape[1] :] == cont_toks + ).all() # Obtain log-probs at the corresponding continuation token indices # last_token_slice = logits[:, -1, :].squeeze(0).tolist() diff --git a/lm_eval/models/sglang_generate_API.py b/lm_eval/models/sglang_generate_API.py new file mode 100644 index 0000000000000000000000000000000000000000..2b6582a3b4b4a7cb25cc9140f48b8aa4e24d44db --- /dev/null +++ b/lm_eval/models/sglang_generate_API.py @@ -0,0 +1,100 @@ +from typing import Dict, List, Optional, Tuple, Union + +from lm_eval.api.registry import register_model +from lm_eval.models.openai_completions import LocalCompletionsAPI +from lm_eval.models.utils import handle_stop_sequences + + +@register_model("sglang-generate") +class SGLANGGENERATEAPI(LocalCompletionsAPI): + def __init__( + self, + base_url=None, + tokenizer_backend="huggingface", + **kwargs, + ): + super().__init__( + base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs + ) + + def _create_payload( + self, + messages: Union[List[List[int]], List[dict], List[str], str], + generate=False, + gen_kwargs: Optional[dict] = None, + seed: int = 1234, + eos=None, + **kwargs, + ) -> dict: + is_string = ( + True + if (isinstance(messages, str) or isinstance(messages[0], str)) + else False + ) + if generate: + gen_kwargs.pop("do_sample", False) + if "max_tokens" in gen_kwargs: + max_tokens = gen_kwargs.pop("max_tokens") + else: + max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks) + temperature = gen_kwargs.pop("temperature", 0) + stop = handle_stop_sequences(gen_kwargs.pop("until", None), eos) + request = { + "sampling_params": { + "max_new_tokens": max_tokens, + "temperature": temperature, + "stop": stop, + **gen_kwargs, + }, + } + request.update({"text": messages}) if is_string else request.update( + {"input_ids": messages} + ) + return request + else: + 
assert not is_string, "Logprobs are only supported for tokenized inputs" + request = { + "input_ids": messages, + "sampling_params": {"max_new_tokens": 1, "temperature": 0}, + "logprob_start_len": 0, + "top_logprobs_num": 1, + "return_logprob": True, + } + return request + + @staticmethod + def parse_logprobs( + outputs: Union[Dict, List[Dict]], + tokens: List[List[int]] = None, + ctxlens: List[int] = None, + **kwargs, + ) -> List[Tuple[float, bool]]: + res = [] + if not isinstance(outputs, list): + outputs = [outputs] + for choice, ctxlen in zip(outputs, ctxlens): + choice = choice["meta_info"] + assert ctxlen > 0, "Context length must be greater than 0" + logprobs = sum(x[0] for x in choice["input_token_logprobs"][ctxlen:]) + is_greedy = all( + x[1] != y[0][1] + for x, y in zip( + choice["input_token_logprobs"][ctxlen:], + choice["input_top_logprobs"][ctxlen:], + ) + ) + res.append((logprobs, is_greedy)) + return res + + @staticmethod + def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]: + res = [] + if not isinstance(outputs, list): + outputs = [outputs] + for out in outputs: + res.append(out["text"]) + return res + + @property + def api_key(self): + return "" diff --git a/lm_eval/models/utils.py b/lm_eval/models/utils.py index 2878de6ea500b1e4f6723c9ef63f7d6b90be5711..e17fa224b22fbbef442c94e13d4f7c237d3c647d 100644 --- a/lm_eval/models/utils.py +++ b/lm_eval/models/utils.py @@ -28,6 +28,7 @@ eval_logger = logging.getLogger(__name__) if TYPE_CHECKING: + from PIL import Image from transformers import PreTrainedTokenizerBase from transformers.configuration_utils import PretrainedConfig @@ -427,9 +428,13 @@ class Collator: batch = self.get_chunks(values, n=n, fn=batch_fn) yield from batch elif self._group_by == "contexts": - # Get one sample from each key + # Get one sample from each key. 
+ # Select longest continuation per group to ensure sufficient context logits values = self._reorder( - [value[0] for value in self._arr_with_indices.values()] + [ + max(value, key=lambda x: len(x[1][-1])) + for value in self._arr_with_indices.values() + ] ) batch = self.get_chunks(values, n=n, fn=batch_fn) yield from batch @@ -729,3 +734,121 @@ def handle_stop_sequences( if eos is not None and eos not in until: until.append(eos) return until + + +def resize_image( + image: "Image.Image", + width: Optional[int] = None, + height: Optional[int] = None, + max_dimension: Optional[int] = None, + keep_aspect_ratio: bool = True, + resample_filter: Union[int, str] = "Image.BICUBIC", + min_width: int = 1, + min_height: int = 1, +) -> "Image.Image": + """ + Resizes a PIL Image object with flexible options. + + Args: + image: The PIL Image object to resize. + width: Target width in pixels. + height: Target height in pixels. + max_dimension: Maximum size for the longer dimension of the image. + keep_aspect_ratio: If True (default) and both width and height are provided, + the image is resized to fit within these dimensions while + maintaining its aspect ratio. If False, the image is stretched + to the exact width and height. + resample_filter: The resampling filter to use for resizing. + Defaults to Image.BICUBIC. + min_width: Minimum width for the resized image. Defaults to 1. + min_height: Minimum height for the resized image. Defaults to 1. + + Returns: + The resized PIL Image object. If no resize parameters are provided + or if the image already meets the criteria, the original image is returned. + + Order of precedence for resizing: + 1. If width AND height are provided: + - If keep_aspect_ratio is True: Fits image within bounds, preserving aspect ratio. + - If keep_aspect_ratio is False: Resizes to exact dimensions (may distort). + 2. Else if only width is provided: Calculates height proportionally. + 3. Else if only height is provided: Calculates width proportionally. 
+ 4. Else if max_dimension is provided: Resizes the longest side to max_dimension + and scales the other side proportionally. + 5. If none of the above are provided, returns the original image. + """ + original_width, original_height = image.size + + # If no arguments are provided, return the original image + if width is None and height is None and max_dimension is None: + return image + + new_width = original_width + new_height = original_height + + if width is not None and height is not None: + # No resize needed if image is already smaller than target dimensions + if original_width <= width and original_height <= height: + return image + + if keep_aspect_ratio: + # Calculate the ratio to fit within the target dimensions + ratio = min(width / original_width, height / original_height) + new_width = int(original_width * ratio) + new_height = int(original_height * ratio) + else: + # Stretch to exact dimensions + new_width = width + new_height = height + elif width is not None: + # No resize needed if width is already smaller + if original_width <= width: + return image + # Calculate height proportionally + new_width = width + new_height = int((original_height / original_width) * new_width) + elif height is not None: + # No resize needed if height is already smaller + if original_height <= height: + return image + # Calculate width proportionally + new_height = height + new_width = int((original_width / original_height) * new_height) + elif max_dimension is not None: + # No resize needed if both dimensions are smaller than max_dimension + if max(original_height, original_width) <= max_dimension: + return image + + if original_width > original_height: + # Width is the longer side + new_width = max_dimension + new_height = int((original_height / original_width) * new_width) + else: + # Height is the longer side or sides are equal + new_height = max_dimension + new_width = int((original_width / original_height) * new_height) + + # Ensure dimensions are at least minimum 
values + new_width = max(min_width, new_width) + new_height = max(min_height, new_height) + + # Perform the resize operation with the calculated dimensions + return image.resize((new_width, new_height), resample_filter) + + +def truncate_tokens( + tokens: List[int], + max_length: int, + tokenizer: "PreTrainedTokenizerBase", + strategy: str = "left", +): + if strategy == "left": + return tokens[-max_length:] + elif strategy == "right": + return tokens[:max_length] + elif strategy == "middle": + # Truncate the middle of the sequence + left_length = max_length // 2 + right_length = max_length - left_length + return tokens[:left_length] + tokens[-right_length:] + return None diff --git a/lm_eval/models/vllm_causallms.py b/lm_eval/models/vllm_causallms.py index 866039fdb79ee0a9e436d45c1d8234b3e228204b..474f9bbe8dcddda60749ee2b0fa17bee537a1073 100644 --- a/lm_eval/models/vllm_causallms.py +++ b/lm_eval/models/vllm_causallms.py @@ -1,7 +1,13 @@ import copy +import gc +import inspect import logging +import os from importlib.metadata import version from importlib.util import find_spec +from multiprocessing import Process, Queue +from queue import Empty +from time import sleep from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union from more_itertools import distribute @@ -28,6 +34,7 @@ try: from vllm import LLM, SamplingParams from vllm.lora.request import LoRARequest from vllm.transformers_utils.tokenizer import get_tokenizer + from vllm.utils import get_open_port if parse_version(version("vllm")) >= parse_version("0.8.3"): from vllm.entrypoints.chat_utils import resolve_hf_chat_template @@ -40,6 +47,63 @@ if TYPE_CHECKING: eval_logger = logging.getLogger(__name__) +def _vllm_mp_worker( + model_args: dict, + sampling_params: "SamplingParams", + requests: list[list[int]], + lora_request: "LoRARequest", + result_queue: "Queue", + dp_size: int, + local_dp_rank: int, + dp_master_port: int, + dp_master_ip: str = "127.0.0.1", +) -> None: + """ + Worker 
process for vLLM multiprocessing. + Initializes a vLLM engine, processes requests, and puts results or errors + onto the result_queue. + """ + + if not requests: + result_queue.put((local_dp_rank, [])) + return None + + os.environ["VLLM_DP_RANK"] = os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank) + os.environ["VLLM_DP_SIZE"] = str(dp_size) + os.environ["VLLM_DP_MASTER_IP"] = str(dp_master_ip) + os.environ["VLLM_DP_MASTER_PORT"] = str(dp_master_port) + + llm = None + try: + llm = LLM(**model_args) + res = llm.generate( + prompt_token_ids=requests, + sampling_params=sampling_params, + lora_request=lora_request, + ) + # Give engines time to pause their processing loops before exiting." + sleep(1) + result_queue.put((local_dp_rank, res)) + + except Exception as e: + error_message = f"Worker {local_dp_rank} failed during generation: {type(e).__name__}: {str(e)}" + eval_logger.error(error_message, exc_info=True) + result_queue.put((local_dp_rank, {"error": error_message})) + + finally: + if llm is not None: + try: + del llm + gc.collect() + except Exception as e_cleanup: + eval_logger.warning( + f"Worker {local_dp_rank} encountered an error during LLM cleanup: {type(e_cleanup).__name__}: {str(e_cleanup)}", + exc_info=True, + ) + + return None + + @register_model("vllm") class VLLM(TemplateLM): _DEFAULT_MAX_LENGTH = 2048 @@ -68,6 +132,7 @@ class VLLM(TemplateLM): device: str = "cuda", data_parallel_size: int = 1, lora_local_path: str = None, + enable_thinking: bool = False, **kwargs, ): super().__init__() @@ -81,7 +146,7 @@ class VLLM(TemplateLM): assert max_length is None or max_model_len is None, ( "Either max_length or max_model_len may be provided, but not both" ) - + self.V1 = os.environ.get("VLLM_USE_V1", "1") != "0" self._max_length = max_model_len if max_model_len is not None else max_length self.tensor_parallel_size = int(tensor_parallel_size) self.data_parallel_size = int(data_parallel_size) @@ -96,9 +161,11 @@ class VLLM(TemplateLM): "trust_remote_code": 
trust_remote_code, "tensor_parallel_size": int(tensor_parallel_size), "max_model_len": int(self._max_length) if self._max_length else None, + "max_num_seqs": kwargs.get("max_num_seqs", max_batch_size), "swap_space": int(swap_space), "quantization": quantization, "seed": int(seed), + "device": str(device), } self.model_args.update(kwargs) self.batch_size = ( @@ -112,7 +179,11 @@ class VLLM(TemplateLM): eval_logger.warning( "You might experience occasional issues with model weight downloading when data_parallel is in use. To ensure stable performance, run with data_parallel_size=1 until the weights are downloaded and cached." ) - self.model_args["distributed_executor_backend"] = "ray" + self.model_args["distributed_executor_backend"] = ( + "ray" + if not self.V1 + else self.model_args.get("distributed_executor_backend", None) + ) self.batch_size = "auto" eval_logger.info("Manual batching is not compatible with data parallelism.") @@ -129,6 +200,7 @@ class VLLM(TemplateLM): add_bos_token=add_bos_token, ) self.tokenizer = configure_pad_token(self.tokenizer, model_config=self._config) + self.enable_thinking = enable_thinking self.add_bos_token = add_bos_token if "gemma" in pretrained.lower(): self.add_bos_token = True @@ -137,11 +209,36 @@ class VLLM(TemplateLM): ) if parse_version(version("vllm")) >= parse_version("0.8.3"): + kwargs_resolve_hf_chat_template = { + "tokenizer": self.tokenizer, + "chat_template": None, + "tools": None, + } + + if parse_version(version("vllm")) >= parse_version("0.9.0"): + if self.data_parallel_size <= 1: + kwargs_resolve_hf_chat_template["model_config"] = ( + self.model.llm_engine.model_config + ) + else: + from vllm.engine.arg_utils import EngineArgs + + engine_args = EngineArgs(**self.model_args) + model_config = engine_args.create_model_config() + + kwargs_resolve_hf_chat_template["model_config"] = model_config + + # https://github.com/vllm-project/vllm/pull/18259 + if ( + "trsut_remote_code" + in 
inspect.signature(resolve_hf_chat_template).parameters + ): + kwargs_resolve_hf_chat_template["trsut_remote_code"] = trust_remote_code + else: + kwargs_resolve_hf_chat_template["trust_remote_code"] = trust_remote_code + self.hf_chat_template = resolve_hf_chat_template( - tokenizer=self.tokenizer, - chat_template=None, - tools=None, - trust_remote_code=trust_remote_code, + **kwargs_resolve_hf_chat_template ) else: self.hf_chat_template = None @@ -209,6 +306,7 @@ class VLLM(TemplateLM): add_generation_prompt=add_generation_prompt, continue_final_message=not add_generation_prompt, chat_template=self.hf_chat_template, + enable_thinking=self.enable_thinking, ) return chat_templated @@ -257,7 +355,7 @@ class VLLM(TemplateLM): sampling_params = SamplingParams( temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False ) - if self.data_parallel_size > 1: + if self.data_parallel_size > 1 and not self.V1: # vLLM hangs if resources are set in ray.remote # also seems to only work with decorator and not with ray.remote() fn # see https://github.com/vllm-project/vllm/issues/973 @@ -288,14 +386,83 @@ class VLLM(TemplateLM): ray.shutdown() # flatten results return undistribute(results) + elif self.data_parallel_size > 1: + # based on https://github.com/vllm-project/vllm/blob/a04720bc36401d831cb048c3917b9e58173d9c1d/examples/offline_inference/data_parallel.py + dp_size = self.data_parallel_size + dp_master_ip = os.environ.get("VLLM_DP_MASTER_IP", "127.0.0.1") + dp_master_port = os.environ.get("VLLM_DP_MASTER_PORT") or get_open_port() + + requests = (list(x) for x in distribute(self.data_parallel_size, requests)) + + procs, resq = [], Queue() + # We use Process as it is non-daemonic + try: + for rank, req in enumerate(requests): + proc = Process( + target=_vllm_mp_worker, + args=( + self.model_args.copy(), + sampling_params, + req, + self.lora_request, + resq, + dp_size, + rank, + dp_master_port, + dp_master_ip, + ), + ) + proc.start() + procs.append(proc) + + # Collect 
results + rank_res = {} + while len(rank_res) < len(procs): + try: + rank, result = resq.get(timeout=30) + if isinstance(result, dict) and "error" in result: + raise RuntimeError(result["error"]) + rank_res[rank] = result + except Empty: + dead_procs = [ + idx + for idx, p in enumerate(procs) + if not p.is_alive() and idx not in rank_res + ] + if dead_procs: + raise RuntimeError( + f"Worker processes {dead_procs} died unexpectedly" + ) + continue + + results = [rank_res[i] for i in range(len(procs))] + return undistribute(results) + + # cleanup + finally: + try: + resq.close() + resq.join_thread() + except Exception: + eval_logger.debug( + "Failed to close vllm DP results queue", exc_info=True + ) + for proc in procs: + proc.join(timeout=10) + if proc.is_alive(): + proc.terminate() + proc.join(timeout=5) + if proc.is_alive(): + proc.kill() - outputs = self.model.generate( - prompt_token_ids=requests, - sampling_params=sampling_params, - use_tqdm=True if self.batch_size == "auto" else False, - lora_request=self.lora_request, - ) - return outputs + else: + outputs = self.model.generate( + prompt_token_ids=requests, + sampling_params=sampling_params, + use_tqdm=True if self.batch_size == "auto" else False, + lora_request=self.lora_request, + ) + return outputs def loglikelihood_rolling( self, requests: List[Instance], disable_tqdm: bool = False @@ -427,6 +594,12 @@ class VLLM(TemplateLM): # set the max length in tokens of inputs ("context_enc") # max len for inputs = max length, minus room to generate the max new tokens max_ctx_len = self.max_length - max_gen_toks + all_lengths = [len(x) for x in context_encoding] + for length in all_lengths: + if length > max_ctx_len: + eval_logger.warning( + f"Context length {length} exceeds max length (context + max gen tokens): {max_ctx_len}. Truncating context." 
+ ) context_encoding = [x[-max_ctx_len:] for x in context_encoding] # perform batched generation @@ -441,6 +614,10 @@ class VLLM(TemplateLM): # cache generations for output, context in zip(cont, context): generated_text = output.outputs[0].text + # use secondary stop seqs to cut off should-have-been-stopped content post-hoc + for term in until: + if len(term) > 0: + generated_text = generated_text.split(term)[0] res.append(generated_text) self.cache_hook.add_partial( "generate_until", (context, gen_kwargs), generated_text @@ -477,6 +654,12 @@ class VLLM(TemplateLM): inputs = [] ctxlens = [] for cache_key, context_enc, continuation_enc in chunk: + if ( + full_length := len(context_enc + continuation_enc) + ) > self.max_length: + eval_logger.warning( + f"Context length {full_length} exceeds max length ({self.max_length}). Truncating context." + ) inp = (context_enc + continuation_enc)[-(self.max_length) :] ctxlen = len(context_enc) - max( 0, len(context_enc) + len(continuation_enc) - (self.max_length) diff --git a/lm_eval/models/vllm_vlms.py b/lm_eval/models/vllm_vlms.py index 62c35592535eebea32a793a6f18918cb0205a027..15813b8aa9a480b9bc645666daf6ead130e9cce0 100644 --- a/lm_eval/models/vllm_vlms.py +++ b/lm_eval/models/vllm_vlms.py @@ -12,6 +12,7 @@ from lm_eval.models.utils import ( Collator, handle_stop_sequences, replace_placeholders, + resize_image, undistribute, ) from lm_eval.models.vllm_causallms import VLLM @@ -44,8 +45,20 @@ class VLLM_VLM(VLLM): interleave: bool = True, # TODO: handle max_images and limit_mm_per_prompt better max_images: int = 999, + image_width: Optional[int] = None, + image_height: Optional[int] = None, + image_max_side: Optional[int] = None, **kwargs, ): + self.image_width = image_width + self.image_height = image_height + self.image_max_side = image_max_side + if self.image_max_side and (self.image_width or self.image_height): + raise ValueError( + "Ambiguous config for image resize: you can not specify both " + "image_max_side and 
(image_width or image_height)" + ) + if max_images != 999: kwargs["limit_mm_per_prompt"] = {"image": max_images} eval_logger.info(f"Setting limit_mm_per_prompt[image] to {max_images}") @@ -93,7 +106,7 @@ class VLLM_VLM(VLLM): outputs.append(inputs) return outputs - def _model_generate( + def _multimodal_model_generate( self, requests: List[List[dict]] = None, generate: bool = False, @@ -205,7 +218,10 @@ class VLLM_VLM(VLLM): def generate_until( self, requests: List[Instance], disable_tqdm: bool = False ) -> List[str]: - # TODO: support text-only reqs + if requests and len(requests[0].args) < 3: + # Fall back to non-multimodal generation. + return super().generate_until(requests=requests, disable_tqdm=disable_tqdm) + res = [] def _collate(x): @@ -239,7 +255,15 @@ class VLLM_VLM(VLLM): for chunk in chunks: contexts, all_gen_kwargs, aux_arguments = zip(*chunk) - visuals = [arg["visual"] for arg in aux_arguments] + visuals = [ + [ + resize_image( + img, self.image_width, self.image_height, self.image_max_side + ) + for img in arg["visual"] + ] + for arg in aux_arguments + ] if not isinstance(contexts, list): contexts = list( @@ -272,7 +296,7 @@ class VLLM_VLM(VLLM): left_truncate_len=max_ctx_len, ) - cont = self._model_generate( + cont = self._multimodal_model_generate( inputs, stop=until, generate=True, max_tokens=max_gen_toks, **kwargs ) @@ -288,3 +312,12 @@ class VLLM_VLM(VLLM): pbar.close() return res + + def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]: + if requests and len(requests[0].args) < 3: + # Fall back to non-multimodal generation. + return super().loglikelihood_rolling(requests=requests) + raise NotImplementedError( + "model type `vllm-vlm` does not support loglikelihood_rolling. 
Use 'vlm' model type for text-only loglikelihood_rolling tasks ", + "this is because we do not support measuring the loglikelihood a model assigns to an image.", + ) diff --git a/lm_eval/tasks/README.md b/lm_eval/tasks/README.md index 9b35aea8bd5b037bd44d22269df8509a4f1a9734..6f3ac1750af27d163d50ae72da9c32ff807188b0 100644 --- a/lm_eval/tasks/README.md +++ b/lm_eval/tasks/README.md @@ -5,165 +5,167 @@ For more information, including a full list of task names and their precise meanings or sources, follow the links provided to the individual README.md files for each subfolder. -| Task Family | Description | Language(s) | -|--------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------| -| [aclue](aclue/README.md) | Tasks focusing on ancient Chinese language understanding and cultural aspects. | Ancient Chinese | -| [acp_bench](acpbench/README.md) | Tasks evaluating the reasoning ability about Action, Change, and Planning | English | -| [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic | -| [agieval](agieval/README.md) | Tasks involving historical data or questions related to history and historical texts. | English, Chinese | -| [anli](anli/README.md) | Adversarial natural language inference tasks designed to test model robustness. 
| English | -| [arabic_leaderboard_complete](arabic_leaderboard_complete/README.md) | A full version of the tasks in the Open Arabic LLM Leaderboard, focusing on the evaluation of models that reflect the characteristics of Arabic language understanding and comprehension, culture, and heritage. Note that some of these tasks are machine-translated. | Arabic (Some MT) | -| [arabic_leaderboard_light](arabic_leaderboard_light/README.md) | A light version of the tasks in the Open Arabic LLM Leaderboard (i.e., 10% samples of the test set in the original benchmarks), focusing on the evaluation of models that reflect the characteristics of Arabic language understanding and comprehension, culture, and heritage. Note that some of these tasks are machine-translated. | Arabic (Some MT) | -| [arabicmmlu](arabicmmlu/README.md) | Localized Arabic version of MMLU with multiple-choice questions from 40 subjects. | Arabic | -| [AraDICE](aradice/README.md) | A collection of multiple tasks carefully designed to evaluate dialectal and cultural capabilities in large language models (LLMs). | Arabic | -| [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions. | English | -| [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English | -| [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English | -| [babi](babi/README.md) | Tasks designed as question and answering challenges based on simulated stories. | English | -| [basque_bench](basque_bench/README.md) | Collection of tasks in Basque encompassing various evaluation areas. | Basque | -| [basqueglue](basqueglue/README.md) | Tasks designed to evaluate language understanding in Basque language. | Basque | -| [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. 
| English, German | -| [bbq](bbq/README.md) | A question-answering benchmark designed to measure social biases in language models across various demographic categories and contexts. | English | -| [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) | -| benchmarks | General benchmarking tasks that test a wide range of language understanding capabilities. | | -| [bertaqa](bertaqa/README.md) | Local Basque cultural trivia QA tests in English and Basque languages. | English, Basque, Basque (MT) | -| [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple | -| [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language model's linguistic capabilities. | English | -| [careqa](careqa/README.md) | Multiple choice and open-ended medical question answering based on the Spanish Specialised Healthcare Training (MIR) exams. | English, Spanish | -| [catalan_bench](catalan_bench/README.md) | Collection of tasks in Catalan encompassing various evaluation areas. | Catalan | -| [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese | -| [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese | -| code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby | -| [commonsense_qa](commonsense_qa/README.md) | CommonsenseQA, a multiple-choice QA dataset for measuring commonsense knowledge. | English | -| [copal_id](copal_id/README.md) United States | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian | -| [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. 
| English | -| [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French | -| csatqa | Tasks related to SAT and other standardized testing questions for academic assessment. | Korean | -| [darija_bench](darija_bench/README.md) | Traditional NLP tasks (Translation, Summariation, etc..) for Moroccan Darija | Moroccan Darija (some MT) | -| [darijahellaswag](darijahellaswag/README.md) | Moroccan Darija version of HellaSwag. | Moroccan Darija (MT) | -| [darijammlu](darijammlu/README.md) | Multiple-choice QA in Moroccan Darija (an Arabic dialect). | Moroccan Darija (MT) | -| [drop](drop/README.md) | Tasks requiring numerical reasoning, reading comprehension, and question answering. | English | -| [eq_bench](eq_bench/README.md) | Tasks focused on equality and ethics in question answering and decision-making. | English | -| [eus_exams](eus_exams/README.md) | Tasks based on various professional and academic exams in the Basque language. | Basque | -| [eus_proficiency](eus_proficiency/README.md) | Tasks designed to test proficiency in the Basque language across various topics. | Basque | -| [eus_reading](eus_reading/README.md) | Reading comprehension tasks specifically designed for the Basque language. | Basque | -| [eus_trivia](eus_trivia/README.md) | Trivia and knowledge testing tasks in the Basque language. | Basque | -| [evalita_LLM](evalita_llm/README.md) | A native Italian benchmark with diverse tasks formats and multiple prompts. | Italian | -| [fda](fda/README.md) | Tasks for extracting key-value pairs from FDA documents to test information extraction. | English | -| [fld](fld/README.md) | Tasks involving free-form and directed dialogue understanding. | English | -| [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. 
| French | -| [galician_bench](galician_bench/README.md) | Collection of tasks in Galician encompassing various evaluation areas. | Galician | -| [global_mmlu](global_mmlu/README.md) | Collection of culturally sensitive and culturally agnostic MMLU tasks in 15 languages with human translations or post-edits. | Multiple (15 languages) | -| [glue](glue/README.md) | General Language Understanding Evaluation benchmark to test broad language abilities. | English | -| [gpqa](gpqa/README.md) | Tasks designed for general public question answering and knowledge verification. | English | -| [gsm8k](gsm8k/README.md) | A benchmark of grade school math problems aimed at evaluating reasoning capabilities. | English | -| [groundcocoa](groundcocoa/README.md) | A benchmark evaluating the conditional and compositional reasoning of language models using a grounding task. | English | -| [haerae](haerae/README.md) | Tasks focused on assessing detailed factual and historical knowledge. | Korean | -| [headqa](headqa/README.md) | A high-level education-based question answering dataset to test specialized knowledge. | Spanish, English | -| [hellaswag](hellaswag/README.md) | Tasks to predict the ending of stories or scenarios, testing comprehension and creativity. | English | -| [hendrycks_ethics](hendrycks_ethics/README.md) | Tasks designed to evaluate the ethical reasoning capabilities of models. | English | -| [hendrycks_math](hendrycks_math/README.md) | Mathematical problem-solving tasks to test numerical reasoning and problem-solving. | English | -| [histoires_morales](histoires_morales/README.md) | A dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | French (Some MT) | -| [hrm8k](hrm8k/README.md) | A challenging bilingual math reasoning benchmark for Korean and English. 
| Korean (Some MT), English (Some MT) | -| [humaneval](humaneval/README.md) | Code generation task that measure functional correctness for synthesizing programs from docstrings. | Python | -| [ifeval](ifeval/README.md) | Interactive fiction evaluation tasks for narrative understanding and reasoning. | English | -| [inverse_scaling](inverse_scaling/README.md) | Multiple-choice tasks from the Inverse Scaling Prize, designed to find settings where larger language models perform worse. | English | -| [japanese_leaderboard](japanese_leaderboard/README.md) | Japanese language understanding tasks to benchmark model performance on various linguistic aspects. | Japanese | -| [jsonschema_bench](jsonschema_bench/README.md) | Evaluate the ability of LLMs to generate JSON objects that conform to a given JSON schema, including API, configuration files, and other structured data formats. | JSON | -| [kbl](kbl/README.md) | Korean Benchmark for Legal Language Understanding. | Korean | -| [kmmlu](kmmlu/README.md) | Knowledge-based multi-subject multiple choice questions for academic evaluation. | Korean | -| [kobest](kobest/README.md) | A collection of tasks designed to evaluate understanding in Korean language. | Korean | -| [kormedmcqa](kormedmcqa/README.md) | Medical question answering tasks in Korean to test specialized domain knowledge. | Korean | -| [lambada](lambada/README.md) | Tasks designed to predict the endings of text passages, testing language prediction skills. | English | -| [lambada_cloze](lambada_cloze/README.md) | Cloze-style LAMBADA dataset. | English | -| [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset. This is a legacy version of the multilingual dataset, and users should instead use `lambada_multilingual_stablelm`. | German, English, Spanish, French, Italian | -| [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. 
Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese | -| [leaderboard](leaderboard/README.md) | Task group used by Hugging Face's [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). Those tasks are static and will not change through time | English | -| [lingoly](lingoly/README.md) | Challenging logical reasoning benchmark in low-resource languages with controls for memorization | English, Multilingual | -| [llama3](llama3/README.md) | Evals reproducing those provided by the LLAMA team in the Hugging Face repo (instruct) | English, Multilingual | -| [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese | -| [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese | -| [longbench](longbench/README.md) | LongBench evaluates language models' ability to understand lengthy texts across multiple tasks and languages. | English, Chinese | -| [mastermind](mastermind/README.md) | Reasoning benchmark based on the board game of Mastermind. | English | -| [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English | -| [mbpp](mbpp/README.md) | A benchmark designed to measure the ability to synthesize short Python programs from natural language descriptions. | Python | -| [meddialog](meddialog/README.md) | Medical open-ended QA and Question Entailment stemming from the MedDialog dataset. | English | -| [medtext](medtext/README.md) | Medical open-ended QA from the MedText Clinical Notes dataset. | English | -| [mimic_repsum](mimic_repsum/README.md) | Medical report summarization from the MIMIC-III dataset. 
| English | -| [mc_taco](mc_taco/README.md) | Question-answer pairs that require temporal commonsense comprehension. | English | -| [med_concepts_qa](med_concepts_qa/README.md) | Benchmark for evaluating LLMs on their abilities to interpret medical codes and distinguish between medical concept. | English | -| [metabench](metabench/README.md) | Distilled versions of six popular benchmarks which are highly predictive of overall benchmark performance and of a single general ability latent trait. | English | -| [mediqa_qa2019](mediqa_qa2019/README.md) | Open-ended healthcare question answering benchmark from the MEDIQA 2019 challenge. | English | -| medmcqa | Medical multiple choice questions assessing detailed medical knowledge. | English | -| medqa | Multiple choice question answering based on the United States Medical License Exams. | | -| [meqsum](meqsum/README.md) | Healtcare Question Entailment benchmark from the MeqSum dataset. | | -| [mgsm](mgsm/README.md) | Benchmark of multilingual grade-school math problems. | Spanish, French, German, Russian, Chinese, Japanese, Thai, Swahili, Bengali, Telugu | -| [minerva_math](minerva_math/README.md) | Mathematics-focused tasks requiring numerical reasoning and problem-solving skills. | English | -| [mlqa](mlqa/README.md) | MultiLingual Question Answering benchmark dataset for evaluating cross-lingual question answering performance. | English, Arabic, German, Spanish, Hindi, Vietnamese, Simplified Chinese | -| [mmlu](mmlu/README.md) | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English | -| [mmlu_pro](mmlu_pro/README.md) | A refined set of MMLU, integrating more challenging, reasoning-focused questions and expanding the choice set from four to ten options. | English | -| [mmlu-pro-plus](mmlu-pro-plus/README.md) | A new test set for evaluating shortcut learning and higher-order reasoning of LLMs. 
| English | -| [mmlu_prox](mmlu_prox/README.md) | A multilingual benchmark that extends MMLU-Pro to multiple typologically diverse languages with human validation. | English, Japanese, Chinese, Korean, French, German, Spanish, Portuguese, Swahili, Thai, Arabic, Hindi, Bengali | -| [mmlusr](mmlusr/README.md) | Variation of MMLU designed to be more rigorous. | English | -| model_written_evals | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. | | -| [moral_stories](moral_stories/README.md) | A crowd-sourced dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | English | -| [mts_dialog](mts_dialog/README.md) | Open-ended healthcare QA from the MTS-Dialog dataset. | English | -| [mutual](mutual/README.md) | A retrieval-based dataset for multi-turn dialogue reasoning. | English | -| [nq_open](nq_open/README.md) | Open domain question answering tasks based on the Natural Questions dataset. | English | -| [okapi/arc_multilingual](okapi/arc_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.** | -| [okapi/hellaswag_multilingual](okapi/hellaswag_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (30 languages) **Machine Translated.** | -| okapi/mmlu_multilingual | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (34 languages) **Machine Translated.** | -| [okapi/truthfulqa_multilingual](okapi/truthfulqa_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.** | -| [olaph](olaph/README.md) | Open-ended medical factuality Question Answering from the OLAPH dataset. 
| English | -| [openbookqa](openbookqa/README.md) | Open-book question answering tasks that require external knowledge and reasoning. | English | -| [paloma](paloma/README.md) | Paloma is a comprehensive benchmark designed to evaluate open language models across a wide range of domains, ranging from niche artist communities to mental health forums on Reddit. | English | -| [paws-x](paws-x/README.md) | Paraphrase Adversaries from Word Scrambling, focusing on cross-lingual capabilities. | English, French, Spanish, German, Chinese, Japanese, Korean | -| [pile](pile/README.md) | Open source language modelling data set that consists of 22 smaller, high-quality datasets. | English | -| [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English | -| [piqa](piqa/README.md) | Physical Interaction Question Answering tasks to test physical commonsense reasoning. | English | -| [polemo2](polemo2/README.md) | Sentiment analysis and emotion detection tasks based on Polish language data. | Polish | -| [portuguese_bench](portuguese_bench/README.md) | Collection of tasks in European Portuguese encompassing various evaluation areas. | Portuguese | -| [prost](prost/README.md) | Tasks requiring understanding of professional standards and ethics in various domains. | English | -| [pubmedqa](pubmedqa/README.md) | Question answering tasks based on PubMed research articles for biomedical understanding. | English | -| [qa4mre](qa4mre/README.md) | Question Answering for Machine Reading Evaluation, assessing comprehension and reasoning. | English | -| [qasper](qasper/README.md) | Question Answering dataset based on academic papers, testing in-depth scientific knowledge. | English | -| [race](race/README.md) | Reading comprehension assessment tasks based on English exams in China. | English | -| realtoxicityprompts | Tasks to evaluate language models for generating text with potential toxicity. 
| | -| [ruler](ruler/README.md) | RULER is a benchmark for testing how well language models handle long pieces of text. Requires custom arg (see readme) | English | -| [sciq](sciq/README.md) | Science Question Answering tasks to assess understanding of scientific concepts. | English | -| [score](score/README.md) | Systematic consistency and robustness evaluation for LLMs on 3 datasets(MMLU-Pro, Agi Eval and MATH) | English | -| [scrolls](scrolls/README.md) | Tasks that involve long-form reading comprehension across various domains. | English | -| [simple_cooccurrence_bias](simple_cooccurrence_bias/README.md) | A metric that evaluates language models for biases based on stereotypical word associations and co-occurrences in text. | English | -| [siqa](siqa/README.md) | Social Interaction Question Answering to evaluate common sense and social reasoning. | English | -| [spanish_bench](spanish_bench/README.md) | Collection of tasks in Spanish encompassing various evaluation areas. | Spanish | -| [squad_completion](squad_completion/README.md) | A variant of the SQuAD question answering task designed for zero-shot evaluation of small LMs. | English | -| [squadv2](squadv2/README.md) | Stanford Question Answering Dataset version 2, a reading comprehension benchmark. | English | -| [storycloze](storycloze/README.md) | Tasks to predict story endings, focusing on narrative logic and coherence. | English | -| [super_glue](super_glue/README.md) | A suite of challenging tasks designed to test a range of language understanding skills. | English | -| [swag](swag/README.md) | Situations With Adversarial Generations, predicting the next event in videos. | English | -| [swde](swde/README.md) | Information extraction tasks from semi-structured web pages. | English | -| [tinyBenchmarks](tinyBenchmarks/README.md) | Evaluation of large language models with fewer examples using tiny versions of popular benchmarks. 
| English | -| [tmmluplus](tmmluplus/README.md) | An extended set of tasks under the TMMLU framework for broader academic assessments. | Traditional Chinese | -| [toxigen](toxigen/README.md) | Tasks designed to evaluate language models on their propensity to generate toxic content. | English | -| [translation](translation/README.md) | Tasks focused on evaluating the language translation capabilities of models. | Arabic, English, Spanish, Basque, Hindi, Indonesian, Burmese, Russian, Swahili, Telugu, Chinese | -| [triviaqa](triviaqa/README.md) | A large-scale dataset for trivia question answering to test general knowledge. | English | -| [truthfulqa](truthfulqa/README.md) | A QA task aimed at evaluating the truthfulness and factual accuracy of model responses. | English | -| [turkishmmlu](turkishmmlu/README.md) | A multiple-choice QA test modeled after MMLU, written in Turkish based on Turkish high-school level exams. | Turkish | -| [unitxt](unitxt/README.md) | A number of tasks implemented using the unitxt library for flexible, shareable, and reusable data preparation and evaluation for generative AI. | English | -| [unscramble](unscramble/README.md) | Tasks involving the rearrangement of scrambled sentences to test syntactic understanding. | English | -| [webqs](webqs/README.md) | Web-based question answering tasks designed to evaluate internet search and retrieval. | English | -| [wikitext](wikitext/README.md) | Tasks based on text from Wikipedia articles to assess language modeling and generation. | English | -| [winogender](winogender/README.md) | A diagnostic dataset that tests for gender bias in coreference resolution by measuring how models associate pronouns with different occupations. | English | -| [winogrande](winogrande/README.md) | A large-scale dataset for coreference resolution, inspired by the Winograd Schema Challenge. 
| English | -| [wmdp](wmdp/README.md) | A benchmark with the objective of minimizing performance, based on potentially-sensitive multiple-choice knowledge questions. | English | -| [wmt2016](wmt2016/README.md) | Tasks from the WMT 2016 shared task, focusing on translation between multiple languages. | English, Czech, German, Finnish, Russian, Romanian, Turkish | -| [wsc273](wsc273/README.md) | The Winograd Schema Challenge, a test of commonsense reasoning and coreference resolution. | English | -| [xcopa](xcopa/README.md) | Cross-lingual Choice of Plausible Alternatives, testing reasoning in multiple languages. | Estonian, Haitian, Indonesian, Italian, Quechua, Swahili, Tamil, Thai, Turkish, Vietnamese, Chinese | +| Task Family | Description | Language(s) | +|--------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------| +| [aclue](aclue/README.md) | Tasks focusing on ancient Chinese language understanding and cultural aspects. | Ancient Chinese | +| [acp_bench](acpbench/README.md) | Tasks evaluating the reasoning ability about Action, Change, and Planning | English | +| [acp_bench_hard](acpbench/README.md) | Tasks evaluating the reasoning ability about Action, Change, and Planning | English | +| [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic | +| [agieval](agieval/README.md) | Tasks involving historical data or questions related to history and historical texts. 
| English, Chinese | +| [anli](anli/README.md) | Adversarial natural language inference tasks designed to test model robustness. | English | +| [arabic_leaderboard_complete](arabic_leaderboard_complete/README.md) | A full version of the tasks in the Open Arabic LLM Leaderboard, focusing on the evaluation of models that reflect the characteristics of Arabic language understanding and comprehension, culture, and heritage. Note that some of these tasks are machine-translated. | Arabic (Some MT) | +| [arabic_leaderboard_light](arabic_leaderboard_light/README.md) | A light version of the tasks in the Open Arabic LLM Leaderboard (i.e., 10% samples of the test set in the original benchmarks), focusing on the evaluation of models that reflect the characteristics of Arabic language understanding and comprehension, culture, and heritage. Note that some of these tasks are machine-translated. | Arabic (Some MT) | +| [arabicmmlu](arabicmmlu/README.md) | Localized Arabic version of MMLU with multiple-choice questions from 40 subjects. | Arabic | +| [ArabCulture](arab_culture/README.md) | Benchmark for evaluating models' commonsense cultural knowledge across 13 different Arab Countries. | Arabic | +| [AraDICE](aradice/README.md) | A collection of multiple tasks carefully designed to evaluate dialectal and cultural capabilities in large language models (LLMs). | Arabic | +| [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions. | English | +| [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English | +| [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English | +| [babi](babi/README.md) | Tasks designed as question and answering challenges based on simulated stories. | English | +| [basque_bench](basque_bench/README.md) | Collection of tasks in Basque encompassing various evaluation areas. 
| Basque | +| [basqueglue](basqueglue/README.md) | Tasks designed to evaluate language understanding in Basque language. | Basque | +| [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. | English, German | +| [bbq](bbq/README.md) | A question-answering benchmark designed to measure social biases in language models across various demographic categories and contexts. | English | +| [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) | +| benchmarks | General benchmarking tasks that test a wide range of language understanding capabilities. | | +| [bertaqa](bertaqa/README.md) | Local Basque cultural trivia QA tests in English and Basque languages. | English, Basque, Basque (MT) | +| [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple | +| [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language model's linguistic capabilities. | English | +| [c4](c4/README.md) | Tasks based on a colossal, cleaned version of Common Crawl's web crawl corpus to assess models' language modeling capabilities. | English | +| [careqa](careqa/README.md) | Multiple choice and open-ended medical question answering based on the Spanish Specialised Healthcare Training (MIR) exams. | English, Spanish | +| [catalan_bench](catalan_bench/README.md) | Collection of tasks in Catalan encompassing various evaluation areas. | Catalan | +| [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese | +| [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese | +| code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. 
| Go, Java, JS, PHP, Python, Ruby | +| [commonsense_qa](commonsense_qa/README.md) | CommonsenseQA, a multiple-choice QA dataset for measuring commonsense knowledge. | English | +| [copal_id](copal_id/README.md) | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian | +| [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English | +| [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French | +| csatqa | Tasks related to SAT and other standardized testing questions for academic assessment. | Korean | +| [darija_bench](darija_bench/README.md) | Traditional NLP tasks (Translation, Summarization, etc.) for Moroccan Darija | Moroccan Darija (some MT) | +| [darijahellaswag](darijahellaswag/README.md) | Moroccan Darija version of HellaSwag. | Moroccan Darija (MT) | +| [darijammlu](darijammlu/README.md) | Multiple-choice QA in Moroccan Darija (an Arabic dialect). | Moroccan Darija (MT) | +| [drop](drop/README.md) | Tasks requiring numerical reasoning, reading comprehension, and question answering. | English | +| [eq_bench](eq_bench/README.md) | Tasks focused on equality and ethics in question answering and decision-making. | English | +| [eus_exams](eus_exams/README.md) | Tasks based on various professional and academic exams in the Basque language. | Basque | +| [eus_proficiency](eus_proficiency/README.md) | Tasks designed to test proficiency in the Basque language across various topics. | Basque | +| [eus_reading](eus_reading/README.md) | Reading comprehension tasks specifically designed for the Basque language. | Basque | +| [eus_trivia](eus_trivia/README.md) | Trivia and knowledge testing tasks in the Basque language. | Basque | +| [evalita_LLM](evalita_llm/README.md) | A native Italian benchmark with diverse task formats and multiple prompts. 
| Italian | +| [fda](fda/README.md) | Tasks for extracting key-value pairs from FDA documents to test information extraction. | English | +| [fld](fld/README.md) | Tasks involving free-form and directed dialogue understanding. | English | +| [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. | French | +| [galician_bench](galician_bench/README.md) | Collection of tasks in Galician encompassing various evaluation areas. | Galician | +| [global_mmlu](global_mmlu/README.md) | Collection of culturally sensitive and culturally agnostic MMLU tasks in 15 languages with human translations or post-edits. | Multiple (15 languages) | +| [glue](glue/README.md) | General Language Understanding Evaluation benchmark to test broad language abilities. | English | +| [gpqa](gpqa/README.md) | Tasks designed for general public question answering and knowledge verification. | English | +| [gsm8k](gsm8k/README.md) | A benchmark of grade school math problems aimed at evaluating reasoning capabilities. | English | +| [groundcocoa](groundcocoa/README.md) | A benchmark evaluating the conditional and compositional reasoning of language models using a grounding task. | English | +| [haerae](haerae/README.md) | Tasks focused on assessing detailed factual and historical knowledge. | Korean | +| [headqa](headqa/README.md) | A high-level education-based question answering dataset to test specialized knowledge. | Spanish, English | +| [hellaswag](hellaswag/README.md) | Tasks to predict the ending of stories or scenarios, testing comprehension and creativity. | English | +| [hendrycks_ethics](hendrycks_ethics/README.md) | Tasks designed to evaluate the ethical reasoning capabilities of models. | English | +| [hendrycks_math](hendrycks_math/README.md) | Mathematical problem-solving tasks to test numerical reasoning and problem-solving. 
| English | +| [histoires_morales](histoires_morales/README.md) | A dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | French (Some MT) | +| [hrm8k](hrm8k/README.md) | A challenging bilingual math reasoning benchmark for Korean and English. | Korean (Some MT), English (Some MT) | +| [humaneval](humaneval/README.md) | Code generation task that measure functional correctness for synthesizing programs from docstrings. | Python | +| [ifeval](ifeval/README.md) | Interactive fiction evaluation tasks for narrative understanding and reasoning. | English | +| [inverse_scaling](inverse_scaling/README.md) | Multiple-choice tasks from the Inverse Scaling Prize, designed to find settings where larger language models perform worse. | English | +| [japanese_leaderboard](japanese_leaderboard/README.md) | Japanese language understanding tasks to benchmark model performance on various linguistic aspects. | Japanese | +| [jsonschema_bench](jsonschema_bench/README.md) | Evaluate the ability of LLMs to generate JSON objects that conform to a given JSON schema, including API, configuration files, and other structured data formats. | JSON | +| [kbl](kbl/README.md) | Korean Benchmark for Legal Language Understanding. | Korean | +| [kmmlu](kmmlu/README.md) | Knowledge-based multi-subject multiple choice questions for academic evaluation. | Korean | +| [kobest](kobest/README.md) | A collection of tasks designed to evaluate understanding in Korean language. | Korean | +| [kormedmcqa](kormedmcqa/README.md) | Medical question answering tasks in Korean to test specialized domain knowledge. | Korean | +| [lambada](lambada/README.md) | Tasks designed to predict the endings of text passages, testing language prediction skills. | English | +| [lambada_cloze](lambada_cloze/README.md) | Cloze-style LAMBADA dataset. 
| English | +| [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset. This is a legacy version of the multilingual dataset, and users should instead use `lambada_multilingual_stablelm`. | German, English, Spanish, French, Italian | +| [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese | +| [leaderboard](leaderboard/README.md) | Task group used by Hugging Face's [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). Those tasks are static and will not change through time | English | +| [lingoly](lingoly/README.md) | Challenging logical reasoning benchmark in low-resource languages with controls for memorization | English, Multilingual | +| [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese | +| [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese | +| [mastermind](mastermind/README.md) | Reasoning benchmark based on the board game of Mastermind. | English | +| [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English | +| [mbpp](mbpp/README.md) | A benchmark designed to measure the ability to synthesize short Python programs from natural language descriptions. | Python | +| [meddialog](meddialog/README.md) | Medical open-ended QA and Question Entailment stemming from the MedDialog dataset. | English | +| [medtext](medtext/README.md) | Medical open-ended QA from the MedText Clinical Notes dataset. | English | +| [mimic_repsum](mimic_repsum/README.md) | Medical report summarization from the MIMIC-III dataset. 
| English | +| [mc_taco](mc_taco/README.md) | Question-answer pairs that require temporal commonsense comprehension. | English | +| [med_concepts_qa](med_concepts_qa/README.md) | Benchmark for evaluating LLMs on their abilities to interpret medical codes and distinguish between medical concepts. | English | +| [metabench](metabench/README.md) | Distilled versions of six popular benchmarks which are highly predictive of overall benchmark performance and of a single general ability latent trait. | English | +| [mediqa_qa2019](mediqa_qa2019/README.md) | Open-ended healthcare question answering benchmark from the MEDIQA 2019 challenge. | English | +| medmcqa | Medical multiple choice questions assessing detailed medical knowledge. | English | +| medqa | Multiple choice question answering based on the United States Medical License Exams. | | +| [meqsum](meqsum/README.md) | Healthcare Question Entailment benchmark from the MeqSum dataset. | | +| [mgsm](mgsm/README.md) | Benchmark of multilingual grade-school math problems. | Spanish, French, German, Russian, Chinese, Japanese, Thai, Swahili, Bengali, Telugu | +| [minerva_math](minerva_math/README.md) | Mathematics-focused tasks requiring numerical reasoning and problem-solving skills. | English | +| [mlqa](mlqa/README.md) | MultiLingual Question Answering benchmark dataset for evaluating cross-lingual question answering performance. | English, Arabic, German, Spanish, Hindi, Vietnamese, Simplified Chinese | +| [mmlu](mmlu/README.md) | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English | +| [mmlu_pro](mmlu_pro/README.md) | A refined set of MMLU, integrating more challenging, reasoning-focused questions and expanding the choice set from four to ten options. | English | +| [mmlu-pro-plus](mmlu-pro-plus/README.md) | A new test set for evaluating shortcut learning and higher-order reasoning of LLMs. 
| English | +| [mmlu_prox](mmlu_prox/README.md) | A multilingual benchmark that extends MMLU-Pro to multiple typologically diverse languages with human validation. | English, Japanese, Chinese, Korean, French, German, Spanish, Portuguese, Swahili, Thai, Arabic, Hindi, Bengali | +| [mmlusr](mmlusr/README.md) | Variation of MMLU designed to be more rigorous. | English | +| model_written_evals | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. | | +| [moral_stories](moral_stories/README.md) | A crowd-sourced dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | English | +| [mts_dialog](mts_dialog/README.md) | Open-ended healthcare QA from the MTS-Dialog dataset. | English | +| [mutual](mutual/README.md) | A retrieval-based dataset for multi-turn dialogue reasoning. | English | +| [noreval](noreval/README.md) | A human-created Norwegian language understanding and generation benchmark. | Norwegian (Bokmål and Nynorsk) | +| [nq_open](nq_open/README.md) | Open domain question answering tasks based on the Natural Questions dataset. | English | +| [okapi/arc_multilingual](okapi/arc_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.** | +| [okapi/hellaswag_multilingual](okapi/hellaswag_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (30 languages) **Machine Translated.** | +| okapi/mmlu_multilingual | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (34 languages) **Machine Translated.** | +| [okapi/truthfulqa_multilingual](okapi/truthfulqa_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. 
| Multiple (31 languages) **Machine Translated.** | +| [olaph](olaph/README.md) | Open-ended medical factuality Question Answering from the OLAPH dataset. | English | +| [openbookqa](openbookqa/README.md) | Open-book question answering tasks that require external knowledge and reasoning. | English | +| [paloma](paloma/README.md) | Paloma is a comprehensive benchmark designed to evaluate open language models across a wide range of domains, ranging from niche artist communities to mental health forums on Reddit. | English | +| [paws-x](paws-x/README.md) | Paraphrase Adversaries from Word Scrambling, focusing on cross-lingual capabilities. | English, French, Spanish, German, Chinese, Japanese, Korean | +| [pile](pile/README.md) | Open source language modelling data set that consists of 22 smaller, high-quality datasets. | English | +| [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English | +| [piqa](piqa/README.md) | Physical Interaction Question Answering tasks to test physical commonsense reasoning. | English | +| [polemo2](polemo2/README.md) | Sentiment analysis and emotion detection tasks based on Polish language data. | Polish | +| [portuguese_bench](portuguese_bench/README.md) | Collection of tasks in European Portuguese encompassing various evaluation areas. | Portuguese | +| [prost](prost/README.md) | Tasks requiring understanding of professional standards and ethics in various domains. | English | +| [pubmedqa](pubmedqa/README.md) | Question answering tasks based on PubMed research articles for biomedical understanding. | English | +| [qa4mre](qa4mre/README.md) | Question Answering for Machine Reading Evaluation, assessing comprehension and reasoning. | English | +| [qasper](qasper/README.md) | Question Answering dataset based on academic papers, testing in-depth scientific knowledge. 
| English | +| [race](race/README.md) | Reading comprehension assessment tasks based on English exams in China. | English | +| realtoxicityprompts | Tasks to evaluate language models for generating text with potential toxicity. | | +| [ruler](ruler/README.md) | RULER is a benchmark for testing how well language models handle long pieces of text. Requires custom arg (see readme) | English | +| [sciq](sciq/README.md) | Science Question Answering tasks to assess understanding of scientific concepts. | English | +| [score](score/README.md) | Systematic consistency and robustness evaluation for LLMs on 3 datasets (MMLU-Pro, AGIEval and MATH) | English | +| [scrolls](scrolls/README.md) | Tasks that involve long-form reading comprehension across various domains. | English | +| [simple_cooccurrence_bias](simple_cooccurrence_bias/README.md) | A metric that evaluates language models for biases based on stereotypical word associations and co-occurrences in text. | English | +| [siqa](siqa/README.md) | Social Interaction Question Answering to evaluate common sense and social reasoning. | English | +| [spanish_bench](spanish_bench/README.md) | Collection of tasks in Spanish encompassing various evaluation areas. | Spanish | +| [squad_completion](squad_completion/README.md) | A variant of the SQuAD question answering task designed for zero-shot evaluation of small LMs. | English | +| [squadv2](squadv2/README.md) | Stanford Question Answering Dataset version 2, a reading comprehension benchmark. | English | +| [storycloze](storycloze/README.md) | Tasks to predict story endings, focusing on narrative logic and coherence. | English | +| [super_glue](super_glue/README.md) | A suite of challenging tasks designed to test a range of language understanding skills. | English | +| [swag](swag/README.md) | Situations With Adversarial Generations, predicting the next event in videos. | English | +| [swde](swde/README.md) | Information extraction tasks from semi-structured web pages. 
| English | +| [tinyBenchmarks](tinyBenchmarks/README.md) | Evaluation of large language models with fewer examples using tiny versions of popular benchmarks. | English | +| [tmmluplus](tmmluplus/README.md) | An extended set of tasks under the TMMLU framework for broader academic assessments. | Traditional Chinese | +| [toxigen](toxigen/README.md) | Tasks designed to evaluate language models on their propensity to generate toxic content. | English | +| [translation](translation/README.md) | Tasks focused on evaluating the language translation capabilities of models. | Arabic, English, Spanish, Basque, Hindi, Indonesian, Burmese, Russian, Swahili, Telugu, Chinese | +| [triviaqa](triviaqa/README.md) | A large-scale dataset for trivia question answering to test general knowledge. | English | +| [truthfulqa](truthfulqa/README.md) | A QA task aimed at evaluating the truthfulness and factual accuracy of model responses. | English | +| [turkishmmlu](turkishmmlu/README.md) | A multiple-choice QA test modeled after MMLU, written in Turkish based on Turkish high-school level exams. | Turkish | +| [unitxt](unitxt/README.md) | A number of tasks implemented using the unitxt library for flexible, shareable, and reusable data preparation and evaluation for generative AI. | English | +| [unscramble](unscramble/README.md) | Tasks involving the rearrangement of scrambled sentences to test syntactic understanding. | English | +| [webqs](webqs/README.md) | Web-based question answering tasks designed to evaluate internet search and retrieval. | English | +| [wikitext](wikitext/README.md) | Tasks based on text from Wikipedia articles to assess language modeling and generation. | English | +| [winogender](winogender/README.md) | A diagnostic dataset that tests for gender bias in coreference resolution by measuring how models associate pronouns with different occupations. 
| English | +| [winogrande](winogrande/README.md) | A large-scale dataset for coreference resolution, inspired by the Winograd Schema Challenge. | English | +| [wmdp](wmdp/README.md) | A benchmark with the objective of minimizing performance, based on potentially-sensitive multiple-choice knowledge questions. | English | +| [wmt2016](wmt2016/README.md) | Tasks from the WMT 2016 shared task, focusing on translation between multiple languages. | English, Czech, German, Finnish, Russian, Romanian, Turkish | +| [wsc273](wsc273/README.md) | The Winograd Schema Challenge, a test of commonsense reasoning and coreference resolution. | English | +| [xcopa](xcopa/README.md) | Cross-lingual Choice of Plausible Alternatives, testing reasoning in multiple languages. | Estonian, Haitian, Indonesian, Italian, Quechua, Swahili, Tamil, Thai, Turkish, Vietnamese, Chinese | | [xnli](xnli/README.md) | Cross-Lingual Natural Language Inference to test understanding across different languages. | Arabic, Bulgarian, German, Greek, English, Spanish, French, Hindi, Russian, Swahili, Thai, Turkish, Urdu, Vietnamese, Chinese | -| [xnli_eu](xnli_eu/README.md) | Cross-lingual Natural Language Inference tasks in Basque. | Basque | -| [xquad](xquad/README.md) | Cross-lingual Question Answering Dataset in multiple languages. | Arabic, German, Greek, English, Spanish, Hindi, Romanian, Russian, Thai, Turkish, Vietnamese, Chinese | -| [xstorycloze](xstorycloze/README.md) | Cross-lingual narrative understanding tasks to predict story endings in multiple languages. | Russian, Simplified Chinese, Spanish, Arabic, Hindi, Indonesian, Telugu, Swahili, Basque, Burmese | -| [xwinograd](xwinograd/README.md) | Cross-lingual Winograd schema tasks for coreference resolution in multiple languages. | English, French, Japanese, Portuguese, Russian, Chinese | +| [xnli_eu](xnli_eu/README.md) | Cross-lingual Natural Language Inference tasks in Basque. 
| Basque | +| [xquad](xquad/README.md) | Cross-lingual Question Answering Dataset in multiple languages. | Arabic, German, Greek, English, Spanish, Hindi, Romanian, Russian, Thai, Turkish, Vietnamese, Chinese | +| [xstorycloze](xstorycloze/README.md) | Cross-lingual narrative understanding tasks to predict story endings in multiple languages. | Russian, Simplified Chinese, Spanish, Arabic, Hindi, Indonesian, Telugu, Swahili, Basque, Burmese | +| [xwinograd](xwinograd/README.md) | Cross-lingual Winograd schema tasks for coreference resolution in multiple languages. | English, French, Japanese, Portuguese, Russian, Chinese | ## Multimodal Tasks | Task Family | Description | Modality | diff --git a/lm_eval/tasks/aclue/_generate_configs.py b/lm_eval/tasks/aclue/_generate_configs.py index 8bd1792ae3d200b422c6f804ef7d89252591b2a7..60666bc74fe17dbc6d231eed0e3b05c65f2a4efd 100644 --- a/lm_eval/tasks/aclue/_generate_configs.py +++ b/lm_eval/tasks/aclue/_generate_configs.py @@ -3,12 +3,14 @@ Take in a YAML, and output all other splits with this YAML """ import argparse +import logging import os import yaml from tqdm import tqdm -from lm_eval.utils import eval_logger + +eval_logger = logging.getLogger(__name__) SUBJECTS = { diff --git a/lm_eval/tasks/acpbench/README.md b/lm_eval/tasks/acpbench/README.md index 5ff19061cfa127c0b73b1bde3d879a55fcddd496..264b54ce58d1b74f2745fd8426418cf7e63e8880 100644 --- a/lm_eval/tasks/acpbench/README.md +++ b/lm_eval/tasks/acpbench/README.md @@ -1,13 +1,26 @@ # ACPBench -### Paper +**Homepage:** https://ibm.github.io/ACPBench/ -Title: ACPBench: Reasoning About Action, Change, and Planning -Abstract: https://arxiv.org/pdf/2410.05669 +### Papers + +**Title:** ACPBench: Reasoning About Action, Change, and Planning +**Pdf:** https://arxiv.org/pdf/2410.05669 +**Task:** `acp_bench` +**Abstract:** There is an increasing body of work using Large Language Models (LLMs) as agents for orchestrating workflows and making decisions in domains that require 
planning and multi-step reasoning. As a result, it is imperative to evaluate LMs on core skills required for planning. ACPBench is a benchmark for evaluating the reasoning tasks in the field of planning. The benchmark consists of 7 reasoning tasks over 13 planning domains. The collection is constructed from planning domains described in a formal language. This allows the synthesized problems to have provably correct solutions across many tasks and domains. Further, it allows the luxury to scale without additional human effort, i.e., many additional problems can be created automatically. -Homepage: https://ibm.github.io/ACPBench/ + + +**Title:** ACPBench Hard: Unrestrained Reasoning about Action, Change, and Planning +**Pdf:** https://arxiv.org/abs/2503.24378 +**Task:** `acp_bench_hard` +**Abstract:** + +We introduce ACPBench Hard, a dataset of generative, open-ended questions which LLM models needs to answer in order to plan. Models that perform well on these tasks could in principle be integrated into a planner or be used directly as a policy. We discuss the complexity of these tasks as well as the complexity of validating the correctness of their answers and present validation algorithms for each task. Equipped with these validators, we test the performance of a variety of models on our tasks and find that for most of these tasks, the performance of even the largest models is still subpar. Our experiments show that no model outperforms any other in these tasks, and with a few exceptions, all tested language models score below 65\%, indicating that even the current frontier language models as well as so-called reasoning models have a long way to go before they can reliably reason about planning. + +The dataset is available on [HuggingFace](https://huggingface.co/datasets/ibm-research/acp_bench). 
### Citation @@ -23,6 +36,19 @@ Homepage: https://ibm.github.io/ACPBench/ publisher = {{AAAI} Press}, year = {2025} } + +@misc{KokelKSS25ACPHard, + title = {ACPBench Hard: Unrestrained Reasoning about Action, Change, and Planning}, + author = {Harsha Kokel and + Michael Katz and + Kavitha Srinivas and + Shirin Sohrabi}, + year = {2025}, + eprint = {2503.24378}, + archivePrefix = {arXiv}, + primaryClass = {cs.AI}, + url = {https://arxiv.org/abs/2503.24378}, +} ``` ### Groups, Tags, and Tasks @@ -33,9 +59,13 @@ Homepage: https://ibm.github.io/ACPBench/ #### Tags -* `acp_bench` : Evaluates `acp_bool_cot_2shot` and `acp_mcq_cot_2shot` +* `acp_bench` : Evaluates `acp_bool_cot_2shot` and `acp_mcq_cot_2shot` (Main variant for ACPBench paper) * `acp_bool_cot_2shot` : Evaluates `acp_areach_bool`, `acp_app_bool`, `acp_just_bool`, `acp_land_bool`, `acp_prog_bool`, `acp_reach_bool`, `acp_val_bool` with chain-of-thought and 2 shots * `acp_mcq_cot_2shot` : Evaluates `acp_areach_mcq`, `acp_app_mcq`, `acp_just_mcq`, `acp_land_mcq`, `acp_prog_mcq`, `acp_reach_mcq`, `acp_val_mcq` with chain-of-thought and 2 shots +* `acp_bench_hard` : Evaluates `acp_gen_2shot` (Main variant for ACPBench Hard paper) +* `acp_gen_2shot` : Evaluates `acp_areach_gen`, `acp_app_gen`, `acp_just_gen`, `acp_land_gen`, `acp_nexta_gen`, `acp_prog_gen`, `acp_reach_gen`, `acp_val_gen` with 2 shots +* `acp_bench_hard_with_pddl` : Evaluates `acp_gen_2shot_with_pddl` +* `acp_gen_2shot_with_pddl` : Evaluates `acp_areach_gen_with_pddl`, `acp_app_gen_with_pddl`, `acp_just_gen_with_pddl`, `acp_land_gen_with_pddl`, `acp_nexta_gen_with_pddl`, `acp_prog_gen_with_pddl`, `acp_reach_gen_with_pddl`, `acp_val_gen_with_pddl` with 2 shots #### Tasks @@ -57,6 +87,26 @@ Homepage: https://ibm.github.io/ACPBench/ * `acp_reach_mcq` * `acp_val_mcq` +8 Generative tasks (with just natural language description in context) +* `acp_areach_gen` +* `acp_app_gen` +* `acp_just_gen` +* `acp_land_gen` +* `acp_nexta_gen` +* `acp_prog_gen` +* 
`acp_reach_gen` +* `acp_val_gen` + +and the same 8 generative tasks with natural language as well as the PDDL description of the domain and problem in context. +* `acp_areach_gen_with_pddl` +* `acp_app_gen_with_pddl` +* `acp_just_gen_with_pddl` +* `acp_land_gen_with_pddl` +* `acp_nexta_gen_with_pddl` +* `acp_prog_gen_with_pddl` +* `acp_reach_gen_with_pddl` +* `acp_val_gen_with_pddl` + > ! The evaluation scripts are taken from original github https://github.com/IBM/ACPBench @@ -77,3 +127,4 @@ If other tasks on this dataset are already supported: ### Change Log * 03/17/2025 Initial Commit +* 05/13/2025 Adding ACPBench Hard tasks (with and without PDDL) diff --git a/lm_eval/tasks/acpbench/gen_2shot/_gen_yaml_2shot b/lm_eval/tasks/acpbench/gen_2shot/_gen_yaml_2shot new file mode 100644 index 0000000000000000000000000000000000000000..d7316051361973ab131eb43f6e66650d5b2ca5b9 --- /dev/null +++ b/lm_eval/tasks/acpbench/gen_2shot/_gen_yaml_2shot @@ -0,0 +1,25 @@ +tag: + - acp_gen_2shot + - acp_bench_hard +dataset_path: ibm-research/acp_bench +test_split: test +doc_to_target: "{{answer}}" +output_type: generate_until +num_fewshot: 2 +generation_kwargs: + until: + - "\n\n\n\n" + - "\n\n" + - "**Question**:" + - "**Question:**" + - "Q:" + do_sample: false + max_gen_toks: 1000 + temperature: 0.0 +metadata: + version: 1.0 +process_results: !function acp_utils.process_acp_results +metric_list: + - metric: "score" + aggregation: mean + higher_is_better: True diff --git a/lm_eval/tasks/acpbench/gen_2shot/acp_grammar.lark b/lm_eval/tasks/acpbench/gen_2shot/acp_grammar.lark new file mode 100644 index 0000000000000000000000000000000000000000..036bd675faacb044ca5bb2ef66b3dfac47943815 --- /dev/null +++ b/lm_eval/tasks/acpbench/gen_2shot/acp_grammar.lark @@ -0,0 +1,23 @@ +NAME: /[a-zA-Z][a-zA-Z0-9-_]*/ +LPAR : "(" +RPAR : ")" +LSPAR: "[" +RSPAR: "]" +COMMA: "," +WS: /[ \n]/ + +action_none : "None" + +action_name : LPAR NAME (WS NAME)* RPAR + +action_list : (action_name WS?)* + +prog_list 
: action_name* (COMMA action_name)* + +progression_list : LSPAR prog_list RSPAR LSPAR prog_list RSPAR + +act : action_name | action_none + +index: /[0-9]+[0-9]*/ + +start: action_list diff --git a/lm_eval/tasks/acpbench/gen_2shot/acp_utils.py b/lm_eval/tasks/acpbench/gen_2shot/acp_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5051b68cbf7b5ef384f2ec498f2759409383c7b7 --- /dev/null +++ b/lm_eval/tasks/acpbench/gen_2shot/acp_utils.py @@ -0,0 +1,1128 @@ +import json +import os +from abc import ABC, abstractmethod +from collections import defaultdict +from pathlib import Path + +from lm_eval.api.registry import register_filter +from lm_eval.filters.extraction import RegexFilter + + +try: + import tempfile + + import tarski + from kstar_planner import planners as kp + from lark import Lark + from lark.lexer import Token + from lark.visitors import Visitor + from pddl.core import Problem + from pddl.parser.domain import DomainParser + from pddl.parser.problem import ProblemParser + from tarski.grounding.common import StateVariableLite + from tarski.grounding.lp_grounding import LPGroundingStrategy + from tarski.io import PDDLReader + from tarski.io import fstrips as iofs + from tarski.syntax.formulas import is_atom + from tarski.syntax.transform.action_grounding import ( + ground_schema_into_plain_operator_from_grounding, + ) + from tarski.util import SymbolIndex +except ModuleNotFoundError: + raise ModuleNotFoundError( + "`lark>=1.1.9`, `tarski[clingo]==0.8.2`, `pddl==0.4.2` and `kstar-planner==1.4.2` are required for evaluating the generative tasks. 
\ +Please install via pip install lm-eval[acpbench] or pip install -e .[acpbench]", + ) + + +######################################################################### +# Grammar + + +GRAMMAR_FILE = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "acp_grammar.lark" +) + + +class ACPBench_Visitor(Visitor): + def __init__(self) -> None: + super().__init__() + self.action_lists = None + self.action_names = None + self.progression_lists = None + self.prog_lists = None + self.indexes = None + + def action_list(self, tree): + self.action_lists = [] + + def prog_list(self, tree): + if self.prog_lists is not None: + self.progression_lists.append(self.prog_lists) + self.prog_lists = [] + + def progression_list(self, tree): + self.progression_lists = [] + + def action_none(self, tree): + self.action_names = "None" + + def action_name(self, tree): + act_name = "(" + "".join(tree.children[1:-1]) + ")" + self.action_names = act_name + if self.action_lists is not None: + self.action_lists.append(act_name) + if self.prog_lists is not None: + self.prog_lists.append(act_name) + + def index(self, tree): + self.indexes = "".join(tree.children) + if not self.indexes.isnumeric(): + self.indexes = None + + +class ACPGrammarParser(object): + def __init__(self, task) -> None: + self.task = task + with open(GRAMMAR_FILE) as f: + grammar = f.read() + self.acp_parser = Lark(grammar, start=task, parser="lalr") + + def parse(self, input, debug=False): + def ignore_errors(e): + if hasattr(e, "token") and e.token.type == "$END": + for x in e.expected: + if x != "WS": + e.interactive_parser.feed_token( + Token(x, self.acp_parser.get_terminal(x).pattern.value) + ) + + return True + + input = input.replace("\n", "") + input = input.strip() + try: + tree = self.acp_parser.parse(input, on_error=ignore_errors) + + if debug: + print(tree) + visitor = ACPBench_Visitor() + visitor.visit_topdown(tree) + if self.task == "action_list": + return visitor.action_lists + elif self.task == "act": + 
return visitor.action_names + elif self.task == "action_name": + return visitor.action_names + elif self.task == "index": + return visitor.indexes + elif self.task == "progression_list": + if visitor.prog_lists not in visitor.progression_lists: + visitor.progression_lists.append(visitor.prog_lists) + return visitor.progression_lists + except Exception as e: + if debug: + print("exception") + print(e) + return None + + +############################################################################## +# Utils + + +# Used in next action +def is_on_optimal_plan(domain, problem, action, opt): + with ( + tempfile.NamedTemporaryFile() as domain_temp, + tempfile.NamedTemporaryFile() as problem_temp, + ): + with open(str(domain_temp.name), "w", encoding="utf8") as file: + file.write(domain.lower()) + with open(str(problem_temp.name), "w", encoding="utf8") as file: + file.write(problem.lower()) + + # Here, we need to keep the temp files live until the end of the function + try: + P = STRIPS(str(domain_temp.name), str(problem_temp.name)) + except Exception: + # Unsolvable + return False + + a = P.get_action_or_none(action[1:-1]) + if a is None: + return False + state = P.init + next_state = progress(state, a) + if opt is None: + # Get an optimal plan cost + plans = generate_optimal_plans_for_problem_state( + P, state, num_plans=1, timeout=5 + ) + opt = len(plans[0]["actions"]) + else: + opt = int(opt) + + # Getting an optimal plan for the next state + next_plans = generate_optimal_plans_for_problem_state( + P, next_state, num_plans=1, timeout=5 + ) + if next_plans is None: + return False + next_opt = len(next_plans[0]["actions"]) + return next_opt + 1 == opt + + +# Used in justification +def is_plan(domain, problem, new_plan): + P = get_STRIPS(domain, problem) + if P is None: + # Unsolvable + return False + + # Check if new_plan is a plan + current_state = P.init + for action in new_plan: + applicable_actions = P.get_applicable_actions(current_state) + app_actions_list = 
[f"({a.name.lower()})" for a in applicable_actions] + if action.lower() not in app_actions_list: + return False + a = applicable_actions[app_actions_list.index(action.lower())] + current_state = progress(current_state, a) + return entails(current_state, P.goal) + + +# Used in action reachability +def get_action_preconditions(domain, problem, action): + P = get_STRIPS(domain, problem) + + assert P is not None, f"Domain\n{domain}\nProblem\n{problem}\nAction: {action}" + a = P.get_action_or_none(action[1:-1]) + if a is None: + return a + + return [f"({f})" for f in a.pres] + + +def generate_optimal_plans_for_problem_state(P, state, num_plans, timeout): + import tempfile + + with ( + tempfile.NamedTemporaryFile() as domain_temp, + tempfile.NamedTemporaryFile() as problem_temp, + ): + create_tmp_dom_prob_replace_init(P, state, domain_temp, problem_temp) + plans = generate_top_q_plans( + domain=str(domain_temp.name), + problem=str(problem_temp.name), + num_plans=num_plans, + quality_bound=1.0, + timeout=timeout, + ) + # print(plans) + if plans is None or len(plans["plans"]) == 0: + return None + return plans["plans"] + + +def generate_top_q_plans(domain, problem, num_plans=10, quality_bound=1.0, timeout=30): + # print("Running K* planner") + plans = kp.plan_unordered_topq( + domain_file=Path(domain), + problem_file=Path(problem), + number_of_plans_bound=num_plans, + quality_bound=quality_bound, + timeout=timeout, + ) + return plans + + +# Used in (action) reachability +def is_unsolvable_new_goal(domain, problem, new_goal): + goal = extract_goal(problem) + new_problem = problem.replace(goal, f"(:goal {new_goal} )") + return is_unsolvable(domain, new_problem) + + +def is_unsolvable(domain, problem): + with ( + tempfile.NamedTemporaryFile() as domain_temp, + tempfile.NamedTemporaryFile() as problem_temp, + ): + with open(str(domain_temp.name), "w", encoding="utf8") as file: + file.write(str(domain)) + with open(str(problem_temp.name), "w", encoding="utf8") as file: + 
file.write(str(problem)) + + plans = kp.plan_unordered_topq( + domain_file=Path(str(domain_temp.name)), + problem_file=Path(str(problem_temp.name)), + quality_bound=1.0, + number_of_plans_bound=1, + timeout=3, + ) + + if len(plans["planner_error"]) > 0: + fl = plans["planner_error"].split("\n")[0] + print(f"Planner error: {fl}") + return False + if plans is None or len(plans["plans"]) == 0: + return plans["unsolvable"] + return False + + +def extract_goal(prob): + a = prob.split("(:goal")[1] + cp = 1 + for i, c in enumerate(a): + if c == ")": + cp -= 1 + if c == "(": + cp += 1 + if cp == 0: + return "(:goal" + a[: i + 1] + + assert False + + +def entails(state, partialstate): + return partialstate <= state + + +def progress(state, act): + assert entails(state, act.pres), ( + "Cannot progress with inconsistent state / action precondition:\n\t Action: " + + act.name + + "\n\t State: \n\t\t" + + "\n\t\t".join(state) + ) + return (state - act.dels) | act.adds + + +def regress(state, act): + assert len(state & act.dels) == 0, ( + "Cannot regress with inconsistent state / action delete effect:\n\t Action: " + + act.name + + "\n\t State: \n\t\t" + + "\n\t\t".join(state) + ) + return (state - act.adds) | act.pres + + +def get_STRIPS(domain, problem): + with ( + tempfile.NamedTemporaryFile() as domain_temp, + tempfile.NamedTemporaryFile() as problem_temp, + ): + with open(str(domain_temp.name), "w", encoding="utf8") as file: + file.write(domain.lower()) + with open(str(problem_temp.name), "w", encoding="utf8") as file: + file.write(problem.lower()) + + try: + P = STRIPS(str(domain_temp.name), str(problem_temp.name)) + return P + except Exception as e: + print(f"||{e}||") + return None + + +def create_tmp_dom_prob_replace_init(P, state, result_domain_file, result_problem_file): + d, p = P.PDDL_replace_init_pddl_parser(state) + with open(str(result_domain_file.name), "w", encoding="utf8") as file: + file.write(str(d)) + with open(str(result_problem_file.name), "w", 
encoding="utf8") as file: + file.write(str(p)) + + return d, p + + +def fix_name(s): + # (act param) + if "(" == s[0] and ")" == s[-1]: + return s[1:-1] + # make it space separated + s = s.replace(", ", " ").replace(",", " ") + # act(param) + if "(" in s: + assert ")" == s[-1], f"Broken name? {s}" + s = s.replace("(", " ").replace(")", "") + # act param + return s + + +def get_atoms_pddl(d, p, atoms): + objs = set() + preds = defaultdict(list) + for atom in atoms: + a = atom.lower().strip().split(" ") + args = a[1:] + preds[a[0]].append(args) + objs |= set(args) + + constants = [o for o in p.objects | d.constants if o.name.lower() in objs] + constants_dict = {} + for c in constants: + constants_dict[c.name.lower()] = c + assert len(objs) == len(constants), ( + f"Could not identify all objects: {objs - set(constants_dict.keys())} not found, {set(constants_dict.keys()) - objs} should not be there" + ) + + state = [] + covered_preds = set() + for f in d.predicates: + name = f.name.lower() + if name in preds: + covered_preds.add(name) + assert len(preds[name][0]) == f.arity, ( + f"The arity does not match: {preds[name]} vs {f.terms}" + ) + # Going over the lists of objects, adding ground predicate for each + for ob in preds[name]: + c = [constants_dict[o] for o in ob] + state.append(f(*c)) + assert len(covered_preds) == len(preds.keys()), ( + f"Covered predicates: \n{sorted(list(covered_preds))} vs \n{sorted(list(preds.keys()))}" + ) + return set(state) + + +class Action: + def __init__(self, name, pre, add, delete): + self.name = name + self.pres = pre + self.adds = add + self.dels = delete + + def __str__(self): + pres = "{" + ", ".join([f"({a})" for a in self.pres]) + "}" + adds = "{" + ", ".join([f"({a})" for a in self.adds]) + "}" + dels = "{" + ", ".join([f"({a})" for a in self.dels]) + "}" + + return f"< {self.name}, {pres}, {adds}, {dels} >" + + def toJSON(self): + return json.dumps( + { + "name": self.name, + "preconditions": [f"({a})" for a in self.pres], + 
"add_effects": [f"({a})" for a in self.adds], + "delete_effects": [f"({a})" for a in self.dels], + }, + sort_keys=True, + indent=4, + ) + + def __repr__(self): + return self.name + + def __eq__(self, action): + return self.name == action.name + + def __hash__(self): + return hash(self.name) + + +class STRIPS: + def __init__(self, domain, problem): + self.domain_file = domain + self.problem_file = problem + self.reader = PDDLReader(raise_on_error=True) + self.reader.parse_domain(domain) + self.problem = self.reader.parse_instance(problem) + (self.grounded_fluents, init, goal, self.operators, self.grounder) = ( + self.ground_problem(self.problem) + ) + + self.fluents = set([fix_name(str(f)) for f in self.grounded_fluents]) + self.fluents_map = dict() + for f in self.grounded_fluents: + self.fluents_map[fix_name(str(f))] = f + self.init = set([fix_name(str(f)) for f in init]) + self.goal = set([fix_name(str(f)) for f in goal]) + self.actions = set() + self.action_map = {} + self.init_fluents = [self.fluents_map[f] for f in self.init] + + self.static_predicates = [i.name for i in self.grounder.static_symbols] + for op in self.operators: + act = self.operator_to_action(op) + self.actions.add(act) + self.action_map[act.name.lower()] = act + + def __str__(self): + fluents = "P = {" + ", ".join([f"({a})" for a in self.fluents]) + "}" + init = "I = {" + ", ".join([f"({a})" for a in self.init]) + "}" + goal = "G = {" + ", ".join([f"({a})" for a in self.goal]) + "}" + actions = "A = {" + "\n ".join([a.__str__() for a in self.actions]) + "}" + return fluents + ",\n" + init + "\n" + goal + "\n" + actions + + def toJSON(self): + actions = [a.toJSON() for a in self.actions] + return json.dumps( + { + "fluents": list(self.fluents), + "initial_state": list(self.init), + "goal": list(self.goal), + "actions": actions, + }, + sort_keys=True, + indent=4, + ) + + def operator_to_action(self, op, check_fluents=True, check_static=False): + adds = { + fix_name(str(f.atom)) for f in 
op.effects if isinstance(f, iofs.AddEffect) + } & self.fluents + dels = { + fix_name(str(f.atom)) for f in op.effects if isinstance(f, iofs.DelEffect) + } & self.fluents + pre = self.fix_pre_name(op.precondition) + if check_fluents: + pre = pre & self.fluents + if check_static: + pre = {p for p in pre if p.split()[0] not in self.static_predicates} + act = Action(fix_name(str(op)), pre, adds, dels) + return act + + def fix_pre_name(self, precondition): + if not is_atom(precondition): + return {fix_name(str(f)) for f in precondition.subformulas} + return {fix_name(str(precondition))} + + def action(self, name): + return self.action_map[fix_name(name).lower()] + + def get_action_or_none(self, name): + if "(" in name and ")" != name[-1]: + return None + return self.action_map.get(fix_name(name).lower(), None) + + def fluent(self, name): + return fix_name(name) + + def static_symbols(self): + return list(self.grounder.static_symbols) + + def fluent_symbols(self): + return list(self.grounder.fluent_symbols) + + def get_grounded_atoms(self, symbol): + variables = SymbolIndex() + lang = symbol.language + key = "atom_" + symbol.name + model = self.grounder._solve_lp() + if ( + key in model + ): # in case there is no reachable ground state variable from that fluent symbol + for binding in model[key]: + binding_with_constants = tuple(lang.get(c) for c in binding) + variables.add(StateVariableLite(symbol, binding_with_constants)) + return variables + + def get_applicable_actions(self, s): + return [a for a in self.actions if entails(s, a.pres)] + + def ground_problem(self, problem): + grounder = LPGroundingStrategy(problem, include_variable_inequalities=True) + action_groundings = grounder.ground_actions() + operators = [] + for action_name, groundings in action_groundings.items(): + action = problem.get_action(action_name) + for grounding in groundings: + operators.append( + ground_schema_into_plain_operator_from_grounding(action, grounding) + ) + + grounded_fluents = set( + 
[ + grounded_fluent.to_atom() + for grounded_fluent in grounder.ground_state_variables().objects + ] + ) + init = [f for f in problem.init.as_atoms() if f in grounded_fluents] + if isinstance(problem.goal, tarski.syntax.Atom): + goal = [problem.goal] + else: + goal = [f for f in problem.goal.subformulas if f in grounded_fluents] + + return (grounded_fluents, init, goal, operators, grounder) + + def get_static(self): + static_symbols = self.static_symbols() + ret = [] + for symbol in static_symbols: + ret.extend(self.get_grounded_atoms(symbol)) + return set([fix_name(str(x)) for x in ret]) + + def PDDL_replace_init_pddl_parser(self, s): + d = DomainParser()(open(self.domain_file, "r").read().lower()) + p = ProblemParser()(open(self.problem_file, "r").read().lower()) + + new_state = get_atoms_pddl(d, p, s | self.get_static()) + + new_p = Problem( + p.name, domain=d, objects=p.objects, init=new_state, goal=p.goal + ) + + return d, new_p + + +def parse_ans(response: str, parser: ACPGrammarParser, task: str): + return [parser.parse(clean_answer(resp, task)) for resp in response] + + +# def parse_ans(response : str, parser : ACPGrammarParser, task : str): +# ans = [parser.parse(clean_answer(resp, task), debug=True) for resp in response] +# if any(elem is None for elem in ans) or any(elem is None for elem in ans[0]): +# return None +# return ans + + +def remove_garbage(s): + while True: + if s.endswith("."): + s = s[:-1] + elif s.endswith("\n"): + s = s[:-2] + else: + break + return s.rstrip() + + +def compare_str(s1, s2): + return remove_garbage(s1).lower() == remove_garbage(s2).lower() + + +def compare(l1, l2): + if not isinstance(l1, list): + return compare_str(l1, l2) + if not isinstance(l2, list): + return False + for i, v in enumerate(l1): + if not compare(v, l2[i]): + return False + return True + + +def check_prog_response(resp): + if ( + "Positive Effects".lower() in resp.lower() + and "Negative Effects".lower() in resp.lower() + ): + if "[" not in resp: + return 
True + return False + + +def clean_answer(resp, task): + # Minor cleanup + if "progression_gen" in task: + # Check for Positive Effects and Negative Effects instead of separation + if check_prog_response(resp): + # replace **Positive Effects** with "[" + # replace **Negative Effects** with "] [" + # append "]" to the end + resp2 = resp.lower() + resp2 = resp2.replace("*", "") + resp2 = resp2.replace("positive effects", "[") + resp2 = resp2.replace("negative effects", "] [") + resp2 = resp2 + "]" + return resp2 + if "action_justification_gen" in task: + # Check for "simplified plan:" + if "simplified plan:" in resp.lower(): + resp2 = resp.lower() + resp2 = resp2.replace("*", "") + resp2 = resp2.split("simplified plan:")[1] + return resp2 + return resp + + +def get_grammar_task(task): + # print(task) + if task == "reachable_atom_gen": + return "act" + elif task == "progression_gen": + return "progression_list" + elif task == "validation_gen": + return "index" + elif task == "reachable_action_gen": + return "act" + elif task == "action_justification_gen": + return "action_list" + elif task == "landmarks_gen": + return "act" + elif task == "goal_closer_gen": + return "action_name" + elif task == "applicable_actions_gen": + return "action_list" + + +############################################################################## +# Evaluators + + +def fix_action_name(a): + assert a.startswith("(") and a.endswith(")") + return "(" + " ".join([x.strip() for x in a[1:-1].split(" ") if len(x) > 0]) + ")" + + +def str_remove_before_first_parentheses(s): + if s.startswith("("): + return s + try: + return s[s.index("(") :] + except Exception: + return "" + + +def str_remove_after_last_parentheses(s): + if s.endswith(")"): + return s + + i = s.rfind(")") + + if i == -1: + return "" + return s[: i + 1] + + +def cleanup_answer(ans): + if isinstance(ans, str): + ans = str_remove_before_first_parentheses(ans) + ans = str_remove_after_last_parentheses(ans) + ans = ans.lower() + ans = 
( + ans.replace(")\n(", ")######(") + .replace("),(", ")######(") + .replace(") (", ")######(") + .split("######") + ) + return ans + if isinstance(ans, list): + res = [] + for x in ans: + res.extend(cleanup_answer(x)) + return res + + +def set_equal(ans1, ans2): + return set(ans1) == set(ans2) + + +class BaseEvaluator(ABC): + def __init__(self) -> None: + self.scores = [] + + @abstractmethod + def get_score(self, ans, doc): + pass + + def add_scores(self, scores): + self.scores.extend(scores) + + def get_avg_score(self): + avg_score = sum(self.scores) / len(self.scores) + return avg_score + + +def get_evaluator(group): + if group == "applicable_actions_gen": + return ApplicabilityEvaluator() + elif group == "progression_gen": + return ProgressionEvaluator() + elif group == "validation_gen": + return ValidationEvaluator() + elif group == "reachable_atom_gen": + return ReachabilityEvaluator() + elif group == "goal_closer_gen": + return NextActionEvaluator() + elif group == "action_justification_gen": + return JustificationEvaluator() + elif group == "landmarks_gen": + return LandmarksEvaluator() + elif group == "reachable_action_gen": + return ActionReachabilityEvaluator() + assert True, f"Group {group} not found" + + +""" +Action Reachability task: generate a valid action that is not applicable to any reachable state. +answer: A subset of actions that are known to be unreachable (not an exhaustive set). + It is empty only when we *know* that there are no such actions. 
+""" + + +class ActionReachabilityEvaluator(BaseEvaluator): + def get_score(self, ans, doc): + real_answer = doc["answer"] + if not real_answer or len(real_answer) == 0: + # The correct answer is None + self.add_scores( + ["none" == x.strip().lower() if x is not None else False for x in ans] + ) + else: + for x in ans: + if x is None: + self.scores.append(False) + continue + action = x.strip().lower() + if action in real_answer: + # The answer is in the subset of stored correct answers + self.scores.append(True) + continue + prec = get_action_preconditions( + doc["PDDL_domain"].lower(), doc["PDDL_problem"].lower(), action + ) + if prec is None: + # The answer does not correspond to a valid action + self.scores.append(False) + else: + # Need to run a planner on a task with the answer action preconditions as the new goal + prec = f"(and {' '.join(prec)})" + self.scores.append( + is_unsolvable_new_goal( + doc["PDDL_domain"].lower(), + doc["PDDL_problem"].lower(), + prec, + ) + ) + + return self.get_avg_score() + + +""" +Action Applicability task: generate all actions that are applicable in the current state. +answer: A set of all applicable actions. 
+""" + + +class ApplicabilityEvaluator(BaseEvaluator): + def get_score(self, ans, doc): + real_answer = doc["answer"] + real_answer = [a.lower() for a in real_answer] + ans = [[fix_action_name(a) for a in x] if x is not None else None for x in ans] + + # Check if the answer is equal (as a set) to the real stored answer + self.add_scores( + [ + set_equal(real_answer, cleanup_answer(x)) if x is not None else False + for x in ans + ] + ) + return self.get_avg_score() + + +def is_subsequence(plan, new_plan): + i = 0 + for a in plan: + if a == new_plan[i]: + i += 1 + if len(new_plan) == i: + # Done + return True + return False + + +def is_subsequence_and_plan(domain, problem, plan, new_plan): + if len(plan) <= len(new_plan): + return False + if not is_subsequence(plan, new_plan): + return False + return is_plan(domain, problem, new_plan) + + +""" +Justification task: generate a proper subsequence of the given plan that is also a plan. +answer: A list of examples of actions that can be removed (ignored in evaluation). +""" + + +class JustificationEvaluator(BaseEvaluator): + def get_score(self, ans, doc): + # Sequence of actions (plan) from the question + if "inputs" in doc: # old field name + seq = doc["inputs"][19:-147] + else: + seq = doc["question"][19:-147] + seq = seq.replace(") (", ")######(").split("######") + for x in ans: + if x is None: + self.scores.append(False) + continue + # An answer plan candidate + x = [fix_action_name(a) for a in x] + if len(x) == 0: + # Wrong answer - never an empty sequence + self.scores.append(0) + continue + # Check if the plan candidate from the answer (a) is a proper subsequence of the plan in the question and (b) is a plan. + self.scores.append( + is_subsequence_and_plan( + doc["PDDL_domain"].lower(), doc["PDDL_problem"].lower(), seq, x + ) + ) + return self.get_avg_score() + + +""" +Landmarks task: generate a fact that is a non-trivial landmark for the current state. 
+answer: A list of facts that are found to be landmarks and a list of facts that are found to be non-landmarks. + +The questions are generated only for cases where all facts either + (a) hold in the current state, + (b) true in goal, + (c) are found to be landmarks, or + (d) are found to be non-landmarks. +In such cases, the evaluation is simple, it does not require checking whether a fact is a landmark, it was +already done during question generation. +""" + + +class LandmarksEvaluator(BaseEvaluator): + def get_score(self, ans, doc): + # The set of facts that are found to be landmarks + real_answer = doc["answer"] + real_answer_yes = [a.lower() for a in real_answer["yes"]] + + for x in ans: + if x is None: + self.scores.append(False) + continue + if x.strip().lower() in real_answer_yes: + # The answer fact is known to be landmark + self.scores.append(True) + elif x.strip().lower() == "none": + # The answer is none, correct only if there are no known landmarks, + # since we only generate questions when that means that there are no non-trivial landmarks + self.scores.append(len(real_answer_yes) == 0) + else: + # All other cases the answer is incorrect + self.scores.append(False) + + return self.get_avg_score() + + +""" +Next Action task: generate an action that takes us closer to the goal. +answer: + (a) A list of applicable actions that are known to be correct answers + (b) A list of applicable actions that are known to be incorrect answers + (c) The rest of the applicable actions (maybe). 
+""" + + +class NextActionEvaluator(BaseEvaluator): + def get_score(self, ans, doc): + real_answer = doc["answer"] + real_answer_yes = [a.lower() for a in real_answer["yes"]] + real_answer_no = [a.lower() for a in real_answer["no"]] + real_answer_maybe = [a.lower() for a in real_answer["maybe"]] + # The cost of the optimal plan from the current state + opt = real_answer.get("opt", None) + for x in ans: + if x is None: + self.scores.append(False) + continue + action = x.strip().lower() + if action in real_answer_yes: + # Known to be correct + self.scores.append(True) + elif action in real_answer_no: + # Known to be incorrect + self.scores.append(False) + elif action not in real_answer_maybe: + # Not applicable, must be incorrect + self.scores.append(False) + else: + # Unknown, need to run a planner to check whether the state that results from applying the action is closer to the goal + # meaning has smaller optimal plan cost. + self.scores.append( + is_on_optimal_plan( + doc["PDDL_domain"].lower(), + doc["PDDL_problem"].lower(), + action, + opt, + ) + ) + + return self.get_avg_score() + + +""" +Progression task: generate the positive and negative effects of an action in the current state. +answer: + (a) A list of facts that were false and become true, when the action is applied + (b) A list of facts that were true and become false, when the action is applied +""" + + +class ProgressionEvaluator(BaseEvaluator): + def get_score(self, ans, doc): + real_answer = doc["answer"] + real_answer_pos = [a.lower() for a in real_answer["pos"]] + real_answer_neg = [a.lower() for a in real_answer["neg"]] + + for x in ans: + # The answer should be two lists. We allow for a single list and assume that the second one is empty (relaxed evaluation). 
+ if x is None or len(x) > 2 or len(x) < 1: + self.scores.append(False) + else: + p = cleanup_answer(x[0]) + if len(x) == 2: + n = cleanup_answer(x[1]) + else: + # Assuming the last element is dropped because it is empty + n = [] + # Check if the answer is equal as sets to the correct answers. + ans = [set_equal(real_answer_pos, p), set_equal(real_answer_neg, n)] + self.scores.append(all(ans)) + + return self.get_avg_score() + + +""" +Reachability task: generate a valid fact that will never become true in any reachable state. +answer: A subset of facts that are known to be unreachable (not an exhaustive set). + It is empty only when we *know* that there are no such facts. +""" + + +class ReachabilityEvaluator(BaseEvaluator): + def get_score(self, ans, doc): + real_answer = doc["answer"] + real_answer = [f"({x.strip().lower()})" for x in real_answer] + + if len(real_answer) == 0: + # The correct answer is None + self.add_scores( + ["none" == x.strip().lower() if x is not None else False for x in ans] + ) + else: + for x in ans: + if x is None: + self.scores.append(False) + elif x.strip().lower() in real_answer: + # The answer is in the subset of stored correct answers + self.scores.append(True) + else: + # Need to run a planner on a task with the answer fact as the new goal + atom = x.strip().lower() + self.scores.append( + is_unsolvable_new_goal( + doc["PDDL_domain"].lower(), + doc["PDDL_problem"].lower(), + atom, + ) + ) + + return self.get_avg_score() + + +""" +Validation task: generate an index of the first inapplicable action in the given sequence. +answer: the correct index. 
+""" + + +class ValidationEvaluator(BaseEvaluator): + def get_score(self, ans, doc): + real_answer = str(doc["answer"]) + assert int(real_answer) >= 0, ( + f"The index must be non-negative, received {real_answer}" + ) + # Exact match + self.add_scores( + [ + real_answer.lower() == x.strip().lower() if x is not None else False + for x in ans + ] + ) + + return self.get_avg_score() + + +############################################################################## + + +def dump_item(item, **kwargs): + return json.dumps(item) + + +def parse_prediction(prediction): + try: + ans = json.loads(prediction.strip()) + response = ans.get("answer", None) + return response + except Exception as e: + print(f"Exception occurred {e}") + return prediction + + +@register_filter("ACP_grammar_filter") +class ACPGrammarFilter(RegexFilter): + """Filtering Index using""" + + def __init__(self, *args, **kwargs): + self.parser = ACPGrammarParser(kwargs["grammar_task"]) + self.clean = kwargs["clean"] if "clean" in kwargs else None + + def clean_pos_neg(self, resp): + # Check for Positive Effects and Negative Effects instead of separation + if check_prog_response(resp): + resp2 = resp.lower() + resp2 = resp2.replace("*", "") + resp2 = resp2.replace("positive effects", "[") + resp2 = resp2.replace("negative effects", "] [") + resp2 = resp2 + "]" + return resp2 + return resp + + def clean_simplified_plan(self, resp): + # Check for "simplified plan:" + if "simplified plan:" in resp.lower(): + resp2 = resp.lower() + resp2 = resp2.replace("*", "") + resp2 = resp2.split("simplified plan:")[1] + return resp2 + return resp + + def apply(self, resps, docs): + if self.clean == "pos_neg": + filtered_resps = [ + [self.parser.parse(self.clean_pos_neg(r)) for r in resp] + for resp in resps + ] + elif self.clean == "simplified plan": + filtered_resps = [ + [self.parser.parse(self.clean_simplified_plan(r)) for r in resp] + for resp in resps + ] + else: + filtered_resps = [[self.parser.parse(r) for r in 
resp] for resp in resps] + return filtered_resps + + +def process_acp_results(doc, results): + return {"score": get_evaluator(doc["group"]).get_score(results, doc)} + + +def get_score(references, predictions, **kwargs): + # print(f"References: {references}") + # print(f"Predictions: {predictions}") + data = json.loads(references[0].strip()) + real_ans = data["answer"] + task = data["group"] + + responses = [parse_prediction(prediction) for prediction in predictions] + + print(f"Real answer: {real_ans}") + print(f"Model answers: {responses}") + parser = ACPGrammarParser(get_grammar_task(task)) + ans = parse_ans(responses, parser, task) + + print(f"Parsed model answers: {ans}") + score = get_evaluator(task).get_score(ans, data) + + return {"get_score": score} diff --git a/lm_eval/tasks/acpbench/gen_2shot/act_reach.yaml b/lm_eval/tasks/acpbench/gen_2shot/act_reach.yaml new file mode 100644 index 0000000000000000000000000000000000000000..36850e9caf8efdc7d5fd82de52de305d77a83ac6 --- /dev/null +++ b/lm_eval/tasks/acpbench/gen_2shot/act_reach.yaml @@ -0,0 +1,19 @@ +task: acp_areach_gen +dataset_name: acp_areach_gen +include: _gen_yaml_2shot +fewshot_config: + sampler: first_n + samples: + - context: "A robot is in a grid and can only move to places that are connected to its current position. The grid size is 5x5, and the locations are of the form fi-jf (e.g., f3-2f or f0-1f). The grid cells are connected to their neighbors (e.g., f1-2f is connected to the four neighbors f0-2f, f2-2f, f1-1f, and f1-3f). Some positions on the grid are locked and can be opened with a key of a matching shape. The robot has an arm that can pick up a key when the key is in same location as the robot and the arm is empty. There are 2 keys in 0 different shapes: Key key0-0 is of shape shape0, Key key0-1 is of shape shape0. Currently, the robot is at position f2-2f and its arm is empty. All the positions are open except the following: f2-0f has shape0 shaped lock, f4-2f has shape0 shaped lock. 
Key key0-0 is at position f1-2f. Key key0-1 is at position f1-3f. The available actions are: (unlock ?curpos ?lockpos ?key ?shape) - unlock the place ?lockpos with key ?key of shape ?shape from the current position place ?curpos, (move ?curpos ?nextpos) - move from place ?curpos to place ?nextpos, (pickup ?curpos ?key) - retrieve the key ?key from its current position ?curpos, (pickup-and-loose ?curpos ?newkey ?oldkey) - pick up the key ?newkey from the current position ?curpos and loose the key ?oldkey which is being held, and (putdown ?curpos ?key) - put the key ?key at the current position place ?curpos." + question: "What action can never become applicable, in any state reachable from the current state?" + answer: "(unlock f0-3f f0-4f key0-0 shape0)" + - context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. There are 2 trucks and 1 airplane, as well as 4 packages. There are 4 locations across 2 cities. The locations are in cities as follows: l1-1 and l1-0 are in c1; l0-0 and l0-1 are in c0. Currently, t1 is at l1-1, a0 is at l1-0, p0 is at l0-0, t0 is at l0-1, p2 is in a0, p1 is in t1, p3 is in t0. 
The available actions are: (load-truck ?obj ?truck ?loc) - load the object ?obj from location ?loc into the truck ?truck, (load-airplane ?obj ?airplane ?loc) - load object ?obj into airplane ?airplane at location ?loc, (unload-truck ?obj ?truck ?loc) - offload the object ?obj from the truck ?truck at location ?loc, (unload-airplane ?obj ?airplane ?loc) - offload the object ?obj from the airplane ?airplane at location ?loc, (drive-truck ?truck ?loc-from ?loc-to ?city) - navigate the truck ?truck from location ?loc-from in city ?city to location ?loc-to in the same city, and (fly-airplane ?airplane ?loc-from ?loc-to) - operate the airplane ?airplane from airport ?loc-from to airport ?loc-to." + question: "What action can never become applicable, in any state reachable from the current state?" + answer: "(drive-truck t0 l1-1 l0-0 c0)" +doc_to_text: "**Question**: {{context}} {{question}} Each action starts with an opening parenthesis and ends with closing parenthesis. Provide one action or None. **Final Answer**:" +filter_list: + - name: "acp_grammar_parse" + filter: + - function: "ACP_grammar_filter" + grammar_task: "act" + - function: "take_first" diff --git a/lm_eval/tasks/acpbench/gen_2shot/app.yaml b/lm_eval/tasks/acpbench/gen_2shot/app.yaml new file mode 100644 index 0000000000000000000000000000000000000000..452254aba0e61200cc6ac95ac07ce26b06eb2961 --- /dev/null +++ b/lm_eval/tasks/acpbench/gen_2shot/app.yaml @@ -0,0 +1,19 @@ +task: acp_app_gen +dataset_name: acp_app_gen +include: _gen_yaml_2shot +fewshot_config: + sampler: first_n + samples: + - context: "A robot is in a grid and can only move to places that are connected to its current position. The grid size is 5x5, and the locations are of the form fi-jf (e.g., f3-2f or f0-1f). The grid cells are connected to their neighbors (e.g., f1-2f is connected to the four neighbors f0-2f, f2-2f, f1-1f, and f1-3f). Some positions on the grid are locked and can be opened with a key of a matching shape. 
The robot has an arm that can pick up a key when the key is in same location as the robot and the arm is empty. There are 2 keys in 1 different shapes: Key key0-1 is of shape shape0, Key key0-0 is of shape shape0. Currently, the robot is at position f3-2f and its arm is empty. All the positions are open except the following: f2-0f has shape0 shaped lock, f4-2f has shape0 shaped lock. Key key0-0 is at position f2-2f. Key key0-1 is at position f1-3f. The available actions are: (unlock ?curpos ?lockpos ?key ?shape) - unlock the place ?lockpos with key ?key of shape ?shape from the current position place ?curpos, (move ?curpos ?nextpos) - travel from the current position ?curpos to the next position ?nextpos, (pickup ?curpos ?key) - pick up key ?key from place ?curpos, (pickup-and-loose ?curpos ?newkey ?oldkey) - pick up the key ?newkey at the current position place ?curpos and loose the key ?oldkey being held, and (putdown ?curpos ?key) - put down key ?key at current position place ?curpos." + question: "Generate the list of all ground actions that are applicable in this state." + answer: "[(move f3-2f f3-1f), (move f3-2f f2-2f), (move f3-2f f3-3f)]" + - context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. There are 2 trucks and 1 airplane, as well as 4 packages. There are 4 locations across 2 cities. The locations are in cities as follows: l0-0 and l0-1 are in c0; l1-0 and l1-1 are in c1. Currently, t1, p2, and p3 are at l1-0, a0 is at l0-0, t0 is at l0-1, p1 and p0 are in t1. 
The available actions are: (load-truck ?obj ?truck ?loc) - load object ?obj into truck ?truck at location ?loc, (load-airplane ?obj ?airplane ?loc) - load object ?obj into airplane ?airplane at location ?loc, (unload-truck ?obj ?truck ?loc) - unload object ?obj from truck ?truck at location ?loc, (unload-airplane ?obj ?airplane ?loc) - remove the object ?obj from the airplane ?airplane and place it on the location ?loc, (drive-truck ?truck ?loc-from ?loc-to ?city) - navigate the truck ?truck from its current location ?loc-from in city ?city to the new location ?loc-to within the same city, and (fly-airplane ?airplane ?loc-from ?loc-to) - fly airplane ?airplane from airport ?loc-from to airport ?loc-to." + question: "Generate the list of all ground actions that are applicable in this state." + answer: "[(drive-truck t1 l1-0 l1-0 c1), (drive-truck t0 l0-1 l0-0 c0), (load-truck p2 t1 l1-0), (unload-truck p0 t1 l1-0), (drive-truck t0 l0-1 l0-1 c0), (fly-airplane a0 l0-0 l1-0), (fly-airplane a0 l0-0 l0-0), (unload-truck p1 t1 l1-0), (drive-truck t1 l1-0 l1-1 c1), (load-truck p3 t1 l1-0)]" +doc_to_text: "**Question**: {{context}} {{question}} Each action starts with an opening parenthesis and ends with closing parenthesis. Provide only the actions. **Final Answer**:" +filter_list: + - name: "acp_grammar_parse" + filter: + - function: "ACP_grammar_filter" + grammar_task: "action_list" + - function: "take_first" diff --git a/lm_eval/tasks/acpbench/gen_2shot/just.yaml b/lm_eval/tasks/acpbench/gen_2shot/just.yaml new file mode 100644 index 0000000000000000000000000000000000000000..78daa393c3ffc383857b65091220cb3b180614e7 --- /dev/null +++ b/lm_eval/tasks/acpbench/gen_2shot/just.yaml @@ -0,0 +1,20 @@ +task: acp_just_gen +dataset_name: acp_just_gen +include: _gen_yaml_2shot +fewshot_config: + sampler: first_n + samples: + - context: "A robot is in a grid and can only move to places that are connected to its current position. 
The grid size is 5x5, and the locations are of the form fi-jf (e.g., f3-2f or f0-1f). The grid cells are connected to their neighbors (e.g., f1-2f is connected to the four neighbors f0-2f, f2-2f, f1-1f, and f1-3f). Some positions on the grid are locked and can be opened with a key of a matching shape. The robot has an arm that can pick up a key when the key is in same location as the robot and the arm is empty. There are 2 keys in 1 different shapes: Key key0-0 is of shape shape0, Key key0-1 is of shape shape0. Currently, the robot is at position f3-3f and its arm is empty. All the positions are open except the following: f4-2f has shape0 shaped lock, f2-0f has shape0 shaped lock. Key key0-0 is at position f2-2f. Key key0-1 is at position f1-3f. The available actions are: (unlock ?curpos ?lockpos ?key ?shape) - unlock place ?lockpos with key ?key of shape ?shape from current position place ?curpos, (move ?curpos ?nextpos) - move from ?curpos to ?nextpos, (pickup ?curpos ?key) - retrieve the key ?key from its current position ?curpos, (pickup-and-loose ?curpos ?newkey ?oldkey) - pick up key ?newkey at current position place ?curpos and loose key ?oldkey being held, and (putdown ?curpos ?key) - put down the key ?key at the current position ?curpos. The goal is to reach a state where the following facts hold: Key key0-0 is at f2-0f location and Key key0-1 is at f1-3f location." + question: "Simplify the plan [(move f3-3f f3-2f), (move f3-2f f2-2f), (pickup f2-2f key0-0), (move f2-2f f2-1f), (putdown f2-1f key0-0), (pickup f2-1f key0-0), (unlock f2-1f f2-0f key0-0 shape0), (move f2-1f f2-0f), (putdown f2-0f key0-0)] by removing either a single action or a pair of consecutive actions, while still maintaining a valid plan. Provide the resulting simplified plan." 
+ answer: "[(move f3-3f f3-2f), (move f3-2f f2-2f), (pickup f2-2f key0-0), (move f2-2f f2-1f), (unlock f2-1f f2-0f key0-0 shape0), (move f2-1f f2-0f), (putdown f2-0f key0-0)]" + - context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. There are 2 trucks and 1 airplane, as well as 4 packages. There are 4 locations across 2 cities. The locations are in cities as follows: l1-1 and l1-0 are in c1; l0-0 and l0-1 are in c0. Currently, p2, p1, and p3 are at l1-0, p0 and t1 are at l1-1, t0 is at l0-1, a0 is at l0-0. The available actions are: (load-truck ?obj ?truck ?loc) - load the object ?obj from location ?loc into the truck ?truck, (load-airplane ?obj ?airplane ?loc) - load the object ?obj from location ?loc onto the airplane ?airplane, (unload-truck ?obj ?truck ?loc) - unload the object ?obj from the truck ?truck at location ?loc, (unload-airplane ?obj ?airplane ?loc) - remove the object ?obj from the airplane ?airplane and place it on the location ?loc, (drive-truck ?truck ?loc-from ?loc-to ?city) - drive truck ?truck from location ?loc-from in city ?city to location ?loc-to in the same city, and (fly-airplane ?airplane ?loc-from ?loc-to) - fly the airplane ?airplane from location ?loc-from to location ?loc-to. The goal is to reach a state where the following facts hold: p3 is at l0-1, p2 is at l1-0, p0 is at l0-0, and p1 is at l1-0." 
+ question: "Simplify the plan [(fly-airplane a0 l0-0 l1-0), (fly-airplane a0 l1-0 l0-0), (load-truck p0 t1 l1-1), (drive-truck t1 l1-1 l1-0 c1), (unload-truck p0 t1 l1-0), (fly-airplane a0 l0-0 l1-0), (load-airplane p0 a0 l1-0), (load-airplane p3 a0 l1-0), (fly-airplane a0 l1-0 l0-0), (unload-airplane p0 a0 l0-0), (unload-airplane p3 a0 l0-0), (drive-truck t0 l0-1 l0-0 c0), (load-truck p3 t0 l0-0), (drive-truck t0 l0-0 l0-1 c0), (unload-truck p3 t0 l0-1)] by removing either a single action or a pair of consecutive actions, while still maintaining a valid plan. Provide the resulting simplified plan." + answer: "[(load-truck p0 t1 l1-1), (drive-truck t1 l1-1 l1-0 c1), (unload-truck p0 t1 l1-0), (fly-airplane a0 l0-0 l1-0), (load-airplane p0 a0 l1-0), (load-airplane p3 a0 l1-0), (fly-airplane a0 l1-0 l0-0), (unload-airplane p0 a0 l0-0), (unload-airplane p3 a0 l0-0), (drive-truck t0 l0-1 l0-0 c0), (load-truck p3 t0 l0-0), (drive-truck t0 l0-0 l0-1 c0), (unload-truck p3 t0 l0-1)]" +doc_to_text: "**Question**: {{context}} {{question}} **Final Answer**:" +filter_list: + - name: "acp_grammar_parse" + filter: + - function: "ACP_grammar_filter" + grammar_task: "action_list" + clean: "simplified plan" + - function: "take_first" diff --git a/lm_eval/tasks/acpbench/gen_2shot/land.yaml b/lm_eval/tasks/acpbench/gen_2shot/land.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a3c65fd37670d9eb3893f0102153bbcca86e08f7 --- /dev/null +++ b/lm_eval/tasks/acpbench/gen_2shot/land.yaml @@ -0,0 +1,19 @@ +task: acp_land_gen +dataset_name: acp_land_gen +include: _gen_yaml_2shot +fewshot_config: + sampler: first_n + samples: + - context: "A robot is in a grid and can only move to places that are connected to its current position. The grid size is 5x5, and the locations are of the form fi-jf (e.g., f3-2f or f0-1f). The grid cells are connected to their neighbors (e.g., f1-2f is connected to the four neighbors f0-2f, f2-2f, f1-1f, and f1-3f). 
Some positions on the grid are locked and can be opened with a key of a matching shape. The robot has an arm that can pick up a key when the key is in same location as the robot and the arm is empty. There are 2 keys in 1 different shapes: Key key0-1 is of shape shape0, Key key0-0 is of shape shape0. Currently, the robot is at position f3-0f and its arm is empty. All the positions are open except the following: f4-2f has shape0 shaped lock. Key key0-0 is at position f3-0f. Key key0-1 is at position f1-3f. The goal is to reach a state where the following facts hold: Key key0-0 is at f2-0f location and Key key0-1 is at f1-3f location. The available propositions are: (at ?r ?x) - Key ?r is at ?x location, (at-robot ?x) - Robot is at ?x location, (locked ?x) - Location ?x is locked, (holding ?k) - Robot is holding ?k, (open ?x) - Location ?x is open, and (arm-empty) - Robot's arm is empty." + question: "Generate a non-trivial fact landmark, one that does not hold in the initial state or goal." + answer: "(holding key0-0)" + - context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. There are 2 trucks and 1 airplane, as well as 4 packages. There are 4 locations across 2 cities. The locations are in cities as follows: l1-0 and l1-1 are in c1; l0-1 and l0-0 are in c0. Currently, a0 and p2 are at l1-0, t0 is at l0-0, t1 is at l1-1, p3 and p1 are in a0, p0 is in t1. The goal is to reach a state where the following facts hold: p0 is at l0-0, p2 is at l1-0, p1 is at l1-0, and p3 is at l0-1. The available propositions are: (at ?obj ?loc) - ?obj is at ?loc and (in ?obj1 ?obj2) - ?obj1 is in ?obj2." + question: "Generate a non-trivial fact landmark, one that does not hold in the initial state or goal." 
+ answer: "(in p3 t0)" +doc_to_text: "**Question**: {{context}} {{question}} Provide only the ground proposition or None. **Final Answer**:" +filter_list: + - name: "acp_grammar_parse" + filter: + - function: "ACP_grammar_filter" + grammar_task: "act" + - function: "take_first" diff --git a/lm_eval/tasks/acpbench/gen_2shot/next_act.yaml b/lm_eval/tasks/acpbench/gen_2shot/next_act.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a264d6449b6630ff008bc5de94e47692dc63a2c8 --- /dev/null +++ b/lm_eval/tasks/acpbench/gen_2shot/next_act.yaml @@ -0,0 +1,19 @@ +task: acp_nexta_gen +dataset_name: acp_nexta_gen +include: _gen_yaml_2shot +fewshot_config: + sampler: first_n + samples: + - context: "A robot is in a grid and can only move to places that are connected to its current position. The grid size is 5x5, and the locations are of the form fi-jf (e.g., f3-2f or f0-1f). The grid cells are connected to their neighbors (e.g., f1-2f is connected to the four neighbors f0-2f, f2-2f, f1-1f, and f1-3f). Some positions on the grid are locked and can be opened with a key of a matching shape. The robot has an arm that can pick up a key when the key is in same location as the robot and the arm is empty. There are 2 keys in 1 different shapes: Key key0-1 is of shape shape0, Key key0-0 is of shape shape0. Currently, the robot is at position f4-0f and its arm is empty. All the positions are open except the following: f4-2f has shape0 shaped lock. Key key0-0 is at position f3-0f. Key key0-1 is at position f1-3f. The goal is to reach a state where the following facts hold: Key key0-0 is at f2-0f location and Key key0-1 is at f1-3f location. 
The available actions are: (unlock ?curpos ?lockpos ?key ?shape) - unlock place ?lockpos with key ?key of shape ?shape from current position place ?curpos, (move ?curpos ?nextpos) - travel from the current position ?curpos to the next position ?nextpos, (pickup ?curpos ?key) - pick up key ?key from place ?curpos, (pickup-and-loose ?curpos ?newkey ?oldkey) - pick up the key ?newkey at the current position place ?curpos and loose the key ?oldkey being held, and (putdown ?curpos ?key) - put down the key ?key at the current position ?curpos." + question: "What is the next action that takes us towards the goal?" + answer: "(move f4-0f f3-0f)" + - context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. There are 2 trucks and 1 airplane, as well as 4 packages. There are 4 locations across 2 cities. The locations are in cities as follows: l0-1 and l0-0 are in c0; l1-1 and l1-0 are in c1. Currently, t0 is at l0-1, a0 is at l0-0, t1 and p1 are at l1-0, p2, p0, and p3 are in t1. The goal is to reach a state where the following facts hold: p3 is at l0-1, p2 is at l1-0, p1 is at l1-0, and p0 is at l0-0. 
The available actions are: (load-truck ?obj ?truck ?loc) - load object ?obj into truck ?truck at location ?loc, (load-airplane ?obj ?airplane ?loc) - load the object ?obj from location ?loc onto the airplane ?airplane, (unload-truck ?obj ?truck ?loc) - unload the object ?obj from the truck ?truck at location ?loc, (unload-airplane ?obj ?airplane ?loc) - unload object ?obj from airplane ?airplane at location ?loc, (drive-truck ?truck ?loc-from ?loc-to ?city) - drive the truck ?truck in city ?city from location ?loc-from to location ?loc-to, and (fly-airplane ?airplane ?loc-from ?loc-to) - operate the airplane ?airplane from airport ?loc-from to airport ?loc-to." + question: "What is the next action that takes us towards the goal?" + answer: "(drive-truck t0 l0-1 l0-0 c0)" +doc_to_text: "**Question**: {{context}} {{question}} Each action starts with an opening parenthesis and ends with closing parenthesis. Provide only the action. **Final Answer**:" +filter_list: + - name: "acp_grammar_parse" + filter: + - function: "ACP_grammar_filter" + grammar_task: "action_name" + - function: "take_first" diff --git a/lm_eval/tasks/acpbench/gen_2shot/prog.yaml b/lm_eval/tasks/acpbench/gen_2shot/prog.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6267f29acdd0c9767aa11277dadfa35d763bfeb1 --- /dev/null +++ b/lm_eval/tasks/acpbench/gen_2shot/prog.yaml @@ -0,0 +1,20 @@ +task: acp_prog_gen +dataset_name: acp_prog_gen +include: _gen_yaml_2shot +fewshot_config: + sampler: first_n + samples: + - context: "A robot is in a grid and can only move to places that are connected to its current position. \nThe grid size is 5x5, and the locations are of the form fi-jf (e.g., f3-2f or f0-1f). The grid cells are connected to their neighbors (e.g., f1-2f is connected to the four neighbors f0-2f, f2-2f, f1-1f, and f1-3f). Some positions on the grid are locked and can be opened with a key of a matching shape. 
The robot has an arm that can pick up a key when the key is in same location as the robot and the arm is empty. \nThere are 2 keys in 0 different shapes: Key key0-0 is of shape shape0, Key key0-1 is of shape shape0. \nCurrently, the robot is at position f0-1f and its arm is empty. All the positions are open except the following: f4-2f has shape0 shaped lock. Key key0-1 is at position f1-3f. Key key0-0 is at position f0-1f. The available propositions are: (at ?r ?x) - Key ?r is at ?x location, (at-robot ?x) - Robot is at ?x location, (locked ?x) - Location ?x is locked, (holding ?k) - Robot is holding ?k, (open ?x) - Location ?x is open, and (arm-empty) - Robot's arm is empty." + question: "Break down the outcomes of performing the action \"retrieve the key key0-0 from its current position f0-1f\" into two lists, positive effects and negative effects. Positive effects are the propositions that are false in the current state but will become true after performing the action. Negative effects are the propositions that are true in the current state and will become false after performing the action." + answer: "[(holding key0-0)] [(arm-empty), (at key0-0 f0-1f)]" + - context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. There are 2 trucks and 1 airplane, as well as 4 packages. There are 4 locations across 2 cities. The locations are in cities as follows: l1-1 and l1-0 are in c1; l0-1 and l0-0 are in c0. Currently, p2, t1, p1, p3, a0, and p0 are at l1-0, t0 is at l0-1. The available propositions are: (at ?obj ?loc) - ?obj is at ?loc and (in ?obj1 ?obj2) - ?obj1 is in ?obj2." + question: "Break down the outcomes of performing the action \"load object p3 into truck t1 at location l1-0\" into two lists, positive effects and negative effects. 
Positive effects are the propositions that are false in the current state but will become true after performing the action. Negative effects are the propositions that are true in the current state and will become false after performing the action." + answer: "[(in p3 t1)] [(at p3 l1-0)]" +doc_to_text: "**Question**: {{context}} {{question}} Provide only the two lists with the ground propositions. **Final Answer**:" +filter_list: + - name: "acp_grammar_parse" + filter: + - function: "ACP_grammar_filter" + grammar_task: "progression_list" + clean: "pos_neg" + - function: "take_first" diff --git a/lm_eval/tasks/acpbench/gen_2shot/reach.yaml b/lm_eval/tasks/acpbench/gen_2shot/reach.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c3a192fcc610a12093876275e3b5702c5aadbc36 --- /dev/null +++ b/lm_eval/tasks/acpbench/gen_2shot/reach.yaml @@ -0,0 +1,19 @@ +task: acp_reach_gen +dataset_name: acp_reach_gen +include: _gen_yaml_2shot +fewshot_config: + sampler: first_n + samples: + - context: "A robot is in a grid and can only move to places that are connected to its current position. The grid size is 5x5, and the locations are of the form fi-jf (e.g., f3-2f or f0-1f). The grid cells are connected to their neighbors (e.g., f1-2f is connected to the four neighbors f0-2f, f2-2f, f1-1f, and f1-3f). Some positions on the grid are locked and can be opened with a key of a matching shape. The robot has an arm that can pick up a key when the key is in same location as the robot and the arm is empty. There are 2 keys in 0 different shapes: Key key0-1 is of shape shape0, Key key0-0 is of shape shape0. Currently, the robot is at position f1-2f and its arm is empty. All the positions are open except the following: f4-2f has shape0 shaped lock. Key key0-0 is at position f1-0f. Key key0-1 is at position f1-3f. 
The available propositions are: (at ?r ?x) - Key ?r is at ?x location, (at-robot ?x) - Robot is at ?x location, (locked ?x) - Location ?x is locked, (holding ?k) - Robot is holding ?k, (open ?x) - Location ?x is open, and (arm-empty) - Robot's arm is empty." + question: "What proposition can never hold in any potentially reachable state?" + answer: "(locked f3-1f)" + - context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. There are 2 trucks and 1 airplane, as well as 4 packages. There are 4 locations across 2 cities. The locations are in cities as follows: l0-0 and l0-1 are in c0; l1-0 and l1-1 are in c1. Currently, a0, p2, and t1 are at l1-0, p3 and p0 are at l0-0, t0 is at l0-1, p1 is in t1. The available propositions are: (at ?obj ?loc) - ?obj is at ?loc and (in ?obj1 ?obj2) - ?obj1 is in ?obj2." + question: "What proposition can never hold in any potentially reachable state?" + answer: "(at t0 l1-1)" +doc_to_text: "**Question**: {{context}} {{question}} Provide one proposition or None. **Final Answer**:" +filter_list: + - name: "acp_grammar_parse" + filter: + - function: "ACP_grammar_filter" + grammar_task: "act" + - function: "take_first" diff --git a/lm_eval/tasks/acpbench/gen_2shot/val.yaml b/lm_eval/tasks/acpbench/gen_2shot/val.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5dc02acf6a8c4948305e8f827c99064ba6b440a5 --- /dev/null +++ b/lm_eval/tasks/acpbench/gen_2shot/val.yaml @@ -0,0 +1,19 @@ +task: acp_val_gen +dataset_name: acp_val_gen +include: _gen_yaml_2shot +fewshot_config: + sampler: first_n + samples: + - context: "A robot is in a grid and can only move to places that are connected to its current position. The grid size is 5x5, and the locations are of the form fi-jf (e.g., f3-2f or f0-1f). 
The grid cells are connected to their neighbors (e.g., f1-2f is connected to the four neighbors f0-2f, f2-2f, f1-1f, and f1-3f). Some positions on the grid are locked and can be opened with a key of a matching shape. The robot has an arm that can pick up a key when the key is in same location as the robot and the arm is empty. There are 2 keys in 1 different shapes: Key key0-0 is of shape shape0, Key key0-1 is of shape shape0. Currently, the robot is at position f3-3f and its arm is empty. All the positions are open except the following: f2-0f has shape0 shaped lock, f4-2f has shape0 shaped lock. Key key0-1 is at position f1-3f. Key key0-0 is at position f2-2f. The goal is to reach a state where the following facts hold: Key key0-0 is at f2-0f location and Key key0-1 is at f1-3f location. The available actions are: (unlock ?curpos ?lockpos ?key ?shape) - unlock the place ?lockpos with the key ?key of the shape ?shape from the current position place ?curpos, (move ?curpos ?nextpos) - travel from the current position ?curpos to the next position ?nextpos, (pickup ?curpos ?key) - pick up key ?key from place ?curpos, (pickup-and-loose ?curpos ?newkey ?oldkey) - pick up the key ?newkey from the current position ?curpos and loose the key ?oldkey which is being held, and (putdown ?curpos ?key) - put down key ?key at current position place ?curpos." + question: "What is the first inapplicable action in the next sequence of actions: [(move f3-3f f3-2f), (move f3-2f f2-2f), (pickup f2-2f key0-0), (pickup-and-loose f4-0f key0-0 key0-1), (unlock f2-1f f2-0f key0-0 shape0), (move f2-1f f2-0f), (putdown f2-0f key0-0), (move f2-0f f2-1f)]?" + answer: "3" + - context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. 
There are 2 trucks and 1 airplane, as well as 4 packages. There are 4 locations across 2 cities. The locations are in cities as follows: l0-1 and l0-0 are in c0; l1-1 and l1-0 are in c1. Currently, t1 and p0 are at l1-1, t0 is at l0-1, p3, p2, and p1 are at l1-0, a0 is at l0-0. The goal is to reach a state where the following facts hold: p2 is at l1-0, p3 is at l0-1, p0 is at l0-0, and p1 is at l1-0. The available actions are: (load-truck ?obj ?truck ?loc) - load object ?obj into truck ?truck at location ?loc, (load-airplane ?obj ?airplane ?loc) - load the object ?obj from location ?loc onto the airplane ?airplane, (unload-truck ?obj ?truck ?loc) - unload the object ?obj from the truck ?truck at location ?loc, (unload-airplane ?obj ?airplane ?loc) - unload object ?obj from airplane ?airplane at location ?loc, (drive-truck ?truck ?loc-from ?loc-to ?city) - navigate the truck ?truck from its current location ?loc-from in city ?city to the new location ?loc-to within the same city, and (fly-airplane ?airplane ?loc-from ?loc-to) - fly the airplane ?airplane from location ?loc-from to location ?loc-to." + question: "What is the first inapplicable action in the next sequence of actions: [(load-truck p0 t1 l1-1), (drive-truck t1 l1-1 l1-0 c1), (unload-truck p0 t1 l1-0), (fly-airplane a0 l0-0 l1-0), (unload-truck p3 t0 l0-1), (load-airplane p3 a0 l1-0), (fly-airplane a0 l1-0 l0-0), (unload-airplane p0 a0 l0-0), (unload-airplane p3 a0 l0-0), (drive-truck t0 l0-1 l0-0 c0), (load-truck p3 t0 l0-0), (drive-truck t0 l0-0 l0-1 c0), (unload-truck p3 t0 l0-1)]?" + answer: "4" +doc_to_text: "**Question**: {{context}} {{question}} Provide only the index of the action. 
**Final Answer**:" +filter_list: + - name: "acp_grammar_parse" + filter: + - function: "ACP_grammar_filter" + grammar_task: "index" + - function: "take_first" diff --git a/lm_eval/tasks/acpbench/gen_2shot_with_pddl/_gen_yaml_2shot b/lm_eval/tasks/acpbench/gen_2shot_with_pddl/_gen_yaml_2shot new file mode 100644 index 0000000000000000000000000000000000000000..710b360498e7cd8550de96b7d9c0a73d4303c8a8 --- /dev/null +++ b/lm_eval/tasks/acpbench/gen_2shot_with_pddl/_gen_yaml_2shot @@ -0,0 +1,26 @@ +tag: + - acp_gen_2shot_with_pddl + - acp_bench_hard_with_pddl +dataset_path: ibm-research/acp_bench +test_split: test +description: "Answer the question based on the provided PDDL domain and PDDL problem. The current state is the initial state described in the PDDL problem below.\n\n" +doc_to_target: "{{answer}}" +output_type: generate_until +num_fewshot: 2 +generation_kwargs: + until: + - "\n\n\n\n" + - "\n\n" + - "**Question**:" + - "**Question:**" + - "Q:" + do_sample: false + max_gen_toks: 1000 + temperature: 0.0 +metadata: + version: 1.0 +process_results: !function acp_utils.process_acp_results +metric_list: + - metric: "score" + aggregation: mean + higher_is_better: True diff --git a/lm_eval/tasks/acpbench/gen_2shot_with_pddl/acp_grammar.lark b/lm_eval/tasks/acpbench/gen_2shot_with_pddl/acp_grammar.lark new file mode 100644 index 0000000000000000000000000000000000000000..036bd675faacb044ca5bb2ef66b3dfac47943815 --- /dev/null +++ b/lm_eval/tasks/acpbench/gen_2shot_with_pddl/acp_grammar.lark @@ -0,0 +1,23 @@ +NAME: /[a-zA-Z][a-zA-Z0-9-_]*/ +LPAR : "(" +RPAR : ")" +LSPAR: "[" +RSPAR: "]" +COMMA: "," +WS: /[ \n]/ + +action_none : "None" + +action_name : LPAR NAME (WS NAME)* RPAR + +action_list : (action_name WS?)* + +prog_list : action_name* (COMMA action_name)* + +progression_list : LSPAR prog_list RSPAR LSPAR prog_list RSPAR + +act : action_name | action_none + +index: /[0-9]+[0-9]*/ + +start: action_list diff --git 
a/lm_eval/tasks/acpbench/gen_2shot_with_pddl/acp_utils.py b/lm_eval/tasks/acpbench/gen_2shot_with_pddl/acp_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5051b68cbf7b5ef384f2ec498f2759409383c7b7 --- /dev/null +++ b/lm_eval/tasks/acpbench/gen_2shot_with_pddl/acp_utils.py @@ -0,0 +1,1128 @@ +import json +import os +from abc import ABC, abstractmethod +from collections import defaultdict +from pathlib import Path + +from lm_eval.api.registry import register_filter +from lm_eval.filters.extraction import RegexFilter + + +try: + import tempfile + + import tarski + from kstar_planner import planners as kp + from lark import Lark + from lark.lexer import Token + from lark.visitors import Visitor + from pddl.core import Problem + from pddl.parser.domain import DomainParser + from pddl.parser.problem import ProblemParser + from tarski.grounding.common import StateVariableLite + from tarski.grounding.lp_grounding import LPGroundingStrategy + from tarski.io import PDDLReader + from tarski.io import fstrips as iofs + from tarski.syntax.formulas import is_atom + from tarski.syntax.transform.action_grounding import ( + ground_schema_into_plain_operator_from_grounding, + ) + from tarski.util import SymbolIndex +except ModuleNotFoundError: + raise ModuleNotFoundError( + "`lark>=1.1.9`, `tarski[clingo]==0.8.2`, `pddl==0.4.2` and `kstar-planner==1.4.2` are required for evaluating the generative tasks. 
\ +Please install via pip install lm-eval[acpbench] or pip install -e .[acpbench]", + ) + + +######################################################################### +# Grammar + + +GRAMMAR_FILE = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "acp_grammar.lark" +) + + +class ACPBench_Visitor(Visitor): + def __init__(self) -> None: + super().__init__() + self.action_lists = None + self.action_names = None + self.progression_lists = None + self.prog_lists = None + self.indexes = None + + def action_list(self, tree): + self.action_lists = [] + + def prog_list(self, tree): + if self.prog_lists is not None: + self.progression_lists.append(self.prog_lists) + self.prog_lists = [] + + def progression_list(self, tree): + self.progression_lists = [] + + def action_none(self, tree): + self.action_names = "None" + + def action_name(self, tree): + act_name = "(" + "".join(tree.children[1:-1]) + ")" + self.action_names = act_name + if self.action_lists is not None: + self.action_lists.append(act_name) + if self.prog_lists is not None: + self.prog_lists.append(act_name) + + def index(self, tree): + self.indexes = "".join(tree.children) + if not self.indexes.isnumeric(): + self.indexes = None + + +class ACPGrammarParser(object): + def __init__(self, task) -> None: + self.task = task + with open(GRAMMAR_FILE) as f: + grammar = f.read() + self.acp_parser = Lark(grammar, start=task, parser="lalr") + + def parse(self, input, debug=False): + def ignore_errors(e): + if hasattr(e, "token") and e.token.type == "$END": + for x in e.expected: + if x != "WS": + e.interactive_parser.feed_token( + Token(x, self.acp_parser.get_terminal(x).pattern.value) + ) + + return True + + input = input.replace("\n", "") + input = input.strip() + try: + tree = self.acp_parser.parse(input, on_error=ignore_errors) + + if debug: + print(tree) + visitor = ACPBench_Visitor() + visitor.visit_topdown(tree) + if self.task == "action_list": + return visitor.action_lists + elif self.task == "act": + 
return visitor.action_names + elif self.task == "action_name": + return visitor.action_names + elif self.task == "index": + return visitor.indexes + elif self.task == "progression_list": + if visitor.prog_lists not in visitor.progression_lists: + visitor.progression_lists.append(visitor.prog_lists) + return visitor.progression_lists + except Exception as e: + if debug: + print("exception") + print(e) + return None + + +############################################################################## +# Utils + + +# Used in next action +def is_on_optimal_plan(domain, problem, action, opt): + with ( + tempfile.NamedTemporaryFile() as domain_temp, + tempfile.NamedTemporaryFile() as problem_temp, + ): + with open(str(domain_temp.name), "w", encoding="utf8") as file: + file.write(domain.lower()) + with open(str(problem_temp.name), "w", encoding="utf8") as file: + file.write(problem.lower()) + + # Here, we need to keep the temp files live until the end of the function + try: + P = STRIPS(str(domain_temp.name), str(problem_temp.name)) + except Exception: + # Unsolvable + return False + + a = P.get_action_or_none(action[1:-1]) + if a is None: + return False + state = P.init + next_state = progress(state, a) + if opt is None: + # Get an optimal plan cost + plans = generate_optimal_plans_for_problem_state( + P, state, num_plans=1, timeout=5 + ) + opt = len(plans[0]["actions"]) + else: + opt = int(opt) + + # Getting an optimal plan for the next state + next_plans = generate_optimal_plans_for_problem_state( + P, next_state, num_plans=1, timeout=5 + ) + if next_plans is None: + return False + next_opt = len(next_plans[0]["actions"]) + return next_opt + 1 == opt + + +# Used in justification +def is_plan(domain, problem, new_plan): + P = get_STRIPS(domain, problem) + if P is None: + # Unsolvable + return False + + # Check if new_plan is a plan + current_state = P.init + for action in new_plan: + applicable_actions = P.get_applicable_actions(current_state) + app_actions_list = 
[f"({a.name.lower()})" for a in applicable_actions] + if action.lower() not in app_actions_list: + return False + a = applicable_actions[app_actions_list.index(action.lower())] + current_state = progress(current_state, a) + return entails(current_state, P.goal) + + +# Used in action reachability +def get_action_preconditions(domain, problem, action): + P = get_STRIPS(domain, problem) + + assert P is not None, f"Domain\n{domain}\nProblem\n{problem}\nAction: {action}" + a = P.get_action_or_none(action[1:-1]) + if a is None: + return a + + return [f"({f})" for f in a.pres] + + +def generate_optimal_plans_for_problem_state(P, state, num_plans, timeout): + import tempfile + + with ( + tempfile.NamedTemporaryFile() as domain_temp, + tempfile.NamedTemporaryFile() as problem_temp, + ): + create_tmp_dom_prob_replace_init(P, state, domain_temp, problem_temp) + plans = generate_top_q_plans( + domain=str(domain_temp.name), + problem=str(problem_temp.name), + num_plans=num_plans, + quality_bound=1.0, + timeout=timeout, + ) + # print(plans) + if plans is None or len(plans["plans"]) == 0: + return None + return plans["plans"] + + +def generate_top_q_plans(domain, problem, num_plans=10, quality_bound=1.0, timeout=30): + # print("Running K* planner") + plans = kp.plan_unordered_topq( + domain_file=Path(domain), + problem_file=Path(problem), + number_of_plans_bound=num_plans, + quality_bound=quality_bound, + timeout=timeout, + ) + return plans + + +# Used in (action) reachability +def is_unsolvable_new_goal(domain, problem, new_goal): + goal = extract_goal(problem) + new_problem = problem.replace(goal, f"(:goal {new_goal} )") + return is_unsolvable(domain, new_problem) + + +def is_unsolvable(domain, problem): + with ( + tempfile.NamedTemporaryFile() as domain_temp, + tempfile.NamedTemporaryFile() as problem_temp, + ): + with open(str(domain_temp.name), "w", encoding="utf8") as file: + file.write(str(domain)) + with open(str(problem_temp.name), "w", encoding="utf8") as file: + 
file.write(str(problem)) + + plans = kp.plan_unordered_topq( + domain_file=Path(str(domain_temp.name)), + problem_file=Path(str(problem_temp.name)), + quality_bound=1.0, + number_of_plans_bound=1, + timeout=3, + ) + + if len(plans["planner_error"]) > 0: + fl = plans["planner_error"].split("\n")[0] + print(f"Planner error: {fl}") + return False + if plans is None or len(plans["plans"]) == 0: + return plans["unsolvable"] + return False + + +def extract_goal(prob): + a = prob.split("(:goal")[1] + cp = 1 + for i, c in enumerate(a): + if c == ")": + cp -= 1 + if c == "(": + cp += 1 + if cp == 0: + return "(:goal" + a[: i + 1] + + assert False + + +def entails(state, partialstate): + return partialstate <= state + + +def progress(state, act): + assert entails(state, act.pres), ( + "Cannot progress with inconsistent state / action precondition:\n\t Action: " + + act.name + + "\n\t State: \n\t\t" + + "\n\t\t".join(state) + ) + return (state - act.dels) | act.adds + + +def regress(state, act): + assert len(state & act.dels) == 0, ( + "Cannot regress with inconsistent state / action delete effect:\n\t Action: " + + act.name + + "\n\t State: \n\t\t" + + "\n\t\t".join(state) + ) + return (state - act.adds) | act.pres + + +def get_STRIPS(domain, problem): + with ( + tempfile.NamedTemporaryFile() as domain_temp, + tempfile.NamedTemporaryFile() as problem_temp, + ): + with open(str(domain_temp.name), "w", encoding="utf8") as file: + file.write(domain.lower()) + with open(str(problem_temp.name), "w", encoding="utf8") as file: + file.write(problem.lower()) + + try: + P = STRIPS(str(domain_temp.name), str(problem_temp.name)) + return P + except Exception as e: + print(f"||{e}||") + return None + + +def create_tmp_dom_prob_replace_init(P, state, result_domain_file, result_problem_file): + d, p = P.PDDL_replace_init_pddl_parser(state) + with open(str(result_domain_file.name), "w", encoding="utf8") as file: + file.write(str(d)) + with open(str(result_problem_file.name), "w", 
encoding="utf8") as file: + file.write(str(p)) + + return d, p + + +def fix_name(s): + # (act param) + if "(" == s[0] and ")" == s[-1]: + return s[1:-1] + # make it space separated + s = s.replace(", ", " ").replace(",", " ") + # act(param) + if "(" in s: + assert ")" == s[-1], f"Broken name? {s}" + s = s.replace("(", " ").replace(")", "") + # act param + return s + + +def get_atoms_pddl(d, p, atoms): + objs = set() + preds = defaultdict(list) + for atom in atoms: + a = atom.lower().strip().split(" ") + args = a[1:] + preds[a[0]].append(args) + objs |= set(args) + + constants = [o for o in p.objects | d.constants if o.name.lower() in objs] + constants_dict = {} + for c in constants: + constants_dict[c.name.lower()] = c + assert len(objs) == len(constants), ( + f"Could not identify all objects: {objs - set(constants_dict.keys())} not found, {set(constants_dict.keys()) - objs} should not be there" + ) + + state = [] + covered_preds = set() + for f in d.predicates: + name = f.name.lower() + if name in preds: + covered_preds.add(name) + assert len(preds[name][0]) == f.arity, ( + f"The arity does not match: {preds[name]} vs {f.terms}" + ) + # Going over the lists of objects, adding ground predicate for each + for ob in preds[name]: + c = [constants_dict[o] for o in ob] + state.append(f(*c)) + assert len(covered_preds) == len(preds.keys()), ( + f"Covered predicates: \n{sorted(list(covered_preds))} vs \n{sorted(list(preds.keys()))}" + ) + return set(state) + + +class Action: + def __init__(self, name, pre, add, delete): + self.name = name + self.pres = pre + self.adds = add + self.dels = delete + + def __str__(self): + pres = "{" + ", ".join([f"({a})" for a in self.pres]) + "}" + adds = "{" + ", ".join([f"({a})" for a in self.adds]) + "}" + dels = "{" + ", ".join([f"({a})" for a in self.dels]) + "}" + + return f"< {self.name}, {pres}, {adds}, {dels} >" + + def toJSON(self): + return json.dumps( + { + "name": self.name, + "preconditions": [f"({a})" for a in self.pres], + 
"add_effects": [f"({a})" for a in self.adds], + "delete_effects": [f"({a})" for a in self.dels], + }, + sort_keys=True, + indent=4, + ) + + def __repr__(self): + return self.name + + def __eq__(self, action): + return self.name == action.name + + def __hash__(self): + return hash(self.name) + + +class STRIPS: + def __init__(self, domain, problem): + self.domain_file = domain + self.problem_file = problem + self.reader = PDDLReader(raise_on_error=True) + self.reader.parse_domain(domain) + self.problem = self.reader.parse_instance(problem) + (self.grounded_fluents, init, goal, self.operators, self.grounder) = ( + self.ground_problem(self.problem) + ) + + self.fluents = set([fix_name(str(f)) for f in self.grounded_fluents]) + self.fluents_map = dict() + for f in self.grounded_fluents: + self.fluents_map[fix_name(str(f))] = f + self.init = set([fix_name(str(f)) for f in init]) + self.goal = set([fix_name(str(f)) for f in goal]) + self.actions = set() + self.action_map = {} + self.init_fluents = [self.fluents_map[f] for f in self.init] + + self.static_predicates = [i.name for i in self.grounder.static_symbols] + for op in self.operators: + act = self.operator_to_action(op) + self.actions.add(act) + self.action_map[act.name.lower()] = act + + def __str__(self): + fluents = "P = {" + ", ".join([f"({a})" for a in self.fluents]) + "}" + init = "I = {" + ", ".join([f"({a})" for a in self.init]) + "}" + goal = "G = {" + ", ".join([f"({a})" for a in self.goal]) + "}" + actions = "A = {" + "\n ".join([a.__str__() for a in self.actions]) + "}" + return fluents + ",\n" + init + "\n" + goal + "\n" + actions + + def toJSON(self): + actions = [a.toJSON() for a in self.actions] + return json.dumps( + { + "fluents": list(self.fluents), + "initial_state": list(self.init), + "goal": list(self.goal), + "actions": actions, + }, + sort_keys=True, + indent=4, + ) + + def operator_to_action(self, op, check_fluents=True, check_static=False): + adds = { + fix_name(str(f.atom)) for f in 
op.effects if isinstance(f, iofs.AddEffect) + } & self.fluents + dels = { + fix_name(str(f.atom)) for f in op.effects if isinstance(f, iofs.DelEffect) + } & self.fluents + pre = self.fix_pre_name(op.precondition) + if check_fluents: + pre = pre & self.fluents + if check_static: + pre = {p for p in pre if p.split()[0] not in self.static_predicates} + act = Action(fix_name(str(op)), pre, adds, dels) + return act + + def fix_pre_name(self, precondition): + if not is_atom(precondition): + return {fix_name(str(f)) for f in precondition.subformulas} + return {fix_name(str(precondition))} + + def action(self, name): + return self.action_map[fix_name(name).lower()] + + def get_action_or_none(self, name): + if "(" in name and ")" != name[-1]: + return None + return self.action_map.get(fix_name(name).lower(), None) + + def fluent(self, name): + return fix_name(name) + + def static_symbols(self): + return list(self.grounder.static_symbols) + + def fluent_symbols(self): + return list(self.grounder.fluent_symbols) + + def get_grounded_atoms(self, symbol): + variables = SymbolIndex() + lang = symbol.language + key = "atom_" + symbol.name + model = self.grounder._solve_lp() + if ( + key in model + ): # in case there is no reachable ground state variable from that fluent symbol + for binding in model[key]: + binding_with_constants = tuple(lang.get(c) for c in binding) + variables.add(StateVariableLite(symbol, binding_with_constants)) + return variables + + def get_applicable_actions(self, s): + return [a for a in self.actions if entails(s, a.pres)] + + def ground_problem(self, problem): + grounder = LPGroundingStrategy(problem, include_variable_inequalities=True) + action_groundings = grounder.ground_actions() + operators = [] + for action_name, groundings in action_groundings.items(): + action = problem.get_action(action_name) + for grounding in groundings: + operators.append( + ground_schema_into_plain_operator_from_grounding(action, grounding) + ) + + grounded_fluents = set( + 
[ + grounded_fluent.to_atom() + for grounded_fluent in grounder.ground_state_variables().objects + ] + ) + init = [f for f in problem.init.as_atoms() if f in grounded_fluents] + if isinstance(problem.goal, tarski.syntax.Atom): + goal = [problem.goal] + else: + goal = [f for f in problem.goal.subformulas if f in grounded_fluents] + + return (grounded_fluents, init, goal, operators, grounder) + + def get_static(self): + static_symbols = self.static_symbols() + ret = [] + for symbol in static_symbols: + ret.extend(self.get_grounded_atoms(symbol)) + return set([fix_name(str(x)) for x in ret]) + + def PDDL_replace_init_pddl_parser(self, s): + d = DomainParser()(open(self.domain_file, "r").read().lower()) + p = ProblemParser()(open(self.problem_file, "r").read().lower()) + + new_state = get_atoms_pddl(d, p, s | self.get_static()) + + new_p = Problem( + p.name, domain=d, objects=p.objects, init=new_state, goal=p.goal + ) + + return d, new_p + + +def parse_ans(response: str, parser: ACPGrammarParser, task: str): + return [parser.parse(clean_answer(resp, task)) for resp in response] + + +# def parse_ans(response : str, parser : ACPGrammarParser, task : str): +# ans = [parser.parse(clean_answer(resp, task), debug=True) for resp in response] +# if any(elem is None for elem in ans) or any(elem is None for elem in ans[0]): +# return None +# return ans + + +def remove_garbage(s): + while True: + if s.endswith("."): + s = s[:-1] + elif s.endswith("\n"): + s = s[:-2] + else: + break + return s.rstrip() + + +def compare_str(s1, s2): + return remove_garbage(s1).lower() == remove_garbage(s2).lower() + + +def compare(l1, l2): + if not isinstance(l1, list): + return compare_str(l1, l2) + if not isinstance(l2, list): + return False + for i, v in enumerate(l1): + if not compare(v, l2[i]): + return False + return True + + +def check_prog_response(resp): + if ( + "Positive Effects".lower() in resp.lower() + and "Negative Effects".lower() in resp.lower() + ): + if "[" not in resp: + return 
True + return False + + +def clean_answer(resp, task): + # Minor cleanup + if "progression_gen" in task: + # Check for Positive Effects and Negative Effects instead of separation + if check_prog_response(resp): + # replace **Positive Effects** with "[" + # replace **Negative Effects** with "] [" + # append "]" to the end + resp2 = resp.lower() + resp2 = resp2.replace("*", "") + resp2 = resp2.replace("positive effects", "[") + resp2 = resp2.replace("negative effects", "] [") + resp2 = resp2 + "]" + return resp2 + if "action_justification_gen" in task: + # Check for "simplified plan:" + if "simplified plan:" in resp.lower(): + resp2 = resp.lower() + resp2 = resp2.replace("*", "") + resp2 = resp2.split("simplified plan:")[1] + return resp2 + return resp + + +def get_grammar_task(task): + # print(task) + if task == "reachable_atom_gen": + return "act" + elif task == "progression_gen": + return "progression_list" + elif task == "validation_gen": + return "index" + elif task == "reachable_action_gen": + return "act" + elif task == "action_justification_gen": + return "action_list" + elif task == "landmarks_gen": + return "act" + elif task == "goal_closer_gen": + return "action_name" + elif task == "applicable_actions_gen": + return "action_list" + + +############################################################################## +# Evaluators + + +def fix_action_name(a): + assert a.startswith("(") and a.endswith(")") + return "(" + " ".join([x.strip() for x in a[1:-1].split(" ") if len(x) > 0]) + ")" + + +def str_remove_before_first_parentheses(s): + if s.startswith("("): + return s + try: + return s[s.index("(") :] + except Exception: + return "" + + +def str_remove_after_last_parentheses(s): + if s.endswith(")"): + return s + + i = s.rfind(")") + + if i == -1: + return "" + return s[: i + 1] + + +def cleanup_answer(ans): + if isinstance(ans, str): + ans = str_remove_before_first_parentheses(ans) + ans = str_remove_after_last_parentheses(ans) + ans = ans.lower() + ans = 
( + ans.replace(")\n(", ")######(") + .replace("),(", ")######(") + .replace(") (", ")######(") + .split("######") + ) + return ans + if isinstance(ans, list): + res = [] + for x in ans: + res.extend(cleanup_answer(x)) + return res + + +def set_equal(ans1, ans2): + return set(ans1) == set(ans2) + + +class BaseEvaluator(ABC): + def __init__(self) -> None: + self.scores = [] + + @abstractmethod + def get_score(self, ans, doc): + pass + + def add_scores(self, scores): + self.scores.extend(scores) + + def get_avg_score(self): + avg_score = sum(self.scores) / len(self.scores) + return avg_score + + +def get_evaluator(group): + if group == "applicable_actions_gen": + return ApplicabilityEvaluator() + elif group == "progression_gen": + return ProgressionEvaluator() + elif group == "validation_gen": + return ValidationEvaluator() + elif group == "reachable_atom_gen": + return ReachabilityEvaluator() + elif group == "goal_closer_gen": + return NextActionEvaluator() + elif group == "action_justification_gen": + return JustificationEvaluator() + elif group == "landmarks_gen": + return LandmarksEvaluator() + elif group == "reachable_action_gen": + return ActionReachabilityEvaluator() + assert True, f"Group {group} not found" + + +""" +Action Reachability task: generate a valid action that is not applicable to any reachable state. +answer: A subset of actions that are known to be unreachable (not an exhaustive set). + It is empty only when we *know* that there are no such actions. 
+""" + + +class ActionReachabilityEvaluator(BaseEvaluator): + def get_score(self, ans, doc): + real_answer = doc["answer"] + if not real_answer or len(real_answer) == 0: + # The correct answer is None + self.add_scores( + ["none" == x.strip().lower() if x is not None else False for x in ans] + ) + else: + for x in ans: + if x is None: + self.scores.append(False) + continue + action = x.strip().lower() + if action in real_answer: + # The answer is in the subset of stored correct answers + self.scores.append(True) + continue + prec = get_action_preconditions( + doc["PDDL_domain"].lower(), doc["PDDL_problem"].lower(), action + ) + if prec is None: + # The answer does not correspond to a valid action + self.scores.append(False) + else: + # Need to run a planner on a task with the answer action preconditions as the new goal + prec = f"(and {' '.join(prec)})" + self.scores.append( + is_unsolvable_new_goal( + doc["PDDL_domain"].lower(), + doc["PDDL_problem"].lower(), + prec, + ) + ) + + return self.get_avg_score() + + +""" +Action Applicability task: generate all actions that are applicable in the current state. +answer: A set of all applicable actions. 
+""" + + +class ApplicabilityEvaluator(BaseEvaluator): + def get_score(self, ans, doc): + real_answer = doc["answer"] + real_answer = [a.lower() for a in real_answer] + ans = [[fix_action_name(a) for a in x] if x is not None else None for x in ans] + + # Check if the answer is equal (as a set) to the real stored answer + self.add_scores( + [ + set_equal(real_answer, cleanup_answer(x)) if x is not None else False + for x in ans + ] + ) + return self.get_avg_score() + + +def is_subsequence(plan, new_plan): + i = 0 + for a in plan: + if a == new_plan[i]: + i += 1 + if len(new_plan) == i: + # Done + return True + return False + + +def is_subsequence_and_plan(domain, problem, plan, new_plan): + if len(plan) <= len(new_plan): + return False + if not is_subsequence(plan, new_plan): + return False + return is_plan(domain, problem, new_plan) + + +""" +Justification task: generate a proper subsequence of the given plan that is also a plan. +answer: A list of examples of actions that can be removed (ignored in evaluation). +""" + + +class JustificationEvaluator(BaseEvaluator): + def get_score(self, ans, doc): + # Sequence of actions (plan) from the question + if "inputs" in doc: # old field name + seq = doc["inputs"][19:-147] + else: + seq = doc["question"][19:-147] + seq = seq.replace(") (", ")######(").split("######") + for x in ans: + if x is None: + self.scores.append(False) + continue + # An answer plan candidate + x = [fix_action_name(a) for a in x] + if len(x) == 0: + # Wrong answer - never an empty sequence + self.scores.append(0) + continue + # Check if the plan candidate from the answer (a) is a proper subsequence of the plan in the question and (b) is a plan. + self.scores.append( + is_subsequence_and_plan( + doc["PDDL_domain"].lower(), doc["PDDL_problem"].lower(), seq, x + ) + ) + return self.get_avg_score() + + +""" +Landmarks task: generate a fact that is a non-trivial landmark for the current state. 
+answer: A list of facts that are found to be landmarks and a list of facts that are found to be non-landmarks. + +The questions are generated only for cases where all facts either + (a) hold in the current state, + (b) true in goal, + (c) are found to be landmarks, or + (d) are found to be non-landmarks. +In such cases, the evaluation is simple, it does not require checking whether a fact is a landmark, it was +already done during question generation. +""" + + +class LandmarksEvaluator(BaseEvaluator): + def get_score(self, ans, doc): + # The set of facts that are found to be landmarks + real_answer = doc["answer"] + real_answer_yes = [a.lower() for a in real_answer["yes"]] + + for x in ans: + if x is None: + self.scores.append(False) + continue + if x.strip().lower() in real_answer_yes: + # The answer fact is known to be landmark + self.scores.append(True) + elif x.strip().lower() == "none": + # The answer is none, correct only if there are no known landmarks, + # since we only generate questions when that means that there are no non-trivial landmarks + self.scores.append(len(real_answer_yes) == 0) + else: + # All other cases the answer is incorrect + self.scores.append(False) + + return self.get_avg_score() + + +""" +Next Action task: generate an action that takes us closer to the goal. +answer: + (a) A list of applicable actions that are known to be correct answers + (b) A list of applicable actions that are known to be incorrect answers + (c) The rest of the applicable actions (maybe). 
+""" + + +class NextActionEvaluator(BaseEvaluator): + def get_score(self, ans, doc): + real_answer = doc["answer"] + real_answer_yes = [a.lower() for a in real_answer["yes"]] + real_answer_no = [a.lower() for a in real_answer["no"]] + real_answer_maybe = [a.lower() for a in real_answer["maybe"]] + # The cost of the optimal plan from the current state + opt = real_answer.get("opt", None) + for x in ans: + if x is None: + self.scores.append(False) + continue + action = x.strip().lower() + if action in real_answer_yes: + # Known to be correct + self.scores.append(True) + elif action in real_answer_no: + # Known to be incorrect + self.scores.append(False) + elif action not in real_answer_maybe: + # Not applicable, must be incorrect + self.scores.append(False) + else: + # Unknown, need to run a planner to check whether the state that results from applying the action is closer to the goal + # meaning has smaller optimal plan cost. + self.scores.append( + is_on_optimal_plan( + doc["PDDL_domain"].lower(), + doc["PDDL_problem"].lower(), + action, + opt, + ) + ) + + return self.get_avg_score() + + +""" +Progression task: generate the positive and negative effects of an action in the current state. +answer: + (a) A list of facts that were false and become true, when the action is applied + (b) A list of facts that were true and become false, when the action is applied +""" + + +class ProgressionEvaluator(BaseEvaluator): + def get_score(self, ans, doc): + real_answer = doc["answer"] + real_answer_pos = [a.lower() for a in real_answer["pos"]] + real_answer_neg = [a.lower() for a in real_answer["neg"]] + + for x in ans: + # The answer should be two lists. We allow for a single list and assume that the second one is empty (relaxed evaluation). 
+ if x is None or len(x) > 2 or len(x) < 1: + self.scores.append(False) + else: + p = cleanup_answer(x[0]) + if len(x) == 2: + n = cleanup_answer(x[1]) + else: + # Assuming the last element is dropped because it is empty + n = [] + # Check if the answer is equal as sets to the correct answers. + ans = [set_equal(real_answer_pos, p), set_equal(real_answer_neg, n)] + self.scores.append(all(ans)) + + return self.get_avg_score() + + +""" +Reachability task: generate a valid fact that will never become true in any reachable state. +answer: A subset of facts that are known to be unreachable (not an exhaustive set). + It is empty only when we *know* that there are no such facts. +""" + + +class ReachabilityEvaluator(BaseEvaluator): + def get_score(self, ans, doc): + real_answer = doc["answer"] + real_answer = [f"({x.strip().lower()})" for x in real_answer] + + if len(real_answer) == 0: + # The correct answer is None + self.add_scores( + ["none" == x.strip().lower() if x is not None else False for x in ans] + ) + else: + for x in ans: + if x is None: + self.scores.append(False) + elif x.strip().lower() in real_answer: + # The answer is in the subset of stored correct answers + self.scores.append(True) + else: + # Need to run a planner on a task with the answer fact as the new goal + atom = x.strip().lower() + self.scores.append( + is_unsolvable_new_goal( + doc["PDDL_domain"].lower(), + doc["PDDL_problem"].lower(), + atom, + ) + ) + + return self.get_avg_score() + + +""" +Validation task: generate an index of the first inapplicable action in the given sequence. +answer: the correct index. 
+""" + + +class ValidationEvaluator(BaseEvaluator): + def get_score(self, ans, doc): + real_answer = str(doc["answer"]) + assert int(real_answer) >= 0, ( + f"The index must be non-negative, received {real_answer}" + ) + # Exact match + self.add_scores( + [ + real_answer.lower() == x.strip().lower() if x is not None else False + for x in ans + ] + ) + + return self.get_avg_score() + + +############################################################################## + + +def dump_item(item, **kwargs): + return json.dumps(item) + + +def parse_prediction(prediction): + try: + ans = json.loads(prediction.strip()) + response = ans.get("answer", None) + return response + except Exception as e: + print(f"Exception occurred {e}") + return prediction + + +@register_filter("ACP_grammar_filter") +class ACPGrammarFilter(RegexFilter): + """Filtering Index using""" + + def __init__(self, *args, **kwargs): + self.parser = ACPGrammarParser(kwargs["grammar_task"]) + self.clean = kwargs["clean"] if "clean" in kwargs else None + + def clean_pos_neg(self, resp): + # Check for Positive Effects and Negative Effects instead of separation + if check_prog_response(resp): + resp2 = resp.lower() + resp2 = resp2.replace("*", "") + resp2 = resp2.replace("positive effects", "[") + resp2 = resp2.replace("negative effects", "] [") + resp2 = resp2 + "]" + return resp2 + return resp + + def clean_simplified_plan(self, resp): + # Check for "simplified plan:" + if "simplified plan:" in resp.lower(): + resp2 = resp.lower() + resp2 = resp2.replace("*", "") + resp2 = resp2.split("simplified plan:")[1] + return resp2 + return resp + + def apply(self, resps, docs): + if self.clean == "pos_neg": + filtered_resps = [ + [self.parser.parse(self.clean_pos_neg(r)) for r in resp] + for resp in resps + ] + elif self.clean == "simplified plan": + filtered_resps = [ + [self.parser.parse(self.clean_simplified_plan(r)) for r in resp] + for resp in resps + ] + else: + filtered_resps = [[self.parser.parse(r) for r in 
resp] for resp in resps] + return filtered_resps + + +def process_acp_results(doc, results): + return {"score": get_evaluator(doc["group"]).get_score(results, doc)} + + +def get_score(references, predictions, **kwargs): + # print(f"References: {references}") + # print(f"Predictions: {predictions}") + data = json.loads(references[0].strip()) + real_ans = data["answer"] + task = data["group"] + + responses = [parse_prediction(prediction) for prediction in predictions] + + print(f"Real answer: {real_ans}") + print(f"Model answers: {responses}") + parser = ACPGrammarParser(get_grammar_task(task)) + ans = parse_ans(responses, parser, task) + + print(f"Parsed model answers: {ans}") + score = get_evaluator(task).get_score(ans, data) + + return {"get_score": score} diff --git a/lm_eval/tasks/acpbench/gen_2shot_with_pddl/act_reach.yaml b/lm_eval/tasks/acpbench/gen_2shot_with_pddl/act_reach.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b05d7223d1f5b140a71e8ead4a28a8d26892fc16 --- /dev/null +++ b/lm_eval/tasks/acpbench/gen_2shot_with_pddl/act_reach.yaml @@ -0,0 +1,23 @@ +task: acp_areach_gen_with_pddl +dataset_name: acp_areach_gen +include: _gen_yaml_2shot +fewshot_config: + sampler: first_n + samples: + - context: "A robot is in a grid and can only move to places that are connected to its current position. \nThe grid size is 5x5, and the locations are of the form fi-jf (e.g., f3-2f or f0-1f). The grid cells are connected to their neighbors (e.g., f1-2f is connected to the four neighbors f0-2f, f2-2f, f1-1f, and f1-3f). Some positions on the grid are locked and can be opened with a key of a matching shape. The robot has an arm that can pick up a key when the key is in same location as the robot and the arm is empty. \nThere are 2 keys in 0 different shapes: Key key0-1 is of shape shape0, Key key0-0 is of shape shape0. \nCurrently, the robot is at position f3-2f and its arm is empty. 
All the positions are open except the following: f2-0f has shape0 shaped lock, f4-2f has shape0 shaped lock. Key key0-1 is at position f1-3f. Key key0-0 is at position f2-2f. The available actions are: (unlock ?curpos ?lockpos ?key ?shape) - unlock the place ?lockpos with key ?key of shape ?shape from the current position place ?curpos, (move ?curpos ?nextpos) - move to place ?nextpos from place ?curpos, (pickup ?curpos ?key) - acquire the key ?key from the place ?curpos, (pickup-and-loose ?curpos ?newkey ?oldkey) - pick up key ?newkey at current position place ?curpos and loose key ?oldkey being held, and (putdown ?curpos ?key) - put down key ?key at current position place ?curpos." + question: "What action can never become applicable, in any state reachable from the current state?" + answer: "(pickup-and-loose f0-1f key0-0 key0-0)" + PDDL_domain: "(define (domain grid)\n (:requirements :strips :typing)\n (:types key place shape - object)\n (:predicates (arm-empty) (at ?r - key ?x - place) (at-robot ?x - place) (conn ?x - place ?y - place) (holding ?k - key) (key-shape ?k - key ?s - shape) (lock-shape ?x - place ?s - shape) (locked ?x - place) (open ?x - place))\n (:action move\n :parameters (?curpos - place ?nextpos - place)\n :precondition (and (at-robot ?curpos) (conn ?curpos ?nextpos) (open ?nextpos))\n :effect (and (at-robot ?nextpos) (not (at-robot ?curpos)))\n )\n (:action pickup\n :parameters (?curpos - place ?key - key)\n :precondition (and (at-robot ?curpos) (at ?key ?curpos) (arm-empty))\n :effect (and (holding ?key) (not (at ?key ?curpos)) (not (arm-empty)))\n )\n (:action pickup-and-loose\n :parameters (?curpos - place ?newkey - key ?oldkey - key)\n :precondition (and (at-robot ?curpos) (holding ?oldkey) (at ?newkey ?curpos))\n :effect (and (holding ?newkey) (at ?oldkey ?curpos) (not (holding ?oldkey)) (not (at ?newkey ?curpos)))\n )\n (:action putdown\n :parameters (?curpos - place ?key - key)\n :precondition (and (at-robot ?curpos) (holding ?key))\n 
:effect (and (arm-empty) (at ?key ?curpos) (not (holding ?key)))\n )\n (:action unlock\n :parameters (?curpos - place ?lockpos - place ?key - key ?shape - shape)\n :precondition (and (conn ?curpos ?lockpos) (key-shape ?key ?shape) (lock-shape ?lockpos ?shape) (at-robot ?curpos) (locked ?lockpos) (holding ?key))\n :effect (and (open ?lockpos) (not (locked ?lockpos)))\n )\n)" + PDDL_problem: "(define (problem grid-x5-y5-t1-k2-l2-p100)\n (:domain grid)\n (:requirements :strips :typing)\n (:objects key0-0 key0-1 - key f0-0f f0-1f f0-2f f0-3f f0-4f f1-0f f1-1f f1-2f f1-3f f1-4f f2-0f f2-1f f2-2f f2-3f f2-4f f3-0f f3-1f f3-2f f3-3f f3-4f f4-0f f4-1f f4-2f f4-3f f4-4f - place shape0 - shape)\n (:init (arm-empty) (at key0-0 f2-2f) (at key0-1 f1-3f) (at-robot f3-2f) (conn f0-0f f0-1f) (conn f0-0f f1-0f) (conn f0-1f f0-0f) (conn f0-1f f0-2f) (conn f0-1f f1-1f) (conn f0-2f f0-1f) (conn f0-2f f0-3f) (conn f0-2f f1-2f) (conn f0-3f f0-2f) (conn f0-3f f0-4f) (conn f0-3f f1-3f) (conn f0-4f f0-3f) (conn f0-4f f1-4f) (conn f1-0f f0-0f) (conn f1-0f f1-1f) (conn f1-0f f2-0f) (conn f1-1f f0-1f) (conn f1-1f f1-0f) (conn f1-1f f1-2f) (conn f1-1f f2-1f) (conn f1-2f f0-2f) (conn f1-2f f1-1f) (conn f1-2f f1-3f) (conn f1-2f f2-2f) (conn f1-3f f0-3f) (conn f1-3f f1-2f) (conn f1-3f f1-4f) (conn f1-3f f2-3f) (conn f1-4f f0-4f) (conn f1-4f f1-3f) (conn f1-4f f2-4f) (conn f2-0f f1-0f) (conn f2-0f f2-1f) (conn f2-0f f3-0f) (conn f2-1f f1-1f) (conn f2-1f f2-0f) (conn f2-1f f2-2f) (conn f2-1f f3-1f) (conn f2-2f f1-2f) (conn f2-2f f2-1f) (conn f2-2f f2-3f) (conn f2-2f f3-2f) (conn f2-3f f1-3f) (conn f2-3f f2-2f) (conn f2-3f f2-4f) (conn f2-3f f3-3f) (conn f2-4f f1-4f) (conn f2-4f f2-3f) (conn f2-4f f3-4f) (conn f3-0f f2-0f) (conn f3-0f f3-1f) (conn f3-0f f4-0f) (conn f3-1f f2-1f) (conn f3-1f f3-0f) (conn f3-1f f3-2f) (conn f3-1f f4-1f) (conn f3-2f f2-2f) (conn f3-2f f3-1f) (conn f3-2f f3-3f) (conn f3-2f f4-2f) (conn f3-3f f2-3f) (conn f3-3f f3-2f) (conn f3-3f f3-4f) (conn f3-3f f4-3f) (conn f3-4f 
f2-4f) (conn f3-4f f3-3f) (conn f3-4f f4-4f) (conn f4-0f f3-0f) (conn f4-0f f4-1f) (conn f4-1f f3-1f) (conn f4-1f f4-0f) (conn f4-1f f4-2f) (conn f4-2f f3-2f) (conn f4-2f f4-1f) (conn f4-2f f4-3f) (conn f4-3f f3-3f) (conn f4-3f f4-2f) (conn f4-3f f4-4f) (conn f4-4f f3-4f) (conn f4-4f f4-3f) (key-shape key0-0 shape0) (key-shape key0-1 shape0) (lock-shape f2-0f shape0) (lock-shape f4-2f shape0) (locked f2-0f) (locked f4-2f) (open f0-0f) (open f0-1f) (open f0-2f) (open f0-3f) (open f0-4f) (open f1-0f) (open f1-1f) (open f1-2f) (open f1-3f) (open f1-4f) (open f2-1f) (open f2-2f) (open f2-3f) (open f2-4f) (open f3-0f) (open f3-1f) (open f3-2f) (open f3-3f) (open f3-4f) (open f4-0f) (open f4-1f) (open f4-3f) (open f4-4f))\n (:goal (and (at key0-0 f2-0f) (at key0-1 f1-3f)))\n)" + - context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. \nThere are 2 trucks and 1 airplane, as well as 4 packages. There are 4 locations across 2 cities. \nThe locations are in cities as follows: l1-0 and l1-1 are in c1; l0-0 and l0-1 are in c0. \nCurrently, a0, p1, and p2 are at l1-0, t0 is at l0-1, p3 and p0 are at l0-0, t1 is at l1-1. 
The available actions are: (load-truck ?obj ?truck ?loc) - place the object ?obj into the truck ?truck at location ?loc, (load-airplane ?obj ?airplane ?loc) - load the object ?obj from location ?loc into the airplane ?airplane, (unload-truck ?obj ?truck ?loc) - unload object ?obj from truck ?truck at location ?loc, (unload-airplane ?obj ?airplane ?loc) - unload object ?obj from airplane ?airplane at location ?loc, (drive-truck ?truck ?loc-from ?loc-to ?city) - navigate the truck ?truck from its current location ?loc-from in city ?city to the new location ?loc-to within the same city, and (fly-airplane ?airplane ?loc-from ?loc-to) - fly airplane ?airplane from airport ?loc-from to airport ?loc-to." + question: "What action can never become applicable, in any state reachable from the current state?" + answer: "(load-truck p2 t0 l1-1)" + PDDL_domain: "(define (domain logistics-strips)\n (:requirements :strips :typing) \n\n (:types \n location locatable city - object \n package movable - locatable\n airport - location\n airplane truck - movable \n )\t\t\n \n (:predicates \t\n\t\t(at ?obj - locatable ?loc - location)\n\t\t(in ?obj1 - package ?obj2 - movable)\n\t\t(in-city ?obj - location ?city - city))\n\n\n(:action LOAD-TRUCK\n :parameters\n (?obj - package\n ?truck - truck\n ?loc - location)\n :precondition\n (and \n (at ?truck ?loc) (at ?obj ?loc))\n :effect\n (and (not (at ?obj ?loc)) (in ?obj ?truck)))\n\n(:action LOAD-AIRPLANE\n :parameters\n (?obj - package\n ?airplane - airplane\n ?loc - location)\n :precondition\n (and \n (at ?obj ?loc) (at ?airplane ?loc))\n :effect\n (and (not (at ?obj ?loc)) (in ?obj ?airplane)))\n\n\n\n(:action UNLOAD-TRUCK\n :parameters\n (?obj - package\n ?truck - truck\n ?loc - location)\n :precondition\n (and \n (at ?truck ?loc) (in ?obj ?truck))\n :effect\n (and (not (in ?obj ?truck)) (at ?obj ?loc)))\n\n(:action UNLOAD-AIRPLANE\n :parameters\n (?obj - package\n ?airplane - airplane\n ?loc - location)\n :precondition\n (and \n (in 
?obj ?airplane) (at ?airplane ?loc))\n :effect\n (and (not (in ?obj ?airplane)) (at ?obj ?loc)))\n\n(:action DRIVE-TRUCK\n :parameters\n (?truck - truck\n ?loc-from - location\n ?loc-to - location\n ?city - city)\n :precondition\n (and \n (at ?truck ?loc-from)\n (in-city ?loc-from ?city)\n (in-city ?loc-to ?city))\n :effect\n (and (not (at ?truck ?loc-from)) (at ?truck ?loc-to)))\n\n(:action FLY-AIRPLANE\n :parameters\n (?airplane - airplane\n ?loc-from - airport\n ?loc-to - airport)\n :precondition\n (and \n\t(at ?airplane ?loc-from))\n :effect\n (and (not (at ?airplane ?loc-from)) (at ?airplane ?loc-to)))\n)" + PDDL_problem: "(define (problem logistics-c2-s2-p4-a1)\n (:domain logistics-strips)\n (:requirements :strips :typing)\n (:objects a0 - airplane l0-0 l1-0 - airport c0 c1 - city l0-1 l1-1 - location p0 p1 p2 p3 - package t0 t1 - truck)\n (:init (at a0 l1-0) (at p0 l0-0) (at p1 l1-0) (at p2 l1-0) (at p3 l0-0) (at t0 l0-1) (at t1 l1-1) (in-city l0-0 c0) (in-city l0-1 c0) (in-city l1-0 c1) (in-city l1-1 c1))\n (:goal (and (at p0 l0-0) (at p1 l1-0) (at p2 l1-0) (at p3 l0-1)))\n)" +doc_to_text: "# PDDL DOMAIN \n\n```\n{{PDDL_domain}}\n```\n\n# PDDL PROBLEM \n\n```\n{{PDDL_problem}}\n```\n\n**Question**: {{context}} {{question}} Each action starts with an opening parenthesis and ends with closing parenthesis. Provide one action or None. 
**Final Answer**:" +filter_list: + - name: "acp_grammar_parse" + filter: + - function: "ACP_grammar_filter" + grammar_task: "act" + - function: "take_first" diff --git a/lm_eval/tasks/acpbench/gen_2shot_with_pddl/app.yaml b/lm_eval/tasks/acpbench/gen_2shot_with_pddl/app.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c212924c04222f24918161e9fb46587705bd520d --- /dev/null +++ b/lm_eval/tasks/acpbench/gen_2shot_with_pddl/app.yaml @@ -0,0 +1,23 @@ +task: acp_app_gen_with_pddl +dataset_name: acp_app_gen +include: _gen_yaml_2shot +fewshot_config: + sampler: first_n + samples: + - context: "A robot is in a grid and can only move to places that are connected to its current position. \nThe grid size is 5x5, and the locations are of the form fi-jf (e.g., f3-2f or f0-1f). The grid cells are connected to their neighbors (e.g., f1-2f is connected to the four neighbors f0-2f, f2-2f, f1-1f, and f1-3f). Some positions on the grid are locked and can be opened with a key of a matching shape. The robot has an arm that can pick up a key when the key is in same location as the robot and the arm is empty. \nThere are 2 keys in 0 different shapes: Key key0-0 is of shape shape0, Key key0-1 is of shape shape0. \nCurrently, the robot is at position f4-3f and its arm is empty. All the positions are open except the following: f2-0f has shape0 shaped lock, f4-2f has shape0 shaped lock. Key key0-0 is at position f3-1f. Key key0-1 is at position f1-3f. 
The available actions are: (unlock ?curpos ?lockpos ?key ?shape) - use the key ?key of shape ?shape to unlock the place ?lockpos from the current position ?curpos, (move ?curpos ?nextpos) - transition from the current position ?curpos to the next position ?nextpos, (pickup ?curpos ?key) - pick up key ?key from place ?curpos, (pickup-and-loose ?curpos ?newkey ?oldkey) - pick up the key ?newkey from the current position ?curpos and loose the key ?oldkey which is being held, and (putdown ?curpos ?key) - place the key ?key at the current position ?curpos." + question: "Generate the list of all ground actions that are applicable in this state." + answer: "[(move f4-3f f3-3f), (move f4-3f f4-4f)]" + PDDL_domain: "(define (domain grid)\n (:requirements :strips :typing)\n (:types key place shape - object)\n (:predicates (arm-empty) (at ?r - key ?x - place) (at-robot ?x - place) (conn ?x - place ?y - place) (holding ?k - key) (key-shape ?k - key ?s - shape) (lock-shape ?x - place ?s - shape) (locked ?x - place) (open ?x - place))\n (:action move\n :parameters (?curpos - place ?nextpos - place)\n :precondition (and (at-robot ?curpos) (conn ?curpos ?nextpos) (open ?nextpos))\n :effect (and (at-robot ?nextpos) (not (at-robot ?curpos)))\n )\n (:action pickup\n :parameters (?curpos - place ?key - key)\n :precondition (and (at-robot ?curpos) (at ?key ?curpos) (arm-empty))\n :effect (and (holding ?key) (not (at ?key ?curpos)) (not (arm-empty)))\n )\n (:action pickup-and-loose\n :parameters (?curpos - place ?newkey - key ?oldkey - key)\n :precondition (and (at-robot ?curpos) (holding ?oldkey) (at ?newkey ?curpos))\n :effect (and (holding ?newkey) (at ?oldkey ?curpos) (not (holding ?oldkey)) (not (at ?newkey ?curpos)))\n )\n (:action putdown\n :parameters (?curpos - place ?key - key)\n :precondition (and (at-robot ?curpos) (holding ?key))\n :effect (and (arm-empty) (at ?key ?curpos) (not (holding ?key)))\n )\n (:action unlock\n :parameters (?curpos - place ?lockpos - place ?key - 
key ?shape - shape)\n :precondition (and (conn ?curpos ?lockpos) (key-shape ?key ?shape) (lock-shape ?lockpos ?shape) (at-robot ?curpos) (locked ?lockpos) (holding ?key))\n :effect (and (open ?lockpos) (not (locked ?lockpos)))\n )\n)" + PDDL_problem: "(define (problem grid-x5-y5-t1-k2-l2-p100)\n (:domain grid)\n (:requirements :strips :typing)\n (:objects key0-0 key0-1 - key f0-0f f0-1f f0-2f f0-3f f0-4f f1-0f f1-1f f1-2f f1-3f f1-4f f2-0f f2-1f f2-2f f2-3f f2-4f f3-0f f3-1f f3-2f f3-3f f3-4f f4-0f f4-1f f4-2f f4-3f f4-4f - place shape0 - shape)\n (:init (arm-empty) (at key0-0 f3-1f) (at key0-1 f1-3f) (at-robot f4-3f) (conn f0-0f f0-1f) (conn f0-0f f1-0f) (conn f0-1f f0-0f) (conn f0-1f f0-2f) (conn f0-1f f1-1f) (conn f0-2f f0-1f) (conn f0-2f f0-3f) (conn f0-2f f1-2f) (conn f0-3f f0-2f) (conn f0-3f f0-4f) (conn f0-3f f1-3f) (conn f0-4f f0-3f) (conn f0-4f f1-4f) (conn f1-0f f0-0f) (conn f1-0f f1-1f) (conn f1-0f f2-0f) (conn f1-1f f0-1f) (conn f1-1f f1-0f) (conn f1-1f f1-2f) (conn f1-1f f2-1f) (conn f1-2f f0-2f) (conn f1-2f f1-1f) (conn f1-2f f1-3f) (conn f1-2f f2-2f) (conn f1-3f f0-3f) (conn f1-3f f1-2f) (conn f1-3f f1-4f) (conn f1-3f f2-3f) (conn f1-4f f0-4f) (conn f1-4f f1-3f) (conn f1-4f f2-4f) (conn f2-0f f1-0f) (conn f2-0f f2-1f) (conn f2-0f f3-0f) (conn f2-1f f1-1f) (conn f2-1f f2-0f) (conn f2-1f f2-2f) (conn f2-1f f3-1f) (conn f2-2f f1-2f) (conn f2-2f f2-1f) (conn f2-2f f2-3f) (conn f2-2f f3-2f) (conn f2-3f f1-3f) (conn f2-3f f2-2f) (conn f2-3f f2-4f) (conn f2-3f f3-3f) (conn f2-4f f1-4f) (conn f2-4f f2-3f) (conn f2-4f f3-4f) (conn f3-0f f2-0f) (conn f3-0f f3-1f) (conn f3-0f f4-0f) (conn f3-1f f2-1f) (conn f3-1f f3-0f) (conn f3-1f f3-2f) (conn f3-1f f4-1f) (conn f3-2f f2-2f) (conn f3-2f f3-1f) (conn f3-2f f3-3f) (conn f3-2f f4-2f) (conn f3-3f f2-3f) (conn f3-3f f3-2f) (conn f3-3f f3-4f) (conn f3-3f f4-3f) (conn f3-4f f2-4f) (conn f3-4f f3-3f) (conn f3-4f f4-4f) (conn f4-0f f3-0f) (conn f4-0f f4-1f) (conn f4-1f f3-1f) (conn f4-1f f4-0f) (conn f4-1f f4-2f) (conn 
f4-2f f3-2f) (conn f4-2f f4-1f) (conn f4-2f f4-3f) (conn f4-3f f3-3f) (conn f4-3f f4-2f) (conn f4-3f f4-4f) (conn f4-4f f3-4f) (conn f4-4f f4-3f) (key-shape key0-0 shape0) (key-shape key0-1 shape0) (lock-shape f2-0f shape0) (lock-shape f4-2f shape0) (locked f2-0f) (locked f4-2f) (open f0-0f) (open f0-1f) (open f0-2f) (open f0-3f) (open f0-4f) (open f1-0f) (open f1-1f) (open f1-2f) (open f1-3f) (open f1-4f) (open f2-1f) (open f2-2f) (open f2-3f) (open f2-4f) (open f3-0f) (open f3-1f) (open f3-2f) (open f3-3f) (open f3-4f) (open f4-0f) (open f4-1f) (open f4-3f) (open f4-4f))\n (:goal (and (at key0-0 f2-0f) (at key0-1 f1-3f)))\n)" + - context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. \nThere are 2 trucks and 1 airplane, as well as 4 packages. There are 4 locations across 2 cities. \nThe locations are in cities as follows: l0-1 and l0-0 are in c0; l1-1 and l1-0 are in c1. \nCurrently, t1 is at l1-0, p0, a0, t0, and p3 are at l0-0, p1 and p2 are in t1. The available actions are: (load-truck ?obj ?truck ?loc) - load the object ?obj from location ?loc into the truck ?truck, (load-airplane ?obj ?airplane ?loc) - place the object ?obj onto the airplane ?airplane at location ?loc, (unload-truck ?obj ?truck ?loc) - remove the object ?obj from the truck ?truck and place it on the location ?loc, (unload-airplane ?obj ?airplane ?loc) - unload object ?obj from airplane ?airplane at location ?loc, (drive-truck ?truck ?loc-from ?loc-to ?city) - navigate the truck ?truck from location ?loc-from in city ?city to location ?loc-to in the same city, and (fly-airplane ?airplane ?loc-from ?loc-to) - fly airplane ?airplane from airport ?loc-from to airport ?loc-to." + question: "Generate the list of all ground actions that are applicable in this state." 
+ answer: "[(unload-truck p2 t1 l1-0), (drive-truck t0 l0-0 l0-0 c0), (load-airplane p0 a0 l0-0), (load-truck p0 t0 l0-0), (unload-truck p1 t1 l1-0), (drive-truck t1 l1-0 l1-0 c1), (drive-truck t0 l0-0 l0-1 c0), (drive-truck t1 l1-0 l1-1 c1), (fly-airplane a0 l0-0 l0-0), (load-truck p3 t0 l0-0), (fly-airplane a0 l0-0 l1-0), (load-airplane p3 a0 l0-0)]" + PDDL_domain: "(define (domain logistics-strips)\n (:requirements :strips :typing) \n\n (:types \n location locatable city - object \n package movable - locatable\n airport - location\n airplane truck - movable \n )\t\t\n \n (:predicates \t\n\t\t(at ?obj - locatable ?loc - location)\n\t\t(in ?obj1 - package ?obj2 - movable)\n\t\t(in-city ?obj - location ?city - city))\n\n\n(:action LOAD-TRUCK\n :parameters\n (?obj - package\n ?truck - truck\n ?loc - location)\n :precondition\n (and \n (at ?truck ?loc) (at ?obj ?loc))\n :effect\n (and (not (at ?obj ?loc)) (in ?obj ?truck)))\n\n(:action LOAD-AIRPLANE\n :parameters\n (?obj - package\n ?airplane - airplane\n ?loc - location)\n :precondition\n (and \n (at ?obj ?loc) (at ?airplane ?loc))\n :effect\n (and (not (at ?obj ?loc)) (in ?obj ?airplane)))\n\n\n\n(:action UNLOAD-TRUCK\n :parameters\n (?obj - package\n ?truck - truck\n ?loc - location)\n :precondition\n (and \n (at ?truck ?loc) (in ?obj ?truck))\n :effect\n (and (not (in ?obj ?truck)) (at ?obj ?loc)))\n\n(:action UNLOAD-AIRPLANE\n :parameters\n (?obj - package\n ?airplane - airplane\n ?loc - location)\n :precondition\n (and \n (in ?obj ?airplane) (at ?airplane ?loc))\n :effect\n (and (not (in ?obj ?airplane)) (at ?obj ?loc)))\n\n(:action DRIVE-TRUCK\n :parameters\n (?truck - truck\n ?loc-from - location\n ?loc-to - location\n ?city - city)\n :precondition\n (and \n (at ?truck ?loc-from)\n (in-city ?loc-from ?city)\n (in-city ?loc-to ?city))\n :effect\n (and (not (at ?truck ?loc-from)) (at ?truck ?loc-to)))\n\n(:action FLY-AIRPLANE\n :parameters\n (?airplane - airplane\n ?loc-from - airport\n ?loc-to - airport)\n 
:precondition\n (and \n\t(at ?airplane ?loc-from))\n :effect\n (and (not (at ?airplane ?loc-from)) (at ?airplane ?loc-to)))\n)" + PDDL_problem: "(define (problem logistics-c2-s2-p4-a1)\n (:domain logistics-strips)\n (:requirements :strips :typing)\n (:objects a0 - airplane l0-0 l1-0 - airport c0 c1 - city l0-1 l1-1 - location p0 p1 p2 p3 - package t0 t1 - truck)\n (:init (at a0 l0-0) (at p0 l0-0) (at p3 l0-0) (at t0 l0-0) (at t1 l1-0) (in p1 t1) (in p2 t1) (in-city l0-0 c0) (in-city l0-1 c0) (in-city l1-0 c1) (in-city l1-1 c1))\n (:goal (and (at p0 l0-0) (at p1 l1-0) (at p2 l1-0) (at p3 l0-1)))\n)" +doc_to_text: "# PDDL DOMAIN \n\n```\n{{PDDL_domain}}\n```\n\n# PDDL PROBLEM \n\n```\n{{PDDL_problem}}\n```\n\n**Question**: {{context}} {{question}} Each action starts with an opening parenthesis and ends with closing parenthesis. Provide only the actions. \n**Final Answer**:" +filter_list: + - name: "acp_grammar_parse" + filter: + - function: "ACP_grammar_filter" + grammar_task: "action_list" + - function: "take_first" diff --git a/lm_eval/tasks/acpbench/gen_2shot_with_pddl/just.yaml b/lm_eval/tasks/acpbench/gen_2shot_with_pddl/just.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9685b8b8f3c3c9274ca83f9daacff26de7d26200 --- /dev/null +++ b/lm_eval/tasks/acpbench/gen_2shot_with_pddl/just.yaml @@ -0,0 +1,24 @@ +task: acp_just_gen_with_pddl +dataset_name: acp_just_gen +include: _gen_yaml_2shot +fewshot_config: + sampler: first_n + samples: + - context: "A robot is in a grid and can only move to places that are connected to its current position. \nThe grid size is 5x5, and the locations are of the form fi-jf (e.g., f3-2f or f0-1f). The grid cells are connected to their neighbors (e.g., f1-2f is connected to the four neighbors f0-2f, f2-2f, f1-1f, and f1-3f). Some positions on the grid are locked and can be opened with a key of a matching shape. 
The robot has an arm that can pick up a key when the key is in same location as the robot and the arm is empty. \nThere are 2 keys in 0 different shapes: Key key0-0 is of shape shape0, Key key0-1 is of shape shape0. \nCurrently, the robot is at position f3-3f and its arm is empty. All the positions are open except the following: f4-2f has shape0 shaped lock, f2-0f has shape0 shaped lock. Key key0-1 is at position f1-3f. Key key0-0 is at position f2-2f. The available actions are: (unlock ?curpos ?lockpos ?key ?shape) - unlock place ?lockpos with key ?key of shape ?shape from current position place ?curpos, (move ?curpos ?nextpos) - transition from the current position ?curpos to the next position ?nextpos, (pickup ?curpos ?key) - acquire the key ?key from the place ?curpos, (pickup-and-loose ?curpos ?newkey ?oldkey) - pick up the key ?newkey from the current position ?curpos and loose the key ?oldkey which is being held, and (putdown ?curpos ?key) - place the key ?key at the current position place ?curpos. The goal is to reach a state where the following facts hold: Key key0-1 is at f1-3f location and Key key0-0 is at f2-0f location." + question: "Simplify the plan \"(move f3-3f f3-2f) (move f3-2f f2-2f) (pickup f2-2f key0-0) (move f2-2f f2-1f) (putdown f2-1f key0-0) (pickup f2-1f key0-0) (unlock f2-1f f2-0f key0-0 shape0) (move f2-1f f2-0f) (putdown f2-0f key0-0)\" by removing either a single action or a pair of consecutive actions, while still maintaining a valid plan. Provide the resulting simplified plan." 
+ answer: "[(move f3-3f f3-2f), (move f3-2f f2-2f), (pickup f2-2f key0-0), (move f2-2f f2-1f), (unlock f2-1f f2-0f key0-0 shape0), (move f2-1f f2-0f), (putdown f2-0f key0-0)]" + PDDL_domain: "(define (domain grid)\n (:requirements :strips :typing)\n (:types key place shape - object)\n (:predicates (arm-empty) (at ?r - key ?x - place) (at-robot ?x - place) (conn ?x - place ?y - place) (holding ?k - key) (key-shape ?k - key ?s - shape) (lock-shape ?x - place ?s - shape) (locked ?x - place) (open ?x - place))\n (:action move\n :parameters (?curpos - place ?nextpos - place)\n :precondition (and (at-robot ?curpos) (conn ?curpos ?nextpos) (open ?nextpos))\n :effect (and (at-robot ?nextpos) (not (at-robot ?curpos)))\n )\n (:action pickup\n :parameters (?curpos - place ?key - key)\n :precondition (and (at-robot ?curpos) (at ?key ?curpos) (arm-empty))\n :effect (and (holding ?key) (not (at ?key ?curpos)) (not (arm-empty)))\n )\n (:action pickup-and-loose\n :parameters (?curpos - place ?newkey - key ?oldkey - key)\n :precondition (and (at-robot ?curpos) (holding ?oldkey) (at ?newkey ?curpos))\n :effect (and (holding ?newkey) (at ?oldkey ?curpos) (not (holding ?oldkey)) (not (at ?newkey ?curpos)))\n )\n (:action putdown\n :parameters (?curpos - place ?key - key)\n :precondition (and (at-robot ?curpos) (holding ?key))\n :effect (and (arm-empty) (at ?key ?curpos) (not (holding ?key)))\n )\n (:action unlock\n :parameters (?curpos - place ?lockpos - place ?key - key ?shape - shape)\n :precondition (and (conn ?curpos ?lockpos) (key-shape ?key ?shape) (lock-shape ?lockpos ?shape) (at-robot ?curpos) (locked ?lockpos) (holding ?key))\n :effect (and (open ?lockpos) (not (locked ?lockpos)))\n )\n)" + PDDL_problem: "(define (problem grid-x5-y5-t1-k2-l2-p100)\n (:domain grid)\n (:requirements :strips :typing)\n (:objects key0-0 key0-1 - key f0-0f f0-1f f0-2f f0-3f f0-4f f1-0f f1-1f f1-2f f1-3f f1-4f f2-0f f2-1f f2-2f f2-3f f2-4f f3-0f f3-1f f3-2f f3-3f f3-4f f4-0f f4-1f f4-2f f4-3f f4-4f 
- place shape0 - shape)\n (:init (arm-empty) (at key0-0 f2-2f) (at key0-1 f1-3f) (at-robot f3-3f) (conn f0-0f f0-1f) (conn f0-0f f1-0f) (conn f0-1f f0-0f) (conn f0-1f f0-2f) (conn f0-1f f1-1f) (conn f0-2f f0-1f) (conn f0-2f f0-3f) (conn f0-2f f1-2f) (conn f0-3f f0-2f) (conn f0-3f f0-4f) (conn f0-3f f1-3f) (conn f0-4f f0-3f) (conn f0-4f f1-4f) (conn f1-0f f0-0f) (conn f1-0f f1-1f) (conn f1-0f f2-0f) (conn f1-1f f0-1f) (conn f1-1f f1-0f) (conn f1-1f f1-2f) (conn f1-1f f2-1f) (conn f1-2f f0-2f) (conn f1-2f f1-1f) (conn f1-2f f1-3f) (conn f1-2f f2-2f) (conn f1-3f f0-3f) (conn f1-3f f1-2f) (conn f1-3f f1-4f) (conn f1-3f f2-3f) (conn f1-4f f0-4f) (conn f1-4f f1-3f) (conn f1-4f f2-4f) (conn f2-0f f1-0f) (conn f2-0f f2-1f) (conn f2-0f f3-0f) (conn f2-1f f1-1f) (conn f2-1f f2-0f) (conn f2-1f f2-2f) (conn f2-1f f3-1f) (conn f2-2f f1-2f) (conn f2-2f f2-1f) (conn f2-2f f2-3f) (conn f2-2f f3-2f) (conn f2-3f f1-3f) (conn f2-3f f2-2f) (conn f2-3f f2-4f) (conn f2-3f f3-3f) (conn f2-4f f1-4f) (conn f2-4f f2-3f) (conn f2-4f f3-4f) (conn f3-0f f2-0f) (conn f3-0f f3-1f) (conn f3-0f f4-0f) (conn f3-1f f2-1f) (conn f3-1f f3-0f) (conn f3-1f f3-2f) (conn f3-1f f4-1f) (conn f3-2f f2-2f) (conn f3-2f f3-1f) (conn f3-2f f3-3f) (conn f3-2f f4-2f) (conn f3-3f f2-3f) (conn f3-3f f3-2f) (conn f3-3f f3-4f) (conn f3-3f f4-3f) (conn f3-4f f2-4f) (conn f3-4f f3-3f) (conn f3-4f f4-4f) (conn f4-0f f3-0f) (conn f4-0f f4-1f) (conn f4-1f f3-1f) (conn f4-1f f4-0f) (conn f4-1f f4-2f) (conn f4-2f f3-2f) (conn f4-2f f4-1f) (conn f4-2f f4-3f) (conn f4-3f f3-3f) (conn f4-3f f4-2f) (conn f4-3f f4-4f) (conn f4-4f f3-4f) (conn f4-4f f4-3f) (key-shape key0-0 shape0) (key-shape key0-1 shape0) (lock-shape f2-0f shape0) (lock-shape f4-2f shape0) (locked f2-0f) (locked f4-2f) (open f0-0f) (open f0-1f) (open f0-2f) (open f0-3f) (open f0-4f) (open f1-0f) (open f1-1f) (open f1-2f) (open f1-3f) (open f1-4f) (open f2-1f) (open f2-2f) (open f2-3f) (open f2-4f) (open f3-0f) (open f3-1f) (open f3-2f) (open f3-3f) (open f3-4f) 
(open f4-0f) (open f4-1f) (open f4-3f) (open f4-4f))\n (:goal (and (at key0-0 f2-0f) (at key0-1 f1-3f)))\n)" + - context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. \nThere are 2 trucks and 1 airplane, as well as 4 packages. There are 4 locations across 2 cities. \nThe locations are in cities as follows: l1-0 and l1-1 are in c1; l0-1 and l0-0 are in c0. \nCurrently, p3, p2, and p1 are at l1-0, t0 is at l0-1, p0 and t1 are at l1-1, a0 is at l0-0. The available actions are: (load-truck ?obj ?truck ?loc) - place the object ?obj into the truck ?truck at location ?loc, (load-airplane ?obj ?airplane ?loc) - load the object ?obj from location ?loc into the airplane ?airplane, (unload-truck ?obj ?truck ?loc) - unload the object ?obj from the truck ?truck at location ?loc, (unload-airplane ?obj ?airplane ?loc) - remove the object ?obj from the airplane ?airplane and place it on the location ?loc, (drive-truck ?truck ?loc-from ?loc-to ?city) - navigate the truck ?truck which is in location ?loc-from in city ?city to another location ?loc-to in the same city, and (fly-airplane ?airplane ?loc-from ?loc-to) - fly the airplane ?airplane from airport ?loc-from to airport ?loc-to. The goal is to reach a state where the following facts hold: p2 is at l1-0, p0 is at l0-0, p3 is at l0-1, and p1 is at l1-0." 
+ question: "Simplify the plan \"(load-truck p0 t1 l1-1) (unload-truck p0 t1 l1-1) (load-truck p0 t1 l1-1) (drive-truck t1 l1-1 l1-0 c1) (unload-truck p0 t1 l1-0) (fly-airplane a0 l0-0 l1-0) (load-airplane p0 a0 l1-0) (load-airplane p3 a0 l1-0) (fly-airplane a0 l1-0 l0-0) (unload-airplane p0 a0 l0-0) (unload-airplane p3 a0 l0-0) (drive-truck t0 l0-1 l0-0 c0) (load-truck p3 t0 l0-0) (drive-truck t0 l0-0 l0-1 c0) (unload-truck p3 t0 l0-1)\" by removing either a single action or a pair of consecutive actions, while still maintaining a valid plan. Provide the resulting simplified plan." + answer: "[(load-truck p0 t1 l1-1), (drive-truck t1 l1-1 l1-0 c1), (unload-truck p0 t1 l1-0), (fly-airplane a0 l0-0 l1-0), (load-airplane p0 a0 l1-0), (load-airplane p3 a0 l1-0), (fly-airplane a0 l1-0 l0-0), (unload-airplane p0 a0 l0-0), (unload-airplane p3 a0 l0-0), (drive-truck t0 l0-1 l0-0 c0), (load-truck p3 t0 l0-0), (drive-truck t0 l0-0 l0-1 c0), (unload-truck p3 t0 l0-1)]" + PDDL_domain: "(define (domain logistics-strips)\n (:requirements :strips :typing) \n\n (:types \n location locatable city - object \n package movable - locatable\n airport - location\n airplane truck - movable \n )\t\t\n \n (:predicates \t\n\t\t(at ?obj - locatable ?loc - location)\n\t\t(in ?obj1 - package ?obj2 - movable)\n\t\t(in-city ?obj - location ?city - city))\n\n\n(:action LOAD-TRUCK\n :parameters\n (?obj - package\n ?truck - truck\n ?loc - location)\n :precondition\n (and \n (at ?truck ?loc) (at ?obj ?loc))\n :effect\n (and (not (at ?obj ?loc)) (in ?obj ?truck)))\n\n(:action LOAD-AIRPLANE\n :parameters\n (?obj - package\n ?airplane - airplane\n ?loc - location)\n :precondition\n (and \n (at ?obj ?loc) (at ?airplane ?loc))\n :effect\n (and (not (at ?obj ?loc)) (in ?obj ?airplane)))\n\n\n\n(:action UNLOAD-TRUCK\n :parameters\n (?obj - package\n ?truck - truck\n ?loc - location)\n :precondition\n (and \n (at ?truck ?loc) (in ?obj ?truck))\n :effect\n (and (not (in ?obj ?truck)) (at ?obj 
?loc)))\n\n(:action UNLOAD-AIRPLANE\n :parameters\n (?obj - package\n ?airplane - airplane\n ?loc - location)\n :precondition\n (and \n (in ?obj ?airplane) (at ?airplane ?loc))\n :effect\n (and (not (in ?obj ?airplane)) (at ?obj ?loc)))\n\n(:action DRIVE-TRUCK\n :parameters\n (?truck - truck\n ?loc-from - location\n ?loc-to - location\n ?city - city)\n :precondition\n (and \n (at ?truck ?loc-from)\n (in-city ?loc-from ?city)\n (in-city ?loc-to ?city))\n :effect\n (and (not (at ?truck ?loc-from)) (at ?truck ?loc-to)))\n\n(:action FLY-AIRPLANE\n :parameters\n (?airplane - airplane\n ?loc-from - airport\n ?loc-to - airport)\n :precondition\n (and \n\t(at ?airplane ?loc-from))\n :effect\n (and (not (at ?airplane ?loc-from)) (at ?airplane ?loc-to)))\n)" + PDDL_problem: "(define (problem logistics-c2-s2-p4-a1)\n (:domain logistics-strips)\n (:requirements :strips :typing)\n (:objects a0 - airplane l0-0 l1-0 - airport c0 c1 - city l0-1 l1-1 - location p0 p1 p2 p3 - package t0 t1 - truck)\n (:init (at a0 l0-0) (at p0 l1-1) (at p1 l1-0) (at p2 l1-0) (at p3 l1-0) (at t0 l0-1) (at t1 l1-1) (in-city l0-0 c0) (in-city l0-1 c0) (in-city l1-0 c1) (in-city l1-1 c1))\n (:goal (and (at p0 l0-0) (at p1 l1-0) (at p2 l1-0) (at p3 l0-1)))\n)" +doc_to_text: "# PDDL DOMAIN \n\n```\n{{PDDL_domain}}\n```\n\n# PDDL PROBLEM \n\n```\n{{PDDL_problem}}\n```\n\n**Question**: {{context}} {{question}} **Final Answer**:" +filter_list: + - name: "acp_grammar_parse" + filter: + - function: "ACP_grammar_filter" + grammar_task: "action_list" + clean: "simplified plan" + - function: "take_first" diff --git a/lm_eval/tasks/acpbench/gen_2shot_with_pddl/land.yaml b/lm_eval/tasks/acpbench/gen_2shot_with_pddl/land.yaml new file mode 100644 index 0000000000000000000000000000000000000000..da7bf1c6987e773a7246cd30b1e2e6de691e005e --- /dev/null +++ b/lm_eval/tasks/acpbench/gen_2shot_with_pddl/land.yaml @@ -0,0 +1,23 @@ +task: acp_land_gen_with_pddl +dataset_name: acp_land_gen +include: _gen_yaml_2shot 
+fewshot_config: + sampler: first_n + samples: + - context: "A robot is in a grid and can only move to places that are connected to its current position. \nThe grid size is 5x5, and the locations are of the form fi-jf (e.g., f3-2f or f0-1f). The grid cells are connected to their neighbors (e.g., f1-2f is connected to the four neighbors f0-2f, f2-2f, f1-1f, and f1-3f). Some positions on the grid are locked and can be opened with a key of a matching shape. The robot has an arm that can pick up a key when the key is in same location as the robot and the arm is empty. \nThere are 2 keys in 0 different shapes: Key key0-0 is of shape shape0, Key key0-1 is of shape shape0. \nCurrently, the robot is at position f1-0f and its arm is empty. All the positions are open except the following: f4-2f has shape0 shaped lock. Key key0-1 is at position f1-3f. Key key0-0 is at position f1-0f. The goal is to reach a state where the following facts hold: Key key0-1 is at f1-3f location and Key key0-0 is at f2-0f location. The available propositions are: (at ?r ?x) - Key ?r is at ?x location, (at-robot ?x) - Robot is at ?x location, (locked ?x) - Location ?x is locked, (holding ?k) - Robot is holding ?k, (open ?x) - Location ?x is open, and (arm-empty) - Robot's arm is empty." + question: "Generate a non-trivial fact landmark, one that does not hold in the initial state or goal." 
+ answer: "(at-robot f2-0f)" + PDDL_domain: "(define (domain grid)\n (:requirements :strips :typing)\n (:types key place shape - object)\n (:predicates (arm-empty) (at ?r - key ?x - place) (at-robot ?x - place) (conn ?x - place ?y - place) (holding ?k - key) (key-shape ?k - key ?s - shape) (lock-shape ?x - place ?s - shape) (locked ?x - place) (open ?x - place))\n (:action move\n :parameters (?curpos - place ?nextpos - place)\n :precondition (and (at-robot ?curpos) (conn ?curpos ?nextpos) (open ?nextpos))\n :effect (and (at-robot ?nextpos) (not (at-robot ?curpos)))\n )\n (:action pickup\n :parameters (?curpos - place ?key - key)\n :precondition (and (at-robot ?curpos) (at ?key ?curpos) (arm-empty))\n :effect (and (holding ?key) (not (at ?key ?curpos)) (not (arm-empty)))\n )\n (:action pickup-and-loose\n :parameters (?curpos - place ?newkey - key ?oldkey - key)\n :precondition (and (at-robot ?curpos) (holding ?oldkey) (at ?newkey ?curpos))\n :effect (and (holding ?newkey) (at ?oldkey ?curpos) (not (holding ?oldkey)) (not (at ?newkey ?curpos)))\n )\n (:action putdown\n :parameters (?curpos - place ?key - key)\n :precondition (and (at-robot ?curpos) (holding ?key))\n :effect (and (arm-empty) (at ?key ?curpos) (not (holding ?key)))\n )\n (:action unlock\n :parameters (?curpos - place ?lockpos - place ?key - key ?shape - shape)\n :precondition (and (conn ?curpos ?lockpos) (key-shape ?key ?shape) (lock-shape ?lockpos ?shape) (at-robot ?curpos) (locked ?lockpos) (holding ?key))\n :effect (and (open ?lockpos) (not (locked ?lockpos)))\n )\n)" + PDDL_problem: "(define (problem grid-x5-y5-t1-k2-l2-p100)\n (:domain grid)\n (:requirements :strips :typing)\n (:objects key0-0 key0-1 - key f0-0f f0-1f f0-2f f0-3f f0-4f f1-0f f1-1f f1-2f f1-3f f1-4f f2-0f f2-1f f2-2f f2-3f f2-4f f3-0f f3-1f f3-2f f3-3f f3-4f f4-0f f4-1f f4-2f f4-3f f4-4f - place shape0 - shape)\n (:init (arm-empty) (at key0-0 f1-0f) (at key0-1 f1-3f) (at-robot f1-0f) (conn f0-0f f0-1f) (conn f0-0f f1-0f) (conn 
f0-1f f0-0f) (conn f0-1f f0-2f) (conn f0-1f f1-1f) (conn f0-2f f0-1f) (conn f0-2f f0-3f) (conn f0-2f f1-2f) (conn f0-3f f0-2f) (conn f0-3f f0-4f) (conn f0-3f f1-3f) (conn f0-4f f0-3f) (conn f0-4f f1-4f) (conn f1-0f f0-0f) (conn f1-0f f1-1f) (conn f1-0f f2-0f) (conn f1-1f f0-1f) (conn f1-1f f1-0f) (conn f1-1f f1-2f) (conn f1-1f f2-1f) (conn f1-2f f0-2f) (conn f1-2f f1-1f) (conn f1-2f f1-3f) (conn f1-2f f2-2f) (conn f1-3f f0-3f) (conn f1-3f f1-2f) (conn f1-3f f1-4f) (conn f1-3f f2-3f) (conn f1-4f f0-4f) (conn f1-4f f1-3f) (conn f1-4f f2-4f) (conn f2-0f f1-0f) (conn f2-0f f2-1f) (conn f2-0f f3-0f) (conn f2-1f f1-1f) (conn f2-1f f2-0f) (conn f2-1f f2-2f) (conn f2-1f f3-1f) (conn f2-2f f1-2f) (conn f2-2f f2-1f) (conn f2-2f f2-3f) (conn f2-2f f3-2f) (conn f2-3f f1-3f) (conn f2-3f f2-2f) (conn f2-3f f2-4f) (conn f2-3f f3-3f) (conn f2-4f f1-4f) (conn f2-4f f2-3f) (conn f2-4f f3-4f) (conn f3-0f f2-0f) (conn f3-0f f3-1f) (conn f3-0f f4-0f) (conn f3-1f f2-1f) (conn f3-1f f3-0f) (conn f3-1f f3-2f) (conn f3-1f f4-1f) (conn f3-2f f2-2f) (conn f3-2f f3-1f) (conn f3-2f f3-3f) (conn f3-2f f4-2f) (conn f3-3f f2-3f) (conn f3-3f f3-2f) (conn f3-3f f3-4f) (conn f3-3f f4-3f) (conn f3-4f f2-4f) (conn f3-4f f3-3f) (conn f3-4f f4-4f) (conn f4-0f f3-0f) (conn f4-0f f4-1f) (conn f4-1f f3-1f) (conn f4-1f f4-0f) (conn f4-1f f4-2f) (conn f4-2f f3-2f) (conn f4-2f f4-1f) (conn f4-2f f4-3f) (conn f4-3f f3-3f) (conn f4-3f f4-2f) (conn f4-3f f4-4f) (conn f4-4f f3-4f) (conn f4-4f f4-3f) (key-shape key0-0 shape0) (key-shape key0-1 shape0) (lock-shape f2-0f shape0) (lock-shape f4-2f shape0) (locked f4-2f) (open f0-0f) (open f0-1f) (open f0-2f) (open f0-3f) (open f0-4f) (open f1-0f) (open f1-1f) (open f1-2f) (open f1-3f) (open f1-4f) (open f2-0f) (open f2-1f) (open f2-2f) (open f2-3f) (open f2-4f) (open f3-0f) (open f3-1f) (open f3-2f) (open f3-3f) (open f3-4f) (open f4-0f) (open f4-1f) (open f4-3f) (open f4-4f))\n (:goal (and (at key0-0 f2-0f) (at key0-1 f1-3f)))\n)" + - context: "There are several 
cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. \nThere are 2 trucks and 1 airplane, as well as 4 packages. There are 4 locations across 2 cities. \nThe locations are in cities as follows: l0-1 and l0-0 are in c0; l1-0 and l1-1 are in c1. \nCurrently, a0 and t0 are at l0-0, t1 is at l1-1, p2, p1, and p3 are at l1-0, p0 is in t1. The goal is to reach a state where the following facts hold: p1 is at l1-0, p0 is at l0-0, p2 is at l1-0, and p3 is at l0-1. The available propositions are: (at ?obj ?loc) - ?obj is at ?loc and (in ?obj1 ?obj2) - ?obj1 is in ?obj2." + question: "Generate a non-trivial fact landmark, one that does not hold in the initial state or goal." + answer: "(in p3 t0)" + PDDL_domain: "(define (domain logistics-strips)\n (:requirements :strips :typing) \n\n (:types \n location locatable city - object \n package movable - locatable\n airport - location\n airplane truck - movable \n )\t\t\n \n (:predicates \t\n\t\t(at ?obj - locatable ?loc - location)\n\t\t(in ?obj1 - package ?obj2 - movable)\n\t\t(in-city ?obj - location ?city - city))\n\n\n(:action LOAD-TRUCK\n :parameters\n (?obj - package\n ?truck - truck\n ?loc - location)\n :precondition\n (and \n (at ?truck ?loc) (at ?obj ?loc))\n :effect\n (and (not (at ?obj ?loc)) (in ?obj ?truck)))\n\n(:action LOAD-AIRPLANE\n :parameters\n (?obj - package\n ?airplane - airplane\n ?loc - location)\n :precondition\n (and \n (at ?obj ?loc) (at ?airplane ?loc))\n :effect\n (and (not (at ?obj ?loc)) (in ?obj ?airplane)))\n\n\n\n(:action UNLOAD-TRUCK\n :parameters\n (?obj - package\n ?truck - truck\n ?loc - location)\n :precondition\n (and \n (at ?truck ?loc) (in ?obj ?truck))\n :effect\n (and (not (in ?obj ?truck)) (at ?obj ?loc)))\n\n(:action UNLOAD-AIRPLANE\n :parameters\n (?obj - package\n ?airplane - 
airplane\n ?loc - location)\n :precondition\n (and \n (in ?obj ?airplane) (at ?airplane ?loc))\n :effect\n (and (not (in ?obj ?airplane)) (at ?obj ?loc)))\n\n(:action DRIVE-TRUCK\n :parameters\n (?truck - truck\n ?loc-from - location\n ?loc-to - location\n ?city - city)\n :precondition\n (and \n (at ?truck ?loc-from)\n (in-city ?loc-from ?city)\n (in-city ?loc-to ?city))\n :effect\n (and (not (at ?truck ?loc-from)) (at ?truck ?loc-to)))\n\n(:action FLY-AIRPLANE\n :parameters\n (?airplane - airplane\n ?loc-from - airport\n ?loc-to - airport)\n :precondition\n (and \n\t(at ?airplane ?loc-from))\n :effect\n (and (not (at ?airplane ?loc-from)) (at ?airplane ?loc-to)))\n)" + PDDL_problem: "(define (problem logistics-c2-s2-p4-a1)\n (:domain logistics-strips)\n (:requirements :strips :typing)\n (:objects a0 - airplane l0-0 l1-0 - airport c0 c1 - city l0-1 l1-1 - location p0 p1 p2 p3 - package t0 t1 - truck)\n (:init (at a0 l0-0) (at p1 l1-0) (at p2 l1-0) (at p3 l1-0) (at t0 l0-0) (at t1 l1-1) (in p0 t1) (in-city l0-0 c0) (in-city l0-1 c0) (in-city l1-0 c1) (in-city l1-1 c1))\n (:goal (and (at p0 l0-0) (at p1 l1-0) (at p2 l1-0) (at p3 l0-1)))\n)" +doc_to_text: "# PDDL DOMAIN \n\n```\n{{PDDL_domain}}\n```\n\n# PDDL PROBLEM \n\n```\n{{PDDL_problem}}\n```\n\n**Question**: {{context}} {{question}} Provide only the ground proposition or None. 
**Final Answer**:" +filter_list: + - name: "acp_grammar_parse" + filter: + - function: "ACP_grammar_filter" + grammar_task: "act" + - function: "take_first" diff --git a/lm_eval/tasks/acpbench/gen_2shot_with_pddl/next_act.yaml b/lm_eval/tasks/acpbench/gen_2shot_with_pddl/next_act.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0f43ca61c73417762d250c83d64e1d7b1bfde37f --- /dev/null +++ b/lm_eval/tasks/acpbench/gen_2shot_with_pddl/next_act.yaml @@ -0,0 +1,23 @@ +task: acp_nexta_gen_with_pddl +dataset_name: acp_nexta_gen +include: _gen_yaml_2shot +fewshot_config: + sampler: first_n + samples: + - context: "A robot is in a grid and can only move to places that are connected to its current position. \nThe grid size is 5x5, and the locations are of the form fi-jf (e.g., f3-2f or f0-1f). The grid cells are connected to their neighbors (e.g., f1-2f is connected to the four neighbors f0-2f, f2-2f, f1-1f, and f1-3f). Some positions on the grid are locked and can be opened with a key of a matching shape. The robot has an arm that can pick up a key when the key is in same location as the robot and the arm is empty. \nThere are 2 keys in 0 different shapes: Key key0-0 is of shape shape0, Key key0-1 is of shape shape0. \nCurrently, the robot is at position f1-1f and its arm is empty. All the positions are open except the following: f4-2f has shape0 shaped lock. Key key0-0 is at position f1-0f. Key key0-1 is at position f1-3f. The goal is to reach a state where the following facts hold: Key key0-1 is at f1-3f location and Key key0-0 is at f2-0f location. 
The available actions are: (unlock ?curpos ?lockpos ?key ?shape) - use the key ?key of shape ?shape to unlock the place ?lockpos from the current position ?curpos, (move ?curpos ?nextpos) - move to place ?nextpos from place ?curpos, (pickup ?curpos ?key) - retrieve the key ?key from its current position ?curpos, (pickup-and-loose ?curpos ?newkey ?oldkey) - pick up key ?newkey at current position place ?curpos and loose key ?oldkey being held, and (putdown ?curpos ?key) - put the key ?key at the current position place ?curpos." + question: "What is the next action that takes us towards the goal?" + answer: "(move f1-1f f1-0f)" + PDDL_domain: "(define (domain grid)\n (:requirements :strips :typing)\n (:types key place shape - object)\n (:predicates (arm-empty) (at ?r - key ?x - place) (at-robot ?x - place) (conn ?x - place ?y - place) (holding ?k - key) (key-shape ?k - key ?s - shape) (lock-shape ?x - place ?s - shape) (locked ?x - place) (open ?x - place))\n (:action move\n :parameters (?curpos - place ?nextpos - place)\n :precondition (and (at-robot ?curpos) (conn ?curpos ?nextpos) (open ?nextpos))\n :effect (and (at-robot ?nextpos) (not (at-robot ?curpos)))\n )\n (:action pickup\n :parameters (?curpos - place ?key - key)\n :precondition (and (at-robot ?curpos) (at ?key ?curpos) (arm-empty))\n :effect (and (holding ?key) (not (at ?key ?curpos)) (not (arm-empty)))\n )\n (:action pickup-and-loose\n :parameters (?curpos - place ?newkey - key ?oldkey - key)\n :precondition (and (at-robot ?curpos) (holding ?oldkey) (at ?newkey ?curpos))\n :effect (and (holding ?newkey) (at ?oldkey ?curpos) (not (holding ?oldkey)) (not (at ?newkey ?curpos)))\n )\n (:action putdown\n :parameters (?curpos - place ?key - key)\n :precondition (and (at-robot ?curpos) (holding ?key))\n :effect (and (arm-empty) (at ?key ?curpos) (not (holding ?key)))\n )\n (:action unlock\n :parameters (?curpos - place ?lockpos - place ?key - key ?shape - shape)\n :precondition (and (conn ?curpos ?lockpos) 
(key-shape ?key ?shape) (lock-shape ?lockpos ?shape) (at-robot ?curpos) (locked ?lockpos) (holding ?key))\n :effect (and (open ?lockpos) (not (locked ?lockpos)))\n )\n)" + PDDL_problem: "(define (problem grid-x5-y5-t1-k2-l2-p100)\n (:domain grid)\n (:requirements :strips :typing)\n (:objects key0-0 key0-1 - key f0-0f f0-1f f0-2f f0-3f f0-4f f1-0f f1-1f f1-2f f1-3f f1-4f f2-0f f2-1f f2-2f f2-3f f2-4f f3-0f f3-1f f3-2f f3-3f f3-4f f4-0f f4-1f f4-2f f4-3f f4-4f - place shape0 - shape)\n (:init (arm-empty) (at key0-0 f1-0f) (at key0-1 f1-3f) (at-robot f1-1f) (conn f0-0f f0-1f) (conn f0-0f f1-0f) (conn f0-1f f0-0f) (conn f0-1f f0-2f) (conn f0-1f f1-1f) (conn f0-2f f0-1f) (conn f0-2f f0-3f) (conn f0-2f f1-2f) (conn f0-3f f0-2f) (conn f0-3f f0-4f) (conn f0-3f f1-3f) (conn f0-4f f0-3f) (conn f0-4f f1-4f) (conn f1-0f f0-0f) (conn f1-0f f1-1f) (conn f1-0f f2-0f) (conn f1-1f f0-1f) (conn f1-1f f1-0f) (conn f1-1f f1-2f) (conn f1-1f f2-1f) (conn f1-2f f0-2f) (conn f1-2f f1-1f) (conn f1-2f f1-3f) (conn f1-2f f2-2f) (conn f1-3f f0-3f) (conn f1-3f f1-2f) (conn f1-3f f1-4f) (conn f1-3f f2-3f) (conn f1-4f f0-4f) (conn f1-4f f1-3f) (conn f1-4f f2-4f) (conn f2-0f f1-0f) (conn f2-0f f2-1f) (conn f2-0f f3-0f) (conn f2-1f f1-1f) (conn f2-1f f2-0f) (conn f2-1f f2-2f) (conn f2-1f f3-1f) (conn f2-2f f1-2f) (conn f2-2f f2-1f) (conn f2-2f f2-3f) (conn f2-2f f3-2f) (conn f2-3f f1-3f) (conn f2-3f f2-2f) (conn f2-3f f2-4f) (conn f2-3f f3-3f) (conn f2-4f f1-4f) (conn f2-4f f2-3f) (conn f2-4f f3-4f) (conn f3-0f f2-0f) (conn f3-0f f3-1f) (conn f3-0f f4-0f) (conn f3-1f f2-1f) (conn f3-1f f3-0f) (conn f3-1f f3-2f) (conn f3-1f f4-1f) (conn f3-2f f2-2f) (conn f3-2f f3-1f) (conn f3-2f f3-3f) (conn f3-2f f4-2f) (conn f3-3f f2-3f) (conn f3-3f f3-2f) (conn f3-3f f3-4f) (conn f3-3f f4-3f) (conn f3-4f f2-4f) (conn f3-4f f3-3f) (conn f3-4f f4-4f) (conn f4-0f f3-0f) (conn f4-0f f4-1f) (conn f4-1f f3-1f) (conn f4-1f f4-0f) (conn f4-1f f4-2f) (conn f4-2f f3-2f) (conn f4-2f f4-1f) (conn f4-2f f4-3f) (conn f4-3f 
f3-3f) (conn f4-3f f4-2f) (conn f4-3f f4-4f) (conn f4-4f f3-4f) (conn f4-4f f4-3f) (key-shape key0-0 shape0) (key-shape key0-1 shape0) (lock-shape f2-0f shape0) (lock-shape f4-2f shape0) (locked f4-2f) (open f0-0f) (open f0-1f) (open f0-2f) (open f0-3f) (open f0-4f) (open f1-0f) (open f1-1f) (open f1-2f) (open f1-3f) (open f1-4f) (open f2-0f) (open f2-1f) (open f2-2f) (open f2-3f) (open f2-4f) (open f3-0f) (open f3-1f) (open f3-2f) (open f3-3f) (open f3-4f) (open f4-0f) (open f4-1f) (open f4-3f) (open f4-4f))\n (:goal (and (at key0-0 f2-0f) (at key0-1 f1-3f)))\n)" + - context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. \nThere are 2 trucks and 1 airplane, as well as 4 packages. There are 4 locations across 2 cities. \nThe locations are in cities as follows: l1-1 and l1-0 are in c1; l0-0 and l0-1 are in c0. \nCurrently, p1, p3, t1, p2, and a0 are at l1-0, t0 is at l0-0, p0 is in a0. The goal is to reach a state where the following facts hold: p1 is at l1-0, p3 is at l0-1, p0 is at l0-0, and p2 is at l1-0. The available actions are: (load-truck ?obj ?truck ?loc) - load the object ?obj from location ?loc into the truck ?truck, (load-airplane ?obj ?airplane ?loc) - load the object ?obj from location ?loc onto the airplane ?airplane, (unload-truck ?obj ?truck ?loc) - unload the object ?obj from the truck ?truck at location ?loc, (unload-airplane ?obj ?airplane ?loc) - offload the object ?obj from the airplane ?airplane at location ?loc, (drive-truck ?truck ?loc-from ?loc-to ?city) - navigate the truck ?truck which is in location ?loc-from in city ?city to another location ?loc-to in the same city, and (fly-airplane ?airplane ?loc-from ?loc-to) - fly the airplane ?airplane from airport ?loc-from to airport ?loc-to." 
+ question: "What is the next action that takes us towards the goal?" + answer: "(load-airplane p3 a0 l1-0)" + PDDL_domain: "(define (domain logistics-strips)\n (:requirements :strips :typing) \n\n (:types \n location locatable city - object \n package movable - locatable\n airport - location\n airplane truck - movable \n )\t\t\n \n (:predicates \t\n\t\t(at ?obj - locatable ?loc - location)\n\t\t(in ?obj1 - package ?obj2 - movable)\n\t\t(in-city ?obj - location ?city - city))\n\n\n(:action LOAD-TRUCK\n :parameters\n (?obj - package\n ?truck - truck\n ?loc - location)\n :precondition\n (and \n (at ?truck ?loc) (at ?obj ?loc))\n :effect\n (and (not (at ?obj ?loc)) (in ?obj ?truck)))\n\n(:action LOAD-AIRPLANE\n :parameters\n (?obj - package\n ?airplane - airplane\n ?loc - location)\n :precondition\n (and \n (at ?obj ?loc) (at ?airplane ?loc))\n :effect\n (and (not (at ?obj ?loc)) (in ?obj ?airplane)))\n\n\n\n(:action UNLOAD-TRUCK\n :parameters\n (?obj - package\n ?truck - truck\n ?loc - location)\n :precondition\n (and \n (at ?truck ?loc) (in ?obj ?truck))\n :effect\n (and (not (in ?obj ?truck)) (at ?obj ?loc)))\n\n(:action UNLOAD-AIRPLANE\n :parameters\n (?obj - package\n ?airplane - airplane\n ?loc - location)\n :precondition\n (and \n (in ?obj ?airplane) (at ?airplane ?loc))\n :effect\n (and (not (in ?obj ?airplane)) (at ?obj ?loc)))\n\n(:action DRIVE-TRUCK\n :parameters\n (?truck - truck\n ?loc-from - location\n ?loc-to - location\n ?city - city)\n :precondition\n (and \n (at ?truck ?loc-from)\n (in-city ?loc-from ?city)\n (in-city ?loc-to ?city))\n :effect\n (and (not (at ?truck ?loc-from)) (at ?truck ?loc-to)))\n\n(:action FLY-AIRPLANE\n :parameters\n (?airplane - airplane\n ?loc-from - airport\n ?loc-to - airport)\n :precondition\n (and \n\t(at ?airplane ?loc-from))\n :effect\n (and (not (at ?airplane ?loc-from)) (at ?airplane ?loc-to)))\n)" + PDDL_problem: "(define (problem logistics-c2-s2-p4-a1)\n (:domain logistics-strips)\n (:requirements :strips :typing)\n 
(:objects a0 - airplane l0-0 l1-0 - airport c0 c1 - city l0-1 l1-1 - location p0 p1 p2 p3 - package t0 t1 - truck)\n (:init (at a0 l1-0) (at p1 l1-0) (at p2 l1-0) (at p3 l1-0) (at t0 l0-0) (at t1 l1-0) (in p0 a0) (in-city l0-0 c0) (in-city l0-1 c0) (in-city l1-0 c1) (in-city l1-1 c1))\n (:goal (and (at p0 l0-0) (at p1 l1-0) (at p2 l1-0) (at p3 l0-1)))\n)" +doc_to_text: "# PDDL DOMAIN \n\n```\n{{PDDL_domain}}\n```\n\n# PDDL PROBLEM \n\n```\n{{PDDL_problem}}\n```\n\n**Question**: {{context}} {{question}} Each action starts with an opening parenthesis and ends with closing parenthesis. Provide only the action. **Final Answer**:" +filter_list: + - name: "acp_grammar_parse" + filter: + - function: "ACP_grammar_filter" + grammar_task: "action_name" + - function: "take_first" diff --git a/lm_eval/tasks/acpbench/gen_2shot_with_pddl/prog.yaml b/lm_eval/tasks/acpbench/gen_2shot_with_pddl/prog.yaml new file mode 100644 index 0000000000000000000000000000000000000000..545c56ee2bef5fec008217e434385f2b0c8f1f8e --- /dev/null +++ b/lm_eval/tasks/acpbench/gen_2shot_with_pddl/prog.yaml @@ -0,0 +1,24 @@ +task: acp_prog_gen_with_pddl +dataset_name: acp_prog_gen +include: _gen_yaml_2shot +fewshot_config: + sampler: first_n + samples: + - context: "A robot is in a grid and can only move to places that are connected to its current position. \nThe grid size is 5x5, and the locations are of the form fi-jf (e.g., f3-2f or f0-1f). The grid cells are connected to their neighbors (e.g., f1-2f is connected to the four neighbors f0-2f, f2-2f, f1-1f, and f1-3f). Some positions on the grid are locked and can be opened with a key of a matching shape. The robot has an arm that can pick up a key when the key is in same location as the robot and the arm is empty. \nThere are 2 keys in 0 different shapes: Key key0-0 is of shape shape0, Key key0-1 is of shape shape0. \nCurrently, the robot is at position f2-2f and its arm is empty. 
All the positions are open except the following: f4-2f has shape0 shaped lock, f2-0f has shape0 shaped lock. Key key0-1 is at position f1-3f. Key key0-0 is at position f2-2f. The available propositions are: (at ?r ?x) - Key ?r is at ?x location, (at-robot ?x) - Robot is at ?x location, (locked ?x) - Location ?x is locked, (holding ?k) - Robot is holding ?k, (open ?x) - Location ?x is open, and (arm-empty) - Robot is not holding anything." + question: "Break down the outcomes of performing the action \"retrieve the key key0-0 from its current position f0-1f\" into two lists, positive effects and negative effects. Positive effects are the propositions that are false in the current state but will become true after performing the action. Negative effects are the propositions that are true in the current state and will become false after performing the action." + answer: "[(at-robot f1-2f)] [(at-robot f2-2f)]" + PDDL_domain: "(define (domain grid)\n (:requirements :strips :typing)\n (:types key place shape - object)\n (:predicates (arm-empty) (at ?r - key ?x - place) (at-robot ?x - place) (conn ?x - place ?y - place) (holding ?k - key) (key-shape ?k - key ?s - shape) (lock-shape ?x - place ?s - shape) (locked ?x - place) (open ?x - place))\n (:action move\n :parameters (?curpos - place ?nextpos - place)\n :precondition (and (at-robot ?curpos) (conn ?curpos ?nextpos) (open ?nextpos))\n :effect (and (at-robot ?nextpos) (not (at-robot ?curpos)))\n )\n (:action pickup\n :parameters (?curpos - place ?key - key)\n :precondition (and (at-robot ?curpos) (at ?key ?curpos) (arm-empty))\n :effect (and (holding ?key) (not (at ?key ?curpos)) (not (arm-empty)))\n )\n (:action pickup-and-loose\n :parameters (?curpos - place ?newkey - key ?oldkey - key)\n :precondition (and (at-robot ?curpos) (holding ?oldkey) (at ?newkey ?curpos))\n :effect (and (holding ?newkey) (at ?oldkey ?curpos) (not (holding ?oldkey)) (not (at ?newkey ?curpos)))\n )\n (:action putdown\n :parameters (?curpos - 
place ?key - key)\n :precondition (and (at-robot ?curpos) (holding ?key))\n :effect (and (arm-empty) (at ?key ?curpos) (not (holding ?key)))\n )\n (:action unlock\n :parameters (?curpos - place ?lockpos - place ?key - key ?shape - shape)\n :precondition (and (conn ?curpos ?lockpos) (key-shape ?key ?shape) (lock-shape ?lockpos ?shape) (at-robot ?curpos) (locked ?lockpos) (holding ?key))\n :effect (and (open ?lockpos) (not (locked ?lockpos)))\n )\n)" + PDDL_problem: "(define (problem grid-x5-y5-t1-k2-l2-p100)\n (:domain grid)\n (:requirements :strips :typing)\n (:objects key0-0 key0-1 - key f0-0f f0-1f f0-2f f0-3f f0-4f f1-0f f1-1f f1-2f f1-3f f1-4f f2-0f f2-1f f2-2f f2-3f f2-4f f3-0f f3-1f f3-2f f3-3f f3-4f f4-0f f4-1f f4-2f f4-3f f4-4f - place shape0 - shape)\n (:init (arm-empty) (at key0-0 f2-2f) (at key0-1 f1-3f) (at-robot f2-2f) (conn f0-0f f0-1f) (conn f0-0f f1-0f) (conn f0-1f f0-0f) (conn f0-1f f0-2f) (conn f0-1f f1-1f) (conn f0-2f f0-1f) (conn f0-2f f0-3f) (conn f0-2f f1-2f) (conn f0-3f f0-2f) (conn f0-3f f0-4f) (conn f0-3f f1-3f) (conn f0-4f f0-3f) (conn f0-4f f1-4f) (conn f1-0f f0-0f) (conn f1-0f f1-1f) (conn f1-0f f2-0f) (conn f1-1f f0-1f) (conn f1-1f f1-0f) (conn f1-1f f1-2f) (conn f1-1f f2-1f) (conn f1-2f f0-2f) (conn f1-2f f1-1f) (conn f1-2f f1-3f) (conn f1-2f f2-2f) (conn f1-3f f0-3f) (conn f1-3f f1-2f) (conn f1-3f f1-4f) (conn f1-3f f2-3f) (conn f1-4f f0-4f) (conn f1-4f f1-3f) (conn f1-4f f2-4f) (conn f2-0f f1-0f) (conn f2-0f f2-1f) (conn f2-0f f3-0f) (conn f2-1f f1-1f) (conn f2-1f f2-0f) (conn f2-1f f2-2f) (conn f2-1f f3-1f) (conn f2-2f f1-2f) (conn f2-2f f2-1f) (conn f2-2f f2-3f) (conn f2-2f f3-2f) (conn f2-3f f1-3f) (conn f2-3f f2-2f) (conn f2-3f f2-4f) (conn f2-3f f3-3f) (conn f2-4f f1-4f) (conn f2-4f f2-3f) (conn f2-4f f3-4f) (conn f3-0f f2-0f) (conn f3-0f f3-1f) (conn f3-0f f4-0f) (conn f3-1f f2-1f) (conn f3-1f f3-0f) (conn f3-1f f3-2f) (conn f3-1f f4-1f) (conn f3-2f f2-2f) (conn f3-2f f3-1f) (conn f3-2f f3-3f) (conn f3-2f f4-2f) (conn f3-3f 
f2-3f) (conn f3-3f f3-2f) (conn f3-3f f3-4f) (conn f3-3f f4-3f) (conn f3-4f f2-4f) (conn f3-4f f3-3f) (conn f3-4f f4-4f) (conn f4-0f f3-0f) (conn f4-0f f4-1f) (conn f4-1f f3-1f) (conn f4-1f f4-0f) (conn f4-1f f4-2f) (conn f4-2f f3-2f) (conn f4-2f f4-1f) (conn f4-2f f4-3f) (conn f4-3f f3-3f) (conn f4-3f f4-2f) (conn f4-3f f4-4f) (conn f4-4f f3-4f) (conn f4-4f f4-3f) (key-shape key0-0 shape0) (key-shape key0-1 shape0) (lock-shape f2-0f shape0) (lock-shape f4-2f shape0) (locked f2-0f) (locked f4-2f) (open f0-0f) (open f0-1f) (open f0-2f) (open f0-3f) (open f0-4f) (open f1-0f) (open f1-1f) (open f1-2f) (open f1-3f) (open f1-4f) (open f2-1f) (open f2-2f) (open f2-3f) (open f2-4f) (open f3-0f) (open f3-1f) (open f3-2f) (open f3-3f) (open f3-4f) (open f4-0f) (open f4-1f) (open f4-3f) (open f4-4f))\n (:goal (and (at key0-0 f2-0f) (at key0-1 f1-3f)))\n)" + - context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. \nThere are 2 trucks and 1 airplane, as well as 4 packages. There are 4 locations across 2 cities. \nThe locations are in cities as follows: l0-0 and l0-1 are in c0; l1-0 and l1-1 are in c1. \nCurrently, p2 and t1 are at l1-0, a0 and t0 are at l0-0, p0 and p3 are in a0, p1 is in t1. The available propositions are: (at ?obj ?loc) - ?obj is at ?loc and (in ?obj1 ?obj2) - ?obj1 is in ?obj2." + question: "Break down the outcomes of performing the action \"navigate the truck t1 which is in location l1-0 in city c1 to another location l1-1 in the same city\" into two lists, positive effects and negative effects. Positive effects are the propositions that are false in the current state but will become true after performing the action. 
Negative effects are the propositions that are true in the current state and will become false after performing the action." + answer: "[(at t1 l1-1)] [(at t1 l1-0)]" + PDDL_domain: "(define (domain logistics-strips)\n (:requirements :strips :typing) \n\n (:types \n location locatable city - object \n package movable - locatable\n airport - location\n airplane truck - movable \n )\t\t\n \n (:predicates \t\n\t\t(at ?obj - locatable ?loc - location)\n\t\t(in ?obj1 - package ?obj2 - movable)\n\t\t(in-city ?obj - location ?city - city))\n\n\n(:action LOAD-TRUCK\n :parameters\n (?obj - package\n ?truck - truck\n ?loc - location)\n :precondition\n (and \n (at ?truck ?loc) (at ?obj ?loc))\n :effect\n (and (not (at ?obj ?loc)) (in ?obj ?truck)))\n\n(:action LOAD-AIRPLANE\n :parameters\n (?obj - package\n ?airplane - airplane\n ?loc - location)\n :precondition\n (and \n (at ?obj ?loc) (at ?airplane ?loc))\n :effect\n (and (not (at ?obj ?loc)) (in ?obj ?airplane)))\n\n\n\n(:action UNLOAD-TRUCK\n :parameters\n (?obj - package\n ?truck - truck\n ?loc - location)\n :precondition\n (and \n (at ?truck ?loc) (in ?obj ?truck))\n :effect\n (and (not (in ?obj ?truck)) (at ?obj ?loc)))\n\n(:action UNLOAD-AIRPLANE\n :parameters\n (?obj - package\n ?airplane - airplane\n ?loc - location)\n :precondition\n (and \n (in ?obj ?airplane) (at ?airplane ?loc))\n :effect\n (and (not (in ?obj ?airplane)) (at ?obj ?loc)))\n\n(:action DRIVE-TRUCK\n :parameters\n (?truck - truck\n ?loc-from - location\n ?loc-to - location\n ?city - city)\n :precondition\n (and \n (at ?truck ?loc-from)\n (in-city ?loc-from ?city)\n (in-city ?loc-to ?city))\n :effect\n (and (not (at ?truck ?loc-from)) (at ?truck ?loc-to)))\n\n(:action FLY-AIRPLANE\n :parameters\n (?airplane - airplane\n ?loc-from - airport\n ?loc-to - airport)\n :precondition\n (and \n\t(at ?airplane ?loc-from))\n :effect\n (and (not (at ?airplane ?loc-from)) (at ?airplane ?loc-to)))\n)" + PDDL_problem: "(define (problem logistics-c2-s2-p4-a1)\n 
(:domain logistics-strips)\n (:requirements :strips :typing)\n (:objects a0 - airplane l0-0 l1-0 - airport c0 c1 - city l0-1 l1-1 - location p0 p1 p2 p3 - package t0 t1 - truck)\n (:init (at a0 l0-0) (at p2 l1-0) (at t0 l0-0) (at t1 l1-0) (in p0 a0) (in p1 t1) (in p3 a0) (in-city l0-0 c0) (in-city l0-1 c0) (in-city l1-0 c1) (in-city l1-1 c1))\n (:goal (and (at p0 l0-0) (at p1 l1-0) (at p2 l1-0) (at p3 l0-1)))\n)" +doc_to_text: "# PDDL DOMAIN \n\n```\n{{PDDL_domain}}\n```\n\n# PDDL PROBLEM \n\n```\n{{PDDL_problem}}\n```\n\n**Question**: {{context}} {{question}} Provide only the two lists with the ground propositions. **Final Answer**:" +filter_list: + - name: "acp_grammar_parse" + filter: + - function: "ACP_grammar_filter" + grammar_task: "progression_list" + clean: "pos_neg" + - function: "take_first" diff --git a/lm_eval/tasks/acpbench/gen_2shot_with_pddl/reach.yaml b/lm_eval/tasks/acpbench/gen_2shot_with_pddl/reach.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6cb78bbd836d6448911bce0d4b6357a9f43139a0 --- /dev/null +++ b/lm_eval/tasks/acpbench/gen_2shot_with_pddl/reach.yaml @@ -0,0 +1,23 @@ +task: acp_reach_gen_with_pddl +dataset_name: acp_reach_gen +include: _gen_yaml_2shot +fewshot_config: + sampler: first_n + samples: + - context: "A robot is in a grid and can only move to places that are connected to its current position. \nThe grid size is 5x5, and the locations are of the form fi-jf (e.g., f3-2f or f0-1f). The grid cells are connected to their neighbors (e.g., f1-2f is connected to the four neighbors f0-2f, f2-2f, f1-1f, and f1-3f). Some positions on the grid are locked and can be opened with a key of a matching shape. The robot has an arm that can pick up a key when the key is in same location as the robot and the arm is empty. \nThere are 2 keys in 0 different shapes: Key key0-1 is of shape shape0, Key key0-0 is of shape shape0. \nCurrently, the robot is at position f3-1f and its arm is empty. 
All the positions are open except the following: f2-0f has shape0 shaped lock. Key key0-1 is at position f1-3f. Key key0-0 is at position f3-1f. The available propositions are: (at ?r ?x) - Key ?r is at ?x location, (at-robot ?x) - Robot is at ?x location, (locked ?x) - Location ?x is locked, (holding ?k) - Robot is holding ?k, (open ?x) - Location ?x is open, and (arm-empty) - Robot's arm is empty." + question: "What proposition can never hold in any potentially reachable state?" + answer: "(locked f2-2f)" + PDDL_domain: "(define (domain grid)\n (:requirements :strips :typing)\n (:types key place shape - object)\n (:predicates (arm-empty) (at ?r - key ?x - place) (at-robot ?x - place) (conn ?x - place ?y - place) (holding ?k - key) (key-shape ?k - key ?s - shape) (lock-shape ?x - place ?s - shape) (locked ?x - place) (open ?x - place))\n (:action move\n :parameters (?curpos - place ?nextpos - place)\n :precondition (and (at-robot ?curpos) (conn ?curpos ?nextpos) (open ?nextpos))\n :effect (and (at-robot ?nextpos) (not (at-robot ?curpos)))\n )\n (:action pickup\n :parameters (?curpos - place ?key - key)\n :precondition (and (at-robot ?curpos) (at ?key ?curpos) (arm-empty))\n :effect (and (holding ?key) (not (at ?key ?curpos)) (not (arm-empty)))\n )\n (:action pickup-and-loose\n :parameters (?curpos - place ?newkey - key ?oldkey - key)\n :precondition (and (at-robot ?curpos) (holding ?oldkey) (at ?newkey ?curpos))\n :effect (and (holding ?newkey) (at ?oldkey ?curpos) (not (holding ?oldkey)) (not (at ?newkey ?curpos)))\n )\n (:action putdown\n :parameters (?curpos - place ?key - key)\n :precondition (and (at-robot ?curpos) (holding ?key))\n :effect (and (arm-empty) (at ?key ?curpos) (not (holding ?key)))\n )\n (:action unlock\n :parameters (?curpos - place ?lockpos - place ?key - key ?shape - shape)\n :precondition (and (conn ?curpos ?lockpos) (key-shape ?key ?shape) (lock-shape ?lockpos ?shape) (at-robot ?curpos) (locked ?lockpos) (holding ?key))\n :effect (and 
(open ?lockpos) (not (locked ?lockpos)))\n )\n)" + PDDL_problem: "(define (problem grid-x5-y5-t1-k2-l2-p100)\n (:domain grid)\n (:requirements :strips :typing)\n (:objects key0-0 key0-1 - key f0-0f f0-1f f0-2f f0-3f f0-4f f1-0f f1-1f f1-2f f1-3f f1-4f f2-0f f2-1f f2-2f f2-3f f2-4f f3-0f f3-1f f3-2f f3-3f f3-4f f4-0f f4-1f f4-2f f4-3f f4-4f - place shape0 - shape)\n (:init (arm-empty) (at key0-0 f3-1f) (at key0-1 f1-3f) (at-robot f3-1f) (conn f0-0f f0-1f) (conn f0-0f f1-0f) (conn f0-1f f0-0f) (conn f0-1f f0-2f) (conn f0-1f f1-1f) (conn f0-2f f0-1f) (conn f0-2f f0-3f) (conn f0-2f f1-2f) (conn f0-3f f0-2f) (conn f0-3f f0-4f) (conn f0-3f f1-3f) (conn f0-4f f0-3f) (conn f0-4f f1-4f) (conn f1-0f f0-0f) (conn f1-0f f1-1f) (conn f1-0f f2-0f) (conn f1-1f f0-1f) (conn f1-1f f1-0f) (conn f1-1f f1-2f) (conn f1-1f f2-1f) (conn f1-2f f0-2f) (conn f1-2f f1-1f) (conn f1-2f f1-3f) (conn f1-2f f2-2f) (conn f1-3f f0-3f) (conn f1-3f f1-2f) (conn f1-3f f1-4f) (conn f1-3f f2-3f) (conn f1-4f f0-4f) (conn f1-4f f1-3f) (conn f1-4f f2-4f) (conn f2-0f f1-0f) (conn f2-0f f2-1f) (conn f2-0f f3-0f) (conn f2-1f f1-1f) (conn f2-1f f2-0f) (conn f2-1f f2-2f) (conn f2-1f f3-1f) (conn f2-2f f1-2f) (conn f2-2f f2-1f) (conn f2-2f f2-3f) (conn f2-2f f3-2f) (conn f2-3f f1-3f) (conn f2-3f f2-2f) (conn f2-3f f2-4f) (conn f2-3f f3-3f) (conn f2-4f f1-4f) (conn f2-4f f2-3f) (conn f2-4f f3-4f) (conn f3-0f f2-0f) (conn f3-0f f3-1f) (conn f3-0f f4-0f) (conn f3-1f f2-1f) (conn f3-1f f3-0f) (conn f3-1f f3-2f) (conn f3-1f f4-1f) (conn f3-2f f2-2f) (conn f3-2f f3-1f) (conn f3-2f f3-3f) (conn f3-2f f4-2f) (conn f3-3f f2-3f) (conn f3-3f f3-2f) (conn f3-3f f3-4f) (conn f3-3f f4-3f) (conn f3-4f f2-4f) (conn f3-4f f3-3f) (conn f3-4f f4-4f) (conn f4-0f f3-0f) (conn f4-0f f4-1f) (conn f4-1f f3-1f) (conn f4-1f f4-0f) (conn f4-1f f4-2f) (conn f4-2f f3-2f) (conn f4-2f f4-1f) (conn f4-2f f4-3f) (conn f4-3f f3-3f) (conn f4-3f f4-2f) (conn f4-3f f4-4f) (conn f4-4f f3-4f) (conn f4-4f f4-3f) (key-shape key0-0 shape0) (key-shape 
key0-1 shape0) (lock-shape f2-0f shape0) (lock-shape f4-2f shape0) (locked f2-0f) (open f0-0f) (open f0-1f) (open f0-2f) (open f0-3f) (open f0-4f) (open f1-0f) (open f1-1f) (open f1-2f) (open f1-3f) (open f1-4f) (open f2-1f) (open f2-2f) (open f2-3f) (open f2-4f) (open f3-0f) (open f3-1f) (open f3-2f) (open f3-3f) (open f3-4f) (open f4-0f) (open f4-1f) (open f4-2f) (open f4-3f) (open f4-4f))\n (:goal (and (at key0-0 f2-0f) (at key0-1 f1-3f)))\n)" + - context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. \nThere are 2 trucks and 1 airplane, as well as 4 packages. There are 4 locations across 2 cities. \nThe locations are in cities as follows: l1-0 and l1-1 are in c1; l0-1 and l0-0 are in c0. \nCurrently, t1, a0, and p2 are at l1-0, t0 and p0 are at l0-0, p1 is in t1, p3 is in t0. The available propositions are: (at ?obj ?loc) - ?obj is at ?loc and (in ?obj1 ?obj2) - ?obj1 is in ?obj2." + question: "What proposition can never hold in any potentially reachable state?" 
+ answer: "(at t0 l1-0)" + PDDL_domain: "(define (domain logistics-strips)\n (:requirements :strips :typing) \n\n (:types \n location locatable city - object \n package movable - locatable\n airport - location\n airplane truck - movable \n )\t\t\n \n (:predicates \t\n\t\t(at ?obj - locatable ?loc - location)\n\t\t(in ?obj1 - package ?obj2 - movable)\n\t\t(in-city ?obj - location ?city - city))\n\n\n(:action LOAD-TRUCK\n :parameters\n (?obj - package\n ?truck - truck\n ?loc - location)\n :precondition\n (and \n (at ?truck ?loc) (at ?obj ?loc))\n :effect\n (and (not (at ?obj ?loc)) (in ?obj ?truck)))\n\n(:action LOAD-AIRPLANE\n :parameters\n (?obj - package\n ?airplane - airplane\n ?loc - location)\n :precondition\n (and \n (at ?obj ?loc) (at ?airplane ?loc))\n :effect\n (and (not (at ?obj ?loc)) (in ?obj ?airplane)))\n\n\n\n(:action UNLOAD-TRUCK\n :parameters\n (?obj - package\n ?truck - truck\n ?loc - location)\n :precondition\n (and \n (at ?truck ?loc) (in ?obj ?truck))\n :effect\n (and (not (in ?obj ?truck)) (at ?obj ?loc)))\n\n(:action UNLOAD-AIRPLANE\n :parameters\n (?obj - package\n ?airplane - airplane\n ?loc - location)\n :precondition\n (and \n (in ?obj ?airplane) (at ?airplane ?loc))\n :effect\n (and (not (in ?obj ?airplane)) (at ?obj ?loc)))\n\n(:action DRIVE-TRUCK\n :parameters\n (?truck - truck\n ?loc-from - location\n ?loc-to - location\n ?city - city)\n :precondition\n (and \n (at ?truck ?loc-from)\n (in-city ?loc-from ?city)\n (in-city ?loc-to ?city))\n :effect\n (and (not (at ?truck ?loc-from)) (at ?truck ?loc-to)))\n\n(:action FLY-AIRPLANE\n :parameters\n (?airplane - airplane\n ?loc-from - airport\n ?loc-to - airport)\n :precondition\n (and \n\t(at ?airplane ?loc-from))\n :effect\n (and (not (at ?airplane ?loc-from)) (at ?airplane ?loc-to)))\n)" + PDDL_problem: "(define (problem logistics-c2-s2-p4-a1)\n (:domain logistics-strips)\n (:requirements :strips :typing)\n (:objects a0 - airplane l0-0 l1-0 - airport c0 c1 - city l0-1 l1-1 - location p0 p1 
p2 p3 - package t0 t1 - truck)\n (:init (at a0 l1-0) (at p0 l0-0) (at p2 l1-0) (at t0 l0-0) (at t1 l1-0) (in p1 t1) (in p3 t0) (in-city l0-0 c0) (in-city l0-1 c0) (in-city l1-0 c1) (in-city l1-1 c1))\n (:goal (and (at p0 l0-0) (at p1 l1-0) (at p2 l1-0) (at p3 l0-1)))\n)" +doc_to_text: "# PDDL DOMAIN \n\n```\n{{PDDL_domain}}\n```\n\n# PDDL PROBLEM \n\n```\n{{PDDL_problem}}\n```\n\n**Question**: {{context}} {{question}} Provide one proposition or None. **Final Answer**:" +filter_list: + - name: "acp_grammar_parse" + filter: + - function: "ACP_grammar_filter" + grammar_task: "act" + - function: "take_first" diff --git a/lm_eval/tasks/acpbench/gen_2shot_with_pddl/val.yaml b/lm_eval/tasks/acpbench/gen_2shot_with_pddl/val.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6012ebf92a546a200ab42987e8406dacd11c75e4 --- /dev/null +++ b/lm_eval/tasks/acpbench/gen_2shot_with_pddl/val.yaml @@ -0,0 +1,23 @@ +task: acp_val_gen_with_pddl +dataset_name: acp_val_gen +include: _gen_yaml_2shot +fewshot_config: + sampler: first_n + samples: + - context: "A robot is in a grid and can only move to places that are connected to its current position. \nThe grid size is 5x5, and the locations are of the form fi-jf (e.g., f3-2f or f0-1f). The grid cells are connected to their neighbors (e.g., f1-2f is connected to the four neighbors f0-2f, f2-2f, f1-1f, and f1-3f). Some positions on the grid are locked and can be opened with a key of a matching shape. The robot has an arm that can pick up a key when the key is in same location as the robot and the arm is empty. \nThere are 2 keys in 0 different shapes: Key key0-1 is of shape shape0, Key key0-0 is of shape shape0. \nCurrently, the robot is at position f3-3f and its arm is empty. All the positions are open except the following: f2-0f has shape0 shaped lock, f4-2f has shape0 shaped lock. Key key0-1 is at position f1-3f. Key key0-0 is at position f2-2f. 
The goal is to reach a state where the following facts hold: Key key0-1 is at f1-3f location and Key key0-0 is at f2-0f location. The available actions are: (unlock ?curpos ?lockpos ?key ?shape) - unlock the place ?lockpos with the key ?key of the shape ?shape from the current position place ?curpos, (move ?curpos ?nextpos) - move from place ?curpos to place ?nextpos, (pickup ?curpos ?key) - retrieve the key ?key from its current position ?curpos, (pickup-and-loose ?curpos ?newkey ?oldkey) - pick up the key ?newkey at the current position place ?curpos and loose the key ?oldkey being held, and (putdown ?curpos ?key) - put the key ?key at the current position place ?curpos." + question: "What is the first inapplicable action in the next sequence of actions: \"(unlock f1-0f f2-0f key0-1 shape0) (move f2-3f f2-2f) (pickup f2-2f key0-0) (move f2-2f f2-1f) (unlock f2-1f f2-0f key0-0 shape0) (move f2-1f f2-0f) (putdown f2-0f key0-0)\"?" + answer: "0" + PDDL_domain: "(define (domain grid)\n (:requirements :strips :typing)\n (:types key place shape - object)\n (:predicates (arm-empty) (at ?r - key ?x - place) (at-robot ?x - place) (conn ?x - place ?y - place) (holding ?k - key) (key-shape ?k - key ?s - shape) (lock-shape ?x - place ?s - shape) (locked ?x - place) (open ?x - place))\n (:action move\n :parameters (?curpos - place ?nextpos - place)\n :precondition (and (at-robot ?curpos) (conn ?curpos ?nextpos) (open ?nextpos))\n :effect (and (at-robot ?nextpos) (not (at-robot ?curpos)))\n )\n (:action pickup\n :parameters (?curpos - place ?key - key)\n :precondition (and (at-robot ?curpos) (at ?key ?curpos) (arm-empty))\n :effect (and (holding ?key) (not (at ?key ?curpos)) (not (arm-empty)))\n )\n (:action pickup-and-loose\n :parameters (?curpos - place ?newkey - key ?oldkey - key)\n :precondition (and (at-robot ?curpos) (holding ?oldkey) (at ?newkey ?curpos))\n :effect (and (holding ?newkey) (at ?oldkey ?curpos) (not (holding ?oldkey)) (not (at ?newkey ?curpos)))\n )\n 
(:action putdown\n :parameters (?curpos - place ?key - key)\n :precondition (and (at-robot ?curpos) (holding ?key))\n :effect (and (arm-empty) (at ?key ?curpos) (not (holding ?key)))\n )\n (:action unlock\n :parameters (?curpos - place ?lockpos - place ?key - key ?shape - shape)\n :precondition (and (conn ?curpos ?lockpos) (key-shape ?key ?shape) (lock-shape ?lockpos ?shape) (at-robot ?curpos) (locked ?lockpos) (holding ?key))\n :effect (and (open ?lockpos) (not (locked ?lockpos)))\n )\n)" + PDDL_problem: "(define (problem grid-x5-y5-t1-k2-l2-p100)\n (:domain grid)\n (:requirements :strips :typing)\n (:objects key0-0 key0-1 - key f0-0f f0-1f f0-2f f0-3f f0-4f f1-0f f1-1f f1-2f f1-3f f1-4f f2-0f f2-1f f2-2f f2-3f f2-4f f3-0f f3-1f f3-2f f3-3f f3-4f f4-0f f4-1f f4-2f f4-3f f4-4f - place shape0 - shape)\n (:init (arm-empty) (at key0-0 f2-2f) (at key0-1 f1-3f) (at-robot f3-3f) (conn f0-0f f0-1f) (conn f0-0f f1-0f) (conn f0-1f f0-0f) (conn f0-1f f0-2f) (conn f0-1f f1-1f) (conn f0-2f f0-1f) (conn f0-2f f0-3f) (conn f0-2f f1-2f) (conn f0-3f f0-2f) (conn f0-3f f0-4f) (conn f0-3f f1-3f) (conn f0-4f f0-3f) (conn f0-4f f1-4f) (conn f1-0f f0-0f) (conn f1-0f f1-1f) (conn f1-0f f2-0f) (conn f1-1f f0-1f) (conn f1-1f f1-0f) (conn f1-1f f1-2f) (conn f1-1f f2-1f) (conn f1-2f f0-2f) (conn f1-2f f1-1f) (conn f1-2f f1-3f) (conn f1-2f f2-2f) (conn f1-3f f0-3f) (conn f1-3f f1-2f) (conn f1-3f f1-4f) (conn f1-3f f2-3f) (conn f1-4f f0-4f) (conn f1-4f f1-3f) (conn f1-4f f2-4f) (conn f2-0f f1-0f) (conn f2-0f f2-1f) (conn f2-0f f3-0f) (conn f2-1f f1-1f) (conn f2-1f f2-0f) (conn f2-1f f2-2f) (conn f2-1f f3-1f) (conn f2-2f f1-2f) (conn f2-2f f2-1f) (conn f2-2f f2-3f) (conn f2-2f f3-2f) (conn f2-3f f1-3f) (conn f2-3f f2-2f) (conn f2-3f f2-4f) (conn f2-3f f3-3f) (conn f2-4f f1-4f) (conn f2-4f f2-3f) (conn f2-4f f3-4f) (conn f3-0f f2-0f) (conn f3-0f f3-1f) (conn f3-0f f4-0f) (conn f3-1f f2-1f) (conn f3-1f f3-0f) (conn f3-1f f3-2f) (conn f3-1f f4-1f) (conn f3-2f f2-2f) (conn f3-2f f3-1f) (conn f3-2f 
f3-3f) (conn f3-2f f4-2f) (conn f3-3f f2-3f) (conn f3-3f f3-2f) (conn f3-3f f3-4f) (conn f3-3f f4-3f) (conn f3-4f f2-4f) (conn f3-4f f3-3f) (conn f3-4f f4-4f) (conn f4-0f f3-0f) (conn f4-0f f4-1f) (conn f4-1f f3-1f) (conn f4-1f f4-0f) (conn f4-1f f4-2f) (conn f4-2f f3-2f) (conn f4-2f f4-1f) (conn f4-2f f4-3f) (conn f4-3f f3-3f) (conn f4-3f f4-2f) (conn f4-3f f4-4f) (conn f4-4f f3-4f) (conn f4-4f f4-3f) (key-shape key0-0 shape0) (key-shape key0-1 shape0) (lock-shape f2-0f shape0) (lock-shape f4-2f shape0) (locked f2-0f) (locked f4-2f) (open f0-0f) (open f0-1f) (open f0-2f) (open f0-3f) (open f0-4f) (open f1-0f) (open f1-1f) (open f1-2f) (open f1-3f) (open f1-4f) (open f2-1f) (open f2-2f) (open f2-3f) (open f2-4f) (open f3-0f) (open f3-1f) (open f3-2f) (open f3-3f) (open f3-4f) (open f4-0f) (open f4-1f) (open f4-3f) (open f4-4f))\n (:goal (and (at key0-0 f2-0f) (at key0-1 f1-3f)))\n)" + - context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. \nThere are 2 trucks and 1 airplane, as well as 4 packages. There are 4 locations across 2 cities. \nThe locations are in cities as follows: l1-1 and l1-0 are in c1; l0-1 and l0-0 are in c0. \nCurrently, t1 and p0 are at l1-1, p3, p2, and p1 are at l1-0, t0 is at l0-1, a0 is at l0-0. The goal is to reach a state where the following facts hold: p0 is at l0-0, p3 is at l0-1, p2 is at l1-0, and p1 is at l1-0. 
The available actions are: (load-truck ?obj ?truck ?loc) - load object ?obj into truck ?truck at location ?loc, (load-airplane ?obj ?airplane ?loc) - load the object ?obj from location ?loc into the airplane ?airplane, (unload-truck ?obj ?truck ?loc) - offload the object ?obj from the truck ?truck at location ?loc, (unload-airplane ?obj ?airplane ?loc) - remove the object ?obj from the airplane ?airplane and place it on the location ?loc, (drive-truck ?truck ?loc-from ?loc-to ?city) - navigate the truck ?truck which is in location ?loc-from in city ?city to another location ?loc-to in the same city, and (fly-airplane ?airplane ?loc-from ?loc-to) - fly airplane ?airplane from airport ?loc-from to airport ?loc-to." + question: "What is the first inapplicable action in the next sequence of actions: \"(drive-truck t0 l0-1 l0-0 c0) (fly-airplane a0 l0-0 l1-0) (load-airplane p3 a0 l1-0) (load-truck p0 t1 l1-1) (drive-truck t1 l1-1 l1-0 c1) (unload-truck p0 t1 l1-0) (load-airplane p0 a0 l1-0) (fly-airplane a0 l1-0 l0-0) (unload-airplane p0 a0 l0-0) (unload-airplane p3 a0 l0-0) (load-truck p3 t0 l0-0) (drive-truck t0 l0-0 l0-1 c0) (unload-airplane p3 a0 l0-0)\"?" 
+ answer: "12" + PDDL_domain: "(define (domain logistics-strips)\n (:requirements :strips :typing) \n\n (:types \n location locatable city - object \n package movable - locatable\n airport - location\n airplane truck - movable \n )\t\t\n \n (:predicates \t\n\t\t(at ?obj - locatable ?loc - location)\n\t\t(in ?obj1 - package ?obj2 - movable)\n\t\t(in-city ?obj - location ?city - city))\n\n\n(:action LOAD-TRUCK\n :parameters\n (?obj - package\n ?truck - truck\n ?loc - location)\n :precondition\n (and \n (at ?truck ?loc) (at ?obj ?loc))\n :effect\n (and (not (at ?obj ?loc)) (in ?obj ?truck)))\n\n(:action LOAD-AIRPLANE\n :parameters\n (?obj - package\n ?airplane - airplane\n ?loc - location)\n :precondition\n (and \n (at ?obj ?loc) (at ?airplane ?loc))\n :effect\n (and (not (at ?obj ?loc)) (in ?obj ?airplane)))\n\n\n\n(:action UNLOAD-TRUCK\n :parameters\n (?obj - package\n ?truck - truck\n ?loc - location)\n :precondition\n (and \n (at ?truck ?loc) (in ?obj ?truck))\n :effect\n (and (not (in ?obj ?truck)) (at ?obj ?loc)))\n\n(:action UNLOAD-AIRPLANE\n :parameters\n (?obj - package\n ?airplane - airplane\n ?loc - location)\n :precondition\n (and \n (in ?obj ?airplane) (at ?airplane ?loc))\n :effect\n (and (not (in ?obj ?airplane)) (at ?obj ?loc)))\n\n(:action DRIVE-TRUCK\n :parameters\n (?truck - truck\n ?loc-from - location\n ?loc-to - location\n ?city - city)\n :precondition\n (and \n (at ?truck ?loc-from)\n (in-city ?loc-from ?city)\n (in-city ?loc-to ?city))\n :effect\n (and (not (at ?truck ?loc-from)) (at ?truck ?loc-to)))\n\n(:action FLY-AIRPLANE\n :parameters\n (?airplane - airplane\n ?loc-from - airport\n ?loc-to - airport)\n :precondition\n (and \n\t(at ?airplane ?loc-from))\n :effect\n (and (not (at ?airplane ?loc-from)) (at ?airplane ?loc-to)))\n)" + PDDL_problem: "(define (problem logistics-c2-s2-p4-a1)\n (:domain logistics-strips)\n (:requirements :strips :typing)\n (:objects a0 - airplane l0-0 l1-0 - airport c0 c1 - city l0-1 l1-1 - location p0 p1 p2 p3 - 
package t0 t1 - truck)\n (:init (at a0 l0-0) (at p0 l1-1) (at p1 l1-0) (at p2 l1-0) (at p3 l1-0) (at t0 l0-1) (at t1 l1-1) (in-city l0-0 c0) (in-city l0-1 c0) (in-city l1-0 c1) (in-city l1-1 c1))\n (:goal (and (at p0 l0-0) (at p1 l1-0) (at p2 l1-0) (at p3 l0-1)))\n)" +doc_to_text: "# PDDL DOMAIN \n\n```\n{{PDDL_domain}}\n```\n\n# PDDL PROBLEM \n\n```\n{{PDDL_problem}}\n```\n\n**Question**: {{context}} {{question}} Provide only the index of the action. **Final Answer**:" +filter_list: + - name: "acp_grammar_parse" + filter: + - function: "ACP_grammar_filter" + grammar_task: "index" + - function: "take_first" diff --git a/lm_eval/tasks/afrimgsm/direct/afrimgsm.yaml b/lm_eval/tasks/afrimgsm/direct/afrimgsm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2316a748bd6f72f0c234544c016ddfd6b33fd9ff --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/afrimgsm.yaml @@ -0,0 +1,13 @@ +group: afrimgsm-irokobench +task: + - afrimgsm_tasks_prompt_1 + - afrimgsm_tasks_prompt_2 + - afrimgsm_tasks_prompt_3 + - afrimgsm_tasks_prompt_4 + - afrimgsm_tasks_prompt_5 +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 2 diff --git a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_amh.yaml b/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_amh.yaml deleted file mode 100644 index 04d0bdd67114f3c0887979fdce210f0fa94616e7..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_amh.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: amh -doc_to_target: '{% if answer is not none %}{{answer[15:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: direct_yaml -task: afrimgsm_direct_amh diff --git a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_eng.yaml 
b/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_eng.yaml deleted file mode 100644 index 5804270d4d0072764ca3d1190a75d7629bc251e9..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_eng.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: eng -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: direct_yaml -task: afrimgsm_direct_eng diff --git a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_ewe.yaml b/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_ewe.yaml deleted file mode 100644 index 4eae6fc4c790968040080aee824c345bd786db44..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_ewe.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: ewe -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: direct_yaml -task: afrimgsm_direct_ewe diff --git a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_fra.yaml b/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_fra.yaml deleted file mode 100644 index 16aeacf2c54706a18165bd1230ee812bb080ceb8..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_fra.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: fra -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: 
"+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: direct_yaml -task: afrimgsm_direct_fra diff --git a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_hau.yaml b/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_hau.yaml deleted file mode 100644 index 3a6668e989af297b60b1aafd53a3cb44e3936a60..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_hau.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: hau -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: direct_yaml -task: afrimgsm_direct_hau diff --git a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_ibo.yaml b/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_ibo.yaml deleted file mode 100644 index ab79986a5dec2af92711a675b3a4d79b31b044a9..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_ibo.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: ibo -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: direct_yaml -task: afrimgsm_direct_ibo diff --git a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_kin.yaml b/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_kin.yaml deleted file mode 100644 index d4c9c75af0ccfc6d2b0b18138dec074e10b6047e..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_kin.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by 
utils.py -dataset_name: kin -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: direct_yaml -task: afrimgsm_direct_kin diff --git a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_lin.yaml b/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_lin.yaml deleted file mode 100644 index 7136d7370cfd8f9e35b4ebc5e0615330b84edddc..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_lin.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: lin -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: direct_yaml -task: afrimgsm_direct_lin diff --git a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_lug.yaml b/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_lug.yaml deleted file mode 100644 index 03fc0c2884cf9d14cadcf583cce1e81c47938963..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_lug.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: lug -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: direct_yaml -task: afrimgsm_direct_lug diff --git a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_orm.yaml b/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_orm.yaml 
deleted file mode 100644 index 49d7e93390dc5c63ce83364ea1ec8ede77537ea8..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_orm.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: orm -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: direct_yaml -task: afrimgsm_direct_orm diff --git a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_sna.yaml b/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_sna.yaml deleted file mode 100644 index a61de85a3ffbbd5c2f3e91d5f26eb63a6241d78c..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_sna.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: sna -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: direct_yaml -task: afrimgsm_direct_sna diff --git a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_sot.yaml b/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_sot.yaml deleted file mode 100644 index 455c1adcc5b896ce2c2140c9f30e8fa1857e60a2..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_sot.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: sot -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false 
- until: - - 'Question:' - - - - <|im_end|> -include: direct_yaml -task: afrimgsm_direct_sot diff --git a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_swa.yaml b/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_swa.yaml deleted file mode 100644 index 462ddfd378f8c02a872780a8013f0f74378551e0..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_swa.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: swa -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: direct_yaml -task: afrimgsm_direct_swa diff --git a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_twi.yaml b/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_twi.yaml deleted file mode 100644 index 8c4673b7ba00668d5d3bdcacfd2e00f342362194..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_twi.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: twi -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: direct_yaml -task: afrimgsm_direct_twi diff --git a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_wol.yaml b/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_wol.yaml deleted file mode 100644 index 08a8e030a4c0c0d444ac464b974d9886e434ff43..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_wol.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: wol -doc_to_target: '{% if answer is not none 
%}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: direct_yaml -task: afrimgsm_direct_wol diff --git a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_xho.yaml b/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_xho.yaml deleted file mode 100644 index 2103d182f3ca1703c43e03279a6d1aa9bcc9532d..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_xho.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: xho -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: direct_yaml -task: afrimgsm_direct_xho diff --git a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_yor.yaml b/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_yor.yaml deleted file mode 100644 index aa084c32a645cab532b002565f3c8a324708d6ba..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_yor.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: yor -doc_to_target: '{% if answer is not none %}{{answer[16:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: direct_yaml -task: afrimgsm_direct_yor diff --git a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_zul.yaml b/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_zul.yaml deleted file mode 100644 index 
dcffb6944658282d620f7dbcec9d6513bcaf36c5..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/direct/afrimgsm_direct_zul.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: zul -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: direct_yaml -task: afrimgsm_direct_zul diff --git a/lm_eval/tasks/afrimgsm/direct/direct_yaml b/lm_eval/tasks/afrimgsm/direct/direct_yaml deleted file mode 100644 index f9819fe6f8470e37e73b3f3bc7d6b5cf8147a290..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/direct/direct_yaml +++ /dev/null @@ -1,37 +0,0 @@ -# This file will be included in the generated language-specific task configs. -# It doesn't have a yaml file extension as it is not meant to be imported directly -# by the harness. -tag: - - afrimgsm - - afrimgsm_direct -dataset_path: masakhane/afrimgsm -dataset_name: null # Overridden by language-specific config. 
-output_type: generate_until -# training_split: train -test_split: test -target_delimiter: "" -generation_kwargs: - until: - - "\n\n" - - "\n" - do_sample: false - temperature: 0.0 -filter_list: - - name: remove_whitespace - filter: - - function: remove_whitespace - - function: take_first - - filter: - - function: regex - group_select: -1 - regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+) - - function: take_first - name: flexible-extract -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true -metadata: - version: 2.0 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_amh.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..23007e3657c85b3b42ac5591180096c54740a240 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: amh +include: afrimgsm_yaml +task: afrimgsm_amh_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_eng.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1d5694225089b96dfeb06d331482bafa821cede8 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_eng.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eng +include: afrimgsm_yaml +task: afrimgsm_eng_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_ewe.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..68980400de696f30ce325ff260b5f8cefd5d95dc --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_ewe.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ewe +include: afrimgsm_yaml +task: afrimgsm_ewe_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_fra.yaml 
b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..04d57dbd329cedd535476cd7980ac5a201d84847 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_fra.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fra +include: afrimgsm_yaml +task: afrimgsm_fra_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_hau.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aef377d292006471eecc551eb414ced3c751eaa5 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hau +include: afrimgsm_yaml +task: afrimgsm_hau_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_ibo.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bbbb7ef859249191cdf52db9bdab6135319e7a60 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ibo +include: afrimgsm_yaml +task: afrimgsm_ibo_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_kin.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dcfc7160d7b3262ebb78b30a4fa070f748e1e619 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kin +include: afrimgsm_yaml +task: afrimgsm_kin_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_lin.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..be6a24ceb1cdec606a11b31865f75a8ef5188b4a --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_lin.yaml @@ -0,0 +1,4 @@ +# Generated by 
utils.py +dataset_name: lin +include: afrimgsm_yaml +task: afrimgsm_lin_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_lug.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4bffe69f252d7edda8e146a6f61b81dc0bd550c5 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_lug.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lug +include: afrimgsm_yaml +task: afrimgsm_lug_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_orm.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b916281cee9217fafbfdfe62a9c451ef918b36d2 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_orm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: orm +include: afrimgsm_yaml +task: afrimgsm_orm_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_sna.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1afa6bb3a5455f405d9735b933844a0974ec0899 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_sna.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sna +include: afrimgsm_yaml +task: afrimgsm_sna_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_sot.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f6cd4cdcd7e4860f73fa0bbdffe16ce02b2d2234 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_sot.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sot +include: afrimgsm_yaml +task: afrimgsm_sot_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_swa.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_swa.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..be6dba7151542a8b6ecb4e5cb1da18ab0d9121a3 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swa +include: afrimgsm_yaml +task: afrimgsm_swa_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_twi.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a82235d7883d0fb0b23e92129ed373d2503a31e1 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: twi +include: afrimgsm_yaml +task: afrimgsm_twi_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_vai.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_vai.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e04d28f0d33f1da1d6282431c8d4e1655a1f175b --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_vai.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: vai +include: afrimgsm_yaml +task: afrimgsm_vai_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_wol.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..34b773765f90db8695611663bf615744fd6cfaa8 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_wol.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: wol +include: afrimgsm_yaml +task: afrimgsm_wol_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_xho.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d17530bd22284dfd33556f043fb2c18d0325a174 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_xho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: xho +include: afrimgsm_yaml +task: afrimgsm_xho_prompt_1 diff --git 
a/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_yaml b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_yaml new file mode 100644 index 0000000000000000000000000000000000000000..19d4f7d1fdf39118e2fc774097619b837d34122e --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_yaml @@ -0,0 +1,35 @@ +tag: + - afrimgsm_tasks + - afrimgsm_tasks_prompt_1 +dataset_path: masakhane/afrimgsm +dataset_name: null # Overridden by language-specific config. +output_type: generate_until +test_split: test +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +target_delimiter: "" +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +filter_list: + - name: remove_whitespace + filter: + - function: remove_whitespace + - function: take_first + - filter: + - function: regex + group_select: -1 + regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+) + - function: take_first + name: flexible-extract +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 2.0 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_yor.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ba89f9ace0bf2aff3e467e021408d6de790ddd0f --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: yor +include: afrimgsm_yaml +task: afrimgsm_yor_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_zul.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..07b89135ac35c6c3f67df44ff3004e5aeab4197e --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_1/afrimgsm_zul.yaml 
@@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: zul +include: afrimgsm_yaml +task: afrimgsm_zul_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_amh.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac0812c1181c50fc51457b65f2cbeb8f64b5a78d --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: amh +include: afrimgsm_yaml +task: afrimgsm_amh_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_eng.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..940000376a025ef42c7352c2db416b2fe1a9b38f --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_eng.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eng +include: afrimgsm_yaml +task: afrimgsm_eng_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_ewe.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d0cd4926120ab7de80c13ba9b13bd327c81866bb --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_ewe.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ewe +include: afrimgsm_yaml +task: afrimgsm_ewe_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_fra.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2d709e9ccfdb0251096156eecea495576aca1c13 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_fra.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fra +include: afrimgsm_yaml +task: afrimgsm_fra_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_hau.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_hau.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..3e7d62ab615e7776f714f717d3b0f246782c8f21 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hau +include: afrimgsm_yaml +task: afrimgsm_hau_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_ibo.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fce1d51899bd307d460f312cbc60c8820295db6f --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ibo +include: afrimgsm_yaml +task: afrimgsm_ibo_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_kin.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9c7b65251577ccb7bfa1fe74ddbe06114247dae5 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kin +include: afrimgsm_yaml +task: afrimgsm_kin_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_lin.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..71594885ab6bb615f95c4a0a4f17d24ba41bbd1d --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_lin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lin +include: afrimgsm_yaml +task: afrimgsm_lin_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_lug.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cf4633745e06b7af03cf890f7d2b9426179bbdbc --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_lug.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lug +include: afrimgsm_yaml +task: afrimgsm_lug_prompt_2 diff --git 
a/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_orm.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e3d9d96ed900fb7583da0b676616f069ec9bf5ee --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_orm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: orm +include: afrimgsm_yaml +task: afrimgsm_orm_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_sna.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8e27f832a3b24b39d7515d657c01c54587fa3ac7 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_sna.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sna +include: afrimgsm_yaml +task: afrimgsm_sna_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_sot.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5ad77562229ec921e07ebcfe8a35ce9c7b072d99 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_sot.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sot +include: afrimgsm_yaml +task: afrimgsm_sot_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_swa.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fea74a3d1bd1bad4946491457ef689d953b8f66a --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swa +include: afrimgsm_yaml +task: afrimgsm_swa_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_twi.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..311639a12c103194dec5ccc6894bb175ac67cc26 --- /dev/null +++ 
b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: twi +include: afrimgsm_yaml +task: afrimgsm_twi_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_vai.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_vai.yaml new file mode 100644 index 0000000000000000000000000000000000000000..655b23dec64404bb271b558726ef5f279096f4c0 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_vai.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: vai +include: afrimgsm_yaml +task: afrimgsm_vai_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_wol.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..493551623d4071cfebbf59f0a179f487a0f46866 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_wol.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: wol +include: afrimgsm_yaml +task: afrimgsm_wol_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_xho.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1c076be5a556970703a4f1581700d1c0b2a3217d --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_xho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: xho +include: afrimgsm_yaml +task: afrimgsm_xho_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_yaml b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_yaml new file mode 100644 index 0000000000000000000000000000000000000000..2eaceade5342296cc43a1d343ee9af582792922e --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_yaml @@ -0,0 +1,34 @@ +tag: + - afrimgsm_tasks + - afrimgsm_tasks_prompt_1 +dataset_path: masakhane/afrimgsm +output_type: generate_until +test_split: test +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else 
%}{{answer_number|string}}{% endif %}' +doc_to_text: "Give direct numerical answers for the question provided. \n\nQuestion: {{question}} \nAnswer: " +target_delimiter: "" +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +filter_list: + - name: remove_whitespace + filter: + - function: remove_whitespace + - function: take_first + - filter: + - function: regex + group_select: -1 + regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+) + - function: take_first + name: flexible-extract +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 2.0 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_yor.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6faf98bdd2ebe70aa1fbe7a0bc955eedce4c726a --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: yor +include: afrimgsm_yaml +task: afrimgsm_yor_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_zul.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0b340608c1289ef7a6257cec74ea0956039d12a6 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_zul.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: zul +include: afrimgsm_yaml +task: afrimgsm_zul_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_amh.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eb3ed8309c481abfa483761fefa9f8792f9a600a --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: amh +include: afrimgsm_yaml +task: afrimgsm_amh_prompt_3 diff --git 
a/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_eng.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..87efd748800c67b8703318535649280952b21208 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_eng.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eng +include: afrimgsm_yaml +task: afrimgsm_eng_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_ewe.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4636069a5a6e9291fd071b1221c87927f895264f --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_ewe.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ewe +include: afrimgsm_yaml +task: afrimgsm_ewe_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_fra.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b1faf85c8be5594d03e73c57fcea2fcfba1508a9 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_fra.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fra +include: afrimgsm_yaml +task: afrimgsm_fra_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_hau.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c0cfcbe44e862238c16a5c12feadbedca447a3ee --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hau +include: afrimgsm_yaml +task: afrimgsm_hau_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_ibo.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fc451b94feac4afc4ba0405ee9643e0ab6790080 --- /dev/null +++ 
b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ibo +include: afrimgsm_yaml +task: afrimgsm_ibo_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_kin.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f01edaae271f79a4f3f9e6ce444e62af86a85032 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kin +include: afrimgsm_yaml +task: afrimgsm_kin_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_lin.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..86ccf1e44be09e34aa5b597f1db171a1c6b7f9a5 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_lin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lin +include: afrimgsm_yaml +task: afrimgsm_lin_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_lug.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ff6b6de36aa85f43da7c26b0bccf33b92a4f5d9a --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_lug.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lug +include: afrimgsm_yaml +task: afrimgsm_lug_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_orm.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8a5ff414f680b347e5d9b7f9504be35b968b4d7d --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_orm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: orm +include: afrimgsm_yaml +task: afrimgsm_orm_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_sna.yaml 
b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4000b9dc41fa79e16f673db2f54fa31d77dded02 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_sna.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sna +include: afrimgsm_yaml +task: afrimgsm_sna_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_sot.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a9941fce3105d32d2610c4e41b938360ac8b961c --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_sot.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sot +include: afrimgsm_yaml +task: afrimgsm_sot_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_swa.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..abd8358024d6cc567f6044b7e0dfa69cd9d1a5eb --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swa +include: afrimgsm_yaml +task: afrimgsm_swa_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_twi.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..12652498abab497f610192e3dd6d377110e913d4 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: twi +include: afrimgsm_yaml +task: afrimgsm_twi_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_vai.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_vai.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aae0b5f51a347f8b52841b0e0fc49197a8c8490b --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_vai.yaml @@ -0,0 +1,4 @@ +# Generated by 
utils.py +dataset_name: vai +include: afrimgsm_yaml +task: afrimgsm_vai_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_wol.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a8fd58b055584229c5143e5cc862c250a2b3e0b --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_wol.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: wol +include: afrimgsm_yaml +task: afrimgsm_wol_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_xho.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..067e88d709cd0f06c217d6823ae2cd99ebd5f003 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_xho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: xho +include: afrimgsm_yaml +task: afrimgsm_xho_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_yaml b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_yaml new file mode 100644 index 0000000000000000000000000000000000000000..8dd3f5ca74a7c6d4dcba9daab9d8c7653b9c8e6e --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_yaml @@ -0,0 +1,34 @@ +tag: + - afrimgsm_tasks + - afrimgsm_tasks_prompt_3 +dataset_path: masakhane/afrimgsm +output_type: generate_until +test_split: test +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: "Solve the following math question \n\nQuestion: {{question}} \nAnswer: " +target_delimiter: "" +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +filter_list: + - name: remove_whitespace + filter: + - function: remove_whitespace + - function: take_first + - filter: + - function: regex + group_select: -1 + regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+) + - function: take_first + name: flexible-extract +metric_list: + - metric: exact_match + 
aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 2.0 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_yor.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2d64481065491583c980a83d9ea40507b8830552 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: yor +include: afrimgsm_yaml +task: afrimgsm_yor_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_zul.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4b94a2c13dae19e6ed50ab49a321b1f73d326a97 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_zul.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: zul +include: afrimgsm_yaml +task: afrimgsm_zul_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_amh.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..00e45eb4d433718f0ee36896297a20e3aced9dc6 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_amh.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. 
\n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_amh_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_eng.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0e3abef526bb0571d2a6fbac07a2abeff6acb3a8 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_eng.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. \n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_eng_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_ewe.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cda1994b4dd2b18fb18e0200342a58c1e42d47b8 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_ewe.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. \n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_ewe_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_fra.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b7e5c155519d43922d3391bfe903ab8720d9ddbf --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_fra.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: fra +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. 
\n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_fra_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_hau.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e6ad1a00dce65e79adac55593e53e9683b48decd --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_hau.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. \n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_hau_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_ibo.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7d214574ade50c17d374e7daad4b4a8e1670db78 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_ibo.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. \n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_ibo_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_kin.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a2c7cb9ffe60577d49d6aa87c191e0fb81c974ac --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_kin.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. 
\n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_kin_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_lin.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..44d2d2f39bc83011268c1bcffaa4ea8564e3b5a4 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_lin.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lin +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. \n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_lin_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_lug.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d8a12945fd73da0abe42bc7504ec5a861ef86a8d --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_lug.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. \n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_lug_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_orm.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dcf7123f8e45f9d5d32ffc97f5038c29d946958d --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_orm.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. 
\n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_orm_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_sna.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3d3414e3ca1273567847599c244e266e66e495dc --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_sna.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. \n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_sna_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_sot.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac0017a36b03a55102fab40f5c6a3cb2d6fffa84 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_sot.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sot +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. \n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_sot_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_swa.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..043311d3e634759e7b08892126dfc64b94e39310 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_swa.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. 
\n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_swa_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_twi.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f3e030d0bc072ba50afa51a988ee0fa4d37f8e7f --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_twi.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. \n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_twi_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_vai.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_vai.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2dbe04c16eab6ea207a384c01860cc2555ffb6bb --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_vai.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: vai +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. \n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_vai_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_wol.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..55f546ab101fb6ee26158d5803fe1ad80f3cb8e4 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_wol.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. 
\n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_wol_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_xho.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a759549811b8e6f009ed49c7b4bd16bc46163d8f --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_xho.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. \n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_xho_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_yaml b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_yaml new file mode 100644 index 0000000000000000000000000000000000000000..5f34f774e9bc1cf7fb925eac9fd55b123389b738 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_yaml @@ -0,0 +1,33 @@ +tag: + - afrimgsm_tasks + - afrimgsm_tasks_prompt_4 +dataset_path: masakhane/afrimgsm +output_type: generate_until +test_split: test +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +target_delimiter: "" +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +filter_list: + - name: remove_whitespace + filter: + - function: remove_whitespace + - function: take_first + - filter: + - function: regex + group_select: -1 + regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+) + - function: take_first + name: flexible-extract +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 2.0 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_yor.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_yor.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..fd4d01bbf4f5ddb22b8057f3ffd99996c82b92d1 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_yor.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. \n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_yor_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_zul.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5151f266e673a0106c64c548f53d2b2df18f7896 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_zul.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. \n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_zul_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_amh.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e7ad215fac134e19c62187205f141a73d1978e22 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_amh.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: "For mathematical questions provided in Amharic language. Supply the\ + \ accurate numeric answer to the provided question. 
\n\nQuestion: {{question}} \n\ + Answer: " +include: afrimgsm_yaml +task: afrimgsm_amh_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_eng.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a4de5e95948fd28d990f224c9cb0f6e82b6f5cf5 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_eng.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng +doc_to_text: "For mathematical questions provided in English language. Supply the\ + \ accurate numeric answer to the provided question. \n\nQuestion: {{question}} \n\ + Answer: " +include: afrimgsm_yaml +task: afrimgsm_eng_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_ewe.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bdb3c4f278e1e31e5c20cb399df2a4a374aaa253 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_ewe.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_text: "For mathematical questions provided in Ewe language. Supply the accurate\ + \ numeric answer to the provided question. \n\nQuestion: {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_ewe_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_fra.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a93e79ec90099beb8c50baac1503bf824cb9bcda --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_fra.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: fra +doc_to_text: "For mathematical questions provided in French language. Supply the accurate\ + \ numeric answer to the provided question. 
\n\nQuestion: {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_fra_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_hau.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..74fabdee06aa0669ada4ba5fe46bd9a6ad6bdd32 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_hau.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: "For mathematical questions provided in Hausa language. Supply the accurate\ + \ numeric answer to the provided question. \n\nQuestion: {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_hau_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_ibo.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c2dd77f238af5a67eccad3e05f6add96c3d1f321 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_ibo.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: "For mathematical questions provided in Igbo language. Supply the accurate\ + \ numeric answer to the provided question. \n\nQuestion: {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_ibo_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_kin.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ba3f3c2f2b264b333a6fe1903c6f84d6d6000692 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_kin.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: "For mathematical questions provided in Kinyarwanda language. Supply\ + \ the accurate numeric answer to the provided question. 
\n\nQuestion: {{question}}\ + \ \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_kin_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_lin.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..74131916ab079d9c1bee95d169517661e4d11b31 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_lin.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lin +doc_to_text: "For mathematical questions provided in Lingala language. Supply the\ + \ accurate numeric answer to the provided question. \n\nQuestion: {{question}} \n\ + Answer: " +include: afrimgsm_yaml +task: afrimgsm_lin_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_lug.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b92bc4e6574ac7df4aad89c143687747ed5d9863 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_lug.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: "For mathematical questions provided in Luganda language. Supply the\ + \ accurate numeric answer to the provided question. \n\nQuestion: {{question}} \n\ + Answer: " +include: afrimgsm_yaml +task: afrimgsm_lug_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_orm.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0c33dd44cc0019f3485ca72368c767b9161ec983 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_orm.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: "For mathematical questions provided in Oromo language. Supply the accurate\ + \ numeric answer to the provided question. 
\n\nQuestion: {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_orm_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_sna.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a2518c36981f50157890029d3fb49d596ea688b6 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_sna.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: "For mathematical questions provided in chiShona language. Supply the\ + \ accurate numeric answer to the provided question. \n\nQuestion: {{question}} \n\ + Answer: " +include: afrimgsm_yaml +task: afrimgsm_sna_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_sot.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..06cb1b05569674070ee6915f9acf6bd32c6ed72a --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_sot.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sot +doc_to_text: "For mathematical questions provided in Sesotho language. Supply the\ + \ accurate numeric answer to the provided question. \n\nQuestion: {{question}} \n\ + Answer: " +include: afrimgsm_yaml +task: afrimgsm_sot_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_swa.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0a08c8e3a98efcc19a3f48ec4dc46d220fd7a9ab --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_swa.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: "For mathematical questions provided in Swahili language. Supply the\ + \ accurate numeric answer to the provided question. 
\n\nQuestion: {{question}} \n\ + Answer: " +include: afrimgsm_yaml +task: afrimgsm_swa_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_twi.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..54de3ce507d4f5f4b0f098ec8d9a71e37fb75fc4 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_twi.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: "For mathematical questions provided in Twi language. Supply the accurate\ + \ numeric answer to the provided question. \n\nQuestion: {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_twi_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_vai.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_vai.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ab3337a7eaf4f7f3a4d06945fd56b78b7d74763f --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_vai.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: vai +doc_to_text: "For mathematical questions provided in Vai language. Supply the accurate\ + \ numeric answer to the provided question. \n\nQuestion: {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_vai_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_wol.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4e711e8efa4b2e854d9924f953dce931ea3a0ba1 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_wol.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: "For mathematical questions provided in Wolof language. Supply the accurate\ + \ numeric answer to the provided question. 
\n\nQuestion: {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_wol_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_xho.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..728cacf881aef3014669a43c2ffc5316664decad --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_xho.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: "For mathematical questions provided in isiXhosa language. Supply the\ + \ accurate numeric answer to the provided question. \n\nQuestion: {{question}} \n\ + Answer: " +include: afrimgsm_yaml +task: afrimgsm_xho_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_yaml b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_yaml new file mode 100644 index 0000000000000000000000000000000000000000..ca8bb03f88f2a98928fd21794753f0ff990bb309 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_yaml @@ -0,0 +1,33 @@ +tag: + - afrimgsm_tasks + - afrimgsm_tasks_prompt_5 +dataset_path: masakhane/afrimgsm +output_type: generate_until +test_split: test +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +target_delimiter: "" +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +filter_list: + - name: remove_whitespace + filter: + - function: remove_whitespace + - function: take_first + - filter: + - function: regex + group_select: -1 + regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+) + - function: take_first + name: flexible-extract +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 2.0 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_yor.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_yor.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..cd0bea64173d14db8bc6d472bb685ee9f7b420a8 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_yor.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: "For mathematical questions provided in Yoruba language. Supply the accurate\ + \ numeric answer to the provided question. \n\nQuestion: {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_yor_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_zul.yaml b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fb8474e31f328120b855c6e45d5340de8bd6c266 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_zul.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: "For mathematical questions provided in Zulu language. Supply the accurate\ + \ numeric answer to the provided question. \n\nQuestion: {{question}} \nAnswer: " +include: afrimgsm_yaml +task: afrimgsm_zul_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/afrimgsm_cot.yaml b/lm_eval/tasks/afrimgsm/direct_cot/afrimgsm_cot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d07832b4ddb9d7b1306e36cc5c39f11ee840661b --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/afrimgsm_cot.yaml @@ -0,0 +1,9 @@ +group: afrimgsm_cot-irokobench +task: + - afrimgsm_cot_tasks +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 2 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_amh.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c9f0d9311aa462f2f4c3e9d38c5d0407ae7b72d7 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: amh +include: afrimgsm_cot_yaml +task: afrimgsm_cot_amh_prompt_1 
diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_eng.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..57c0e564b1b594fe21203a0d73737301167b63cb --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_eng.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eng +include: afrimgsm_cot_yaml +task: afrimgsm_cot_eng_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_ewe.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..55fdff7c365f895a4835b6234a7027a28b7c42a3 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_ewe.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ewe +include: afrimgsm_cot_yaml +task: afrimgsm_cot_ewe_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_fra.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..717a45d98beabb763984f030350252b6d3424747 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_fra.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fra +include: afrimgsm_cot_yaml +task: afrimgsm_cot_fra_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_hau.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f42e0ee559e562a934c3151cc9fd535442c6cf28 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hau +include: afrimgsm_cot_yaml +task: afrimgsm_cot_hau_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_ibo.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_ibo.yaml new file 
mode 100644 index 0000000000000000000000000000000000000000..dfabc3e319236dd2112ab74bbb5d1181f63d8d55 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ibo +include: afrimgsm_cot_yaml +task: afrimgsm_cot_ibo_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_kin.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..55d20e011aa15f290f292411604391e2c761053f --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kin +include: afrimgsm_cot_yaml +task: afrimgsm_cot_kin_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_lin.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ecbf38d859c25896eb039d87ed9b36a4797a77de --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_lin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lin +include: afrimgsm_cot_yaml +task: afrimgsm_cot_lin_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_lug.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..033cbce071fed9d337fecfc002b3988671cafe9a --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_lug.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lug +include: afrimgsm_cot_yaml +task: afrimgsm_cot_lug_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_orm.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7ce25aee61aa076b7c250c00a3a49f552c82f9d3 --- /dev/null +++ 
b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_orm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: orm +include: afrimgsm_cot_yaml +task: afrimgsm_cot_orm_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_sna.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fae029f19280d7bf4f0928a63022c6deeae09f6d --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_sna.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sna +include: afrimgsm_cot_yaml +task: afrimgsm_cot_sna_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_sot.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d0d1207791575accf84f4f6839e8b41f688ba0d9 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_sot.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sot +include: afrimgsm_cot_yaml +task: afrimgsm_cot_sot_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_swa.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..da4e39cfc0a7b28c341ba8557971bcedd944e9d4 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swa +include: afrimgsm_cot_yaml +task: afrimgsm_cot_swa_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_twi.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..24f553497c41bb4c42b99d9ba33abc21d42ebf15 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: twi +include: afrimgsm_cot_yaml 
+task: afrimgsm_cot_twi_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_vai.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_vai.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cc63717012dd2df5a3965b352363ac9a2457372f --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_vai.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: vai +include: afrimgsm_cot_yaml +task: afrimgsm_cot_vai_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_wol.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c86b09d6a9961af49a771afd9ef62f991edc8084 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_wol.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: wol +include: afrimgsm_cot_yaml +task: afrimgsm_cot_wol_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_xho.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6f03080c3c6c986c760578e766e8d23c2e58a17f --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_xho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: xho +include: afrimgsm_cot_yaml +task: afrimgsm_cot_xho_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_yaml new file mode 100644 index 0000000000000000000000000000000000000000..6ab733bf114ba32013ab433ca74a1f05b66f8a78 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_yaml @@ -0,0 +1,37 @@ +tag: + - afrimgsm_cot_tasks + - afrimgsm_cot_tasks_prompt_1 +dataset_path: masakhane/afrimgsm +dataset_name: null # Overridden by language-specific config. 
+output_type: generate_until +training_split: train +test_split: test +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> + - <|eot_id|> +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +filter_list: + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)" + - function: "take_first" + - filter: + - function: regex + group_select: -1 + regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+) + - function: take_first + name: flexible-extract +metadata: + version: 2.0 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_yor.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c6858ab21d02d14d797ce3704c940fd627feb2be --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: yor +include: afrimgsm_cot_yaml +task: afrimgsm_cot_yor_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_zul.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5cacc2be5c426dde5bdfeb5dc5e3e62b20e494b8 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_1/afrimgsm_cot_zul.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: zul +include: afrimgsm_cot_yaml +task: afrimgsm_cot_zul_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_amh.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_amh.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..6d5d43fb8562da74b469cc15636aaac35c082ef2 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: amh +include: afrimgsm_cot_yaml +task: afrimgsm_cot_amh_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_eng.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..84a6b26dd7d5f17dc76f2bc5c28bfa05d0936805 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_eng.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eng +include: afrimgsm_cot_yaml +task: afrimgsm_cot_eng_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_ewe.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e7ef2907f54397390ff29aa44612d6a24a3a9380 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_ewe.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ewe +include: afrimgsm_cot_yaml +task: afrimgsm_cot_ewe_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_fra.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..987ac630ee57139c93588b86dc8ae53bbd6c466e --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_fra.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fra +include: afrimgsm_cot_yaml +task: afrimgsm_cot_fra_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_hau.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..488f693a60c4a31cec8adf1e2665cf7cd6430b18 --- /dev/null +++ 
b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hau +include: afrimgsm_cot_yaml +task: afrimgsm_cot_hau_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_ibo.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aefa0aa229981bd07b9e221a66d4ebce08c0f086 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ibo +include: afrimgsm_cot_yaml +task: afrimgsm_cot_ibo_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_kin.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e183dcd48f80b358ed54bdde2fd8d9e3ac64a9ba --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kin +include: afrimgsm_cot_yaml +task: afrimgsm_cot_kin_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_lin.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..840a99acfacba2dacfe8c2883fed6445373f72e7 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_lin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lin +include: afrimgsm_cot_yaml +task: afrimgsm_cot_lin_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_lug.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..75e8b89276ae9db785fbc75cf403bffb6c6a5f1c --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_lug.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lug +include: afrimgsm_cot_yaml 
+task: afrimgsm_cot_lug_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_orm.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a36d89355bc25c75f7097c371ed685c1f05a8290 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_orm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: orm +include: afrimgsm_cot_yaml +task: afrimgsm_cot_orm_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_sna.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..25187ceccadcbb09e0a8bf417d5b2ae4b4572b0d --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_sna.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sna +include: afrimgsm_cot_yaml +task: afrimgsm_cot_sna_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_sot.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..22fc718bc20db26210333151faf22c5a5c0bbef7 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_sot.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sot +include: afrimgsm_cot_yaml +task: afrimgsm_cot_sot_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_swa.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b0d91d0c9e3cebf8352c589c4ab6ac7a30183129 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swa +include: afrimgsm_cot_yaml +task: afrimgsm_cot_swa_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_twi.yaml 
b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..03b59a394366bca2b5241bd3a096b7b274354cad --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: twi +include: afrimgsm_cot_yaml +task: afrimgsm_cot_twi_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_vai.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_vai.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8fa4cf5e2db3121bae1a296472cefecc34c31ceb --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_vai.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: vai +include: afrimgsm_cot_yaml +task: afrimgsm_cot_vai_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_wol.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2611de84f42fd9a0804cb155f94b687bab875a81 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_wol.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: wol +include: afrimgsm_cot_yaml +task: afrimgsm_cot_wol_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_xho.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..33059776d4a77e62c6a0478efa64ecd8da5c6a0f --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_xho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: xho +include: afrimgsm_cot_yaml +task: afrimgsm_cot_xho_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_yaml new file mode 100644 index 
0000000000000000000000000000000000000000..505336ba01a57df47f10104bedb8af7288e4d98d --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_yaml @@ -0,0 +1,37 @@ +tag: + - afrimgsm_cot_tasks + - afrimgsm_cot_tasks_prompt_2 +dataset_path: masakhane/afrimgsm +dataset_name: null # Overridden by language-specific config. +output_type: generate_until +training_split: train +test_split: test +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: 'Give direct numerical answers for the question provided. \n\nQuestion: {{question}} \Step-by-Step Answer: ' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> + - <|eot_id|> +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +filter_list: + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)" + - function: "take_first" + - filter: + - function: regex + group_select: -1 + regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+) + - function: take_first + name: flexible-extract +metadata: + version: 2.0 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_yor.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..991297c4dd40aaeab196f45fbf6dcb6521432c25 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: yor +include: afrimgsm_cot_yaml +task: afrimgsm_cot_yor_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_zul.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..833edbb1a704f59fc39000ccb8447fec46d55849 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_2/afrimgsm_cot_zul.yaml @@ -0,0 
+1,4 @@ +# Generated by utils.py +dataset_name: zul +include: afrimgsm_cot_yaml +task: afrimgsm_cot_zul_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_amh.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..00f830a20eb0e580600868cd88ca9d25231352c1 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: amh +include: afrimgsm_cot_yaml +task: afrimgsm_cot_amh_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_eng.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ea0937f233d75fea06dde81274381f2085a4416e --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_eng.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eng +include: afrimgsm_cot_yaml +task: afrimgsm_cot_eng_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_ewe.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dfe111d7fc2b3ebc91463719305e734c02a2360f --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_ewe.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ewe +include: afrimgsm_cot_yaml +task: afrimgsm_cot_ewe_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_fra.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eb82d3a44416adc5abb640d460d49c428e71f1bf --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_fra.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fra +include: afrimgsm_cot_yaml +task: afrimgsm_cot_fra_prompt_3 diff --git 
a/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_hau.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3162114b1eeb9a931ad6c006636fa8d57605c272 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hau +include: afrimgsm_cot_yaml +task: afrimgsm_cot_hau_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_ibo.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f46191a331e0498b6a93cdda5f51750b42702423 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ibo +include: afrimgsm_cot_yaml +task: afrimgsm_cot_ibo_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_kin.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8ddc82ee85066be8890b863bc042bc66cc44316e --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kin +include: afrimgsm_cot_yaml +task: afrimgsm_cot_kin_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_lin.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..769ae73aa82813832af62ccf69a1e37e0bf688ae --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_lin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lin +include: afrimgsm_cot_yaml +task: afrimgsm_cot_lin_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_lug.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_lug.yaml new file mode 100644 
index 0000000000000000000000000000000000000000..e04769a6c77dc035b03e48dfffb6fc2ef90081b6 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_lug.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lug +include: afrimgsm_cot_yaml +task: afrimgsm_cot_lug_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_orm.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..79a696581beac371a7afbf71e918d6e6f43263c5 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_orm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: orm +include: afrimgsm_cot_yaml +task: afrimgsm_cot_orm_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_sna.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f08d44259f104d9b72c5ed732c0dbb6938524703 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_sna.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sna +include: afrimgsm_cot_yaml +task: afrimgsm_cot_sna_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_sot.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..76501f53d90f79d70f86ec52b8cbe3d329db7b7e --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_sot.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sot +include: afrimgsm_cot_yaml +task: afrimgsm_cot_sot_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_swa.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..76ea5f96ae6a2618b54a1009684d8e632c9ebd0e --- /dev/null +++ 
b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swa +include: afrimgsm_cot_yaml +task: afrimgsm_cot_swa_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_twi.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c45b3f0fcfab17390f077b2d161f76f097dfda33 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: twi +include: afrimgsm_cot_yaml +task: afrimgsm_cot_twi_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_vai.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_vai.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ca50c481fd6c8470bc6d61e012969033cbf2bcfb --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_vai.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: vai +include: afrimgsm_cot_yaml +task: afrimgsm_cot_vai_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_wol.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..16dbc506ef36a0799834d1cd5af1166d029453d3 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_wol.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: wol +include: afrimgsm_cot_yaml +task: afrimgsm_cot_wol_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_xho.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a329b8ebf4487eb5b922f539274b6592d6d2e75f --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_xho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: xho +include: afrimgsm_cot_yaml 
+task: afrimgsm_cot_xho_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_yaml new file mode 100644 index 0000000000000000000000000000000000000000..d4d3657da5f68eb670173ff86034aa2276c2c0ae --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_yaml @@ -0,0 +1,37 @@ +tag: + - afrimgsm_cot_tasks + - afrimgsm_cot_tasks_prompt_3 +dataset_path: masakhane/afrimgsm +dataset_name: null # Overridden by language-specific config. +output_type: generate_until +training_split: train +test_split: test +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: 'Solve the following math question \n\nQuestion: {{question}} \nStep-by-Step Answer: ' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> + - <|eot_id|> +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +filter_list: + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)" + - function: "take_first" + - filter: + - function: regex + group_select: -1 + regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+) + - function: take_first + name: flexible-extract +metadata: + version: 2.0 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_yor.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..003fb63482132f44de4252d96afb25808f99c876 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: yor +include: afrimgsm_cot_yaml +task: afrimgsm_cot_yor_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_zul.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_zul.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..c01468ec7f6a5e6d87f132599fc5df4879516c95 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_3/afrimgsm_cot_zul.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: zul +include: afrimgsm_cot_yaml +task: afrimgsm_cot_zul_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_amh.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6624ddfe5319d574caf011f2de342bb68d516771 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_amh.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. \n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_amh_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_eng.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa82cb876081070e9a300dd1471f18c78a8cc311 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_eng.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. 
\n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_eng_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_ewe.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..135bd975b0a9a4a37f2d3fcce07c6d96b9c9579e --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_ewe.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. \n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_ewe_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_fra.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..81a060b2c2b7ce84836f88aac2e5e386c2ad2e6b --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_fra.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: fra +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. 
\n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_fra_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_hau.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b53dba5852f274ada14954ef6b839f288d1629dd --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_hau.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. \n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_hau_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_ibo.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a4236e1d6a36e7d241c9d65ca1c111b8bdac536 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_ibo.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. 
\n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_ibo_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_kin.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..51407a6626539a7a79899f5691122c4c5c0881db --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_kin.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. \n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_kin_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_lin.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..248ffeee0130f295668f8b9596dea68b6b077527 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_lin.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lin +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. 
\n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_lin_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_lug.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fbf7c8cd5edf41c05c347423dcb61cb5f84420f3 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_lug.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. \n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_lug_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_orm.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..218c3f90a1882f702fed06585f30694ef8e9e96b --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_orm.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. 
\n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_orm_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_sna.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..81e4840a177d442dd62d2fd08a3e3b36458a65b0 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_sna.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. \n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_sna_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_sot.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..47bcd414523bf5a4b81440e08476c9f9a2d4e794 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_sot.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sot +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. 
\n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_sot_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_swa.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e0b57a14edab9695ceda40dc5a1d44fcd0eb2230 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_swa.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. \n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_swa_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_twi.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..abdbdec70ee3203fd39cc99293c52aecc88f37e0 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_twi.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. 
\n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_twi_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_vai.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_vai.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a0b7913b381f35893889225d9cfd3aaaf25555c9 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_vai.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: vai +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. \n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_vai_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_wol.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aa75a3f599284d5dfc9dd58f4647140180e93378 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_wol.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. 
\n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_wol_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_xho.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8c125ebedfab2aa922c27627ec29b582b3d6fa37 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_xho.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. \n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_xho_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_yaml new file mode 100644 index 0000000000000000000000000000000000000000..59013d84ed916ab9728f3345f6323b9fbee4c8d6 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_yaml @@ -0,0 +1,36 @@ +tag: + - afrimgsm_cot_tasks + - afrimgsm_cot_tasks_prompt_4 +dataset_path: masakhane/afrimgsm +dataset_name: null # Overridden by language-specific config. 
+output_type: generate_until +training_split: train +test_split: test +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> + - <|eot_id|> +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +filter_list: + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)" + - function: "take_first" + - filter: + - function: regex + group_select: -1 + regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+) + - function: take_first + name: flexible-extract +metadata: + version: 2.0 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_yor.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2c960b75f60dcfef53fac9f5cff333baeeb00ef2 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_yor.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. 
\n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_yor_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_zul.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2641b2e58c631b88f85ee8c156a9028c8b319c86 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_4/afrimgsm_cot_zul.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. \n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_zul_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_amh.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ea5124850e7ce8cdf270c5cd53020c61d9e10491 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_amh.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: "For mathematical questions provided in Amharic language. Supply the\ + \ accurate step by step answer to the provided question. \n\nQuestion: {{question}}\ + \ \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_amh_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_eng.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9b485061e5974c691035eaaf18af67001b921e73 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_eng.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng +doc_to_text: "For mathematical questions provided in English language. 
Supply the\ + \ accurate step by step answer to the provided question. \n\nQuestion: {{question}}\ + \ \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_eng_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_ewe.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e52f43276704281a95bd366f8512ca6c9af3f4c3 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_ewe.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_text: "For mathematical questions provided in Ewe language. Supply the accurate\ + \ step by step answer to the provided question. \n\nQuestion: {{question}} \nStep\ + \ by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_ewe_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_fra.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f311e12a3b8bbb77e9f24d11fa15efbea38330f5 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_fra.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: fra +doc_to_text: "For mathematical questions provided in French language. Supply the accurate\ + \ step by step answer to the provided question. \n\nQuestion: {{question}} \nStep\ + \ by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_fra_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_hau.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..91cc7ace3922ace059d7c4f789d70ba1162d183b --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_hau.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: "For mathematical questions provided in Hausa language. 
Supply the accurate\ + \ step by step answer to the provided question. \n\nQuestion: {{question}} \nStep\ + \ by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_hau_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_ibo.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e2c6a5cc7fb2a54afcfdb98b2d176bda42b23aaf --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_ibo.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: "For mathematical questions provided in Igbo language. Supply the accurate\ + \ step by step answer to the provided question. \n\nQuestion: {{question}} \nStep\ + \ by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_ibo_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_kin.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..36c19a99fdf7dfe8410a2d3183e31f633455932c --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_kin.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: "For mathematical questions provided in Kinyarwanda language. Supply\ + \ the accurate step by step answer to the provided question. \n\nQuestion: {{question}}\ + \ \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_kin_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_lin.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..419da8ab7aebe31e28c40bb9bd80b34e5b87f867 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_lin.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lin +doc_to_text: "For mathematical questions provided in Lingala language. 
Supply the\ + \ accurate step by step answer to the provided question. \n\nQuestion: {{question}}\ + \ \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_lin_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_lug.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..918a3e31484a4f42b5c78bd53a5ad2194e23507d --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_lug.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: "For mathematical questions provided in Luganda language. Supply the\ + \ accurate step by step answer to the provided question. \n\nQuestion: {{question}}\ + \ \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_lug_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_orm.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a9a448f2571248957e397d2660d6bc019ac9245d --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_orm.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: "For mathematical questions provided in Oromo language. Supply the accurate\ + \ step by step answer to the provided question. \n\nQuestion: {{question}} \nStep\ + \ by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_orm_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_sna.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..645b2898c4ad5ec017af12c813ea3d27f1231d55 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_sna.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: "For mathematical questions provided in chiShona language. 
Supply the\ + \ accurate step by step answer to the provided question. \n\nQuestion: {{question}}\ + \ \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_sna_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_sot.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a0b940d919edb0a710784ffcbdab00110b568a65 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_sot.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sot +doc_to_text: "For mathematical questions provided in Sesotho language. Supply the\ + \ accurate step by step answer to the provided question. \n\nQuestion: {{question}}\ + \ \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_sot_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_swa.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..093ccfa2f705e7f94cffed63e0406f0653f1c867 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_swa.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: "For mathematical questions provided in Swahili language. Supply the\ + \ accurate step by step answer to the provided question. \n\nQuestion: {{question}}\ + \ \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_swa_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_twi.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dd0436e7e8ad0a80e32554fb9991158fe66df7be --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_twi.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: "For mathematical questions provided in Twi language. 
Supply the accurate\ + \ step by step answer to the provided question. \n\nQuestion: {{question}} \nStep\ + \ by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_twi_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_vai.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_vai.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0b348be3a2a337a96936ddafe9315fe060dc8516 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_vai.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: vai +doc_to_text: "For mathematical questions provided in Vai language. Supply the accurate\ + \ step by step answer to the provided question. \n\nQuestion: {{question}} \nStep\ + \ by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_vai_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_wol.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b73863adafc318c81c058efdfb3fd251cb987a95 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_wol.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: "For mathematical questions provided in Wolof language. Supply the accurate\ + \ step by step answer to the provided question. \n\nQuestion: {{question}} \nStep\ + \ by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_wol_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_xho.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1b77d56f2115fc188b5d973d5aaeb33b506830b5 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_xho.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: "For mathematical questions provided in isiXhosa language. 
Supply the\ + \ accurate step by step answer to the provided question. \n\nQuestion: {{question}}\ + \ \nStep by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_xho_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_yaml new file mode 100644 index 0000000000000000000000000000000000000000..de15089149d133ac1f84ee89d1a20634286ed10c --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_yaml @@ -0,0 +1,36 @@ +tag: + - afrimgsm_cot_tasks + - afrimgsm_cot_tasks_prompt_5 +dataset_path: masakhane/afrimgsm +dataset_name: null # Overridden by language-specific config. +output_type: generate_until +training_split: train +test_split: test +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> + - <|eot_id|> +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +filter_list: + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)" + - function: "take_first" + - filter: + - function: regex + group_select: -1 + regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+) + - function: take_first + name: flexible-extract +metadata: + version: 2.0 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_yor.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9032313ad485bd0d9e2b90854bb626b432dc1a46 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_yor.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: "For mathematical questions provided in Yoruba language. Supply the accurate\ + \ step by step answer to the provided question. 
\n\nQuestion: {{question}} \nStep\ + \ by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_yor_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_zul.yaml b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0b6ef0037a6bb530304d7fd5031c2f6816d678a3 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/direct_cot/prompt_5/afrimgsm_cot_zul.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: "For mathematical questions provided in Zulu language. Supply the accurate\ + \ step by step answer to the provided question. \n\nQuestion: {{question}} \nStep\ + \ by step answer: " +include: afrimgsm_cot_yaml +task: afrimgsm_cot_zul_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_amh.yaml b/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_amh.yaml deleted file mode 100644 index f00400d96d15547bb73acd53c84ad5d4ce6f024f..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_amh.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: amh -doc_to_target: '{% if answer is not none %}{{answer[15:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: cot_yaml -task: afrimgsm_en_cot_amh diff --git a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_eng.yaml b/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_eng.yaml deleted file mode 100644 index c62bf206a3ff5644c5d213ef394f4f0cbe3667d0..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_eng.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: eng -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else 
%}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: cot_yaml -task: afrimgsm_en_cot_eng diff --git a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_ewe.yaml b/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_ewe.yaml deleted file mode 100644 index ea246f7c16cec59da6562b0e17b43da0268caa0e..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_ewe.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: ewe -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: cot_yaml -task: afrimgsm_en_cot_ewe diff --git a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_fra.yaml b/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_fra.yaml deleted file mode 100644 index 16bf57b76e4d48384ee909854ce7ac4050215894..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_fra.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: fra -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: cot_yaml -task: afrimgsm_en_cot_fra diff --git a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_hau.yaml b/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_hau.yaml deleted file mode 100644 index 
2a397baf1e40185883569b53ffc9bb82265b4257..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_hau.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: hau -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: cot_yaml -task: afrimgsm_en_cot_hau diff --git a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_ibo.yaml b/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_ibo.yaml deleted file mode 100644 index 9bd7bf62b4c9fed96aa01280c9d157a08cc04efb..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_ibo.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: ibo -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: cot_yaml -task: afrimgsm_en_cot_ibo diff --git a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_kin.yaml b/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_kin.yaml deleted file mode 100644 index 841913b7c689a30833282cd40fdbc6a6db4a3dac..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_kin.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: kin -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif 
%}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: cot_yaml -task: afrimgsm_en_cot_kin diff --git a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_lin.yaml b/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_lin.yaml deleted file mode 100644 index 76d7fdb91fb8dd39b23d4c8c5a0513eaa6538a6d..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_lin.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: lin -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: cot_yaml -task: afrimgsm_en_cot_lin diff --git a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_lug.yaml b/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_lug.yaml deleted file mode 100644 index 84c05bb292fdec783de75f708002ad5e53c3e3fc..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_lug.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: lug -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: cot_yaml -task: afrimgsm_en_cot_lug diff --git a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_orm.yaml b/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_orm.yaml deleted file mode 100644 index e9e5600e99104054e169ef1d29da528ef5a9be39..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_orm.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# 
Generated by utils.py -dataset_name: orm -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: cot_yaml -task: afrimgsm_en_cot_orm diff --git a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_sna.yaml b/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_sna.yaml deleted file mode 100644 index 058689623d3fa6147743052f840ab25f8ef0bb4f..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_sna.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: sna -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: cot_yaml -task: afrimgsm_en_cot_sna diff --git a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_sot.yaml b/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_sot.yaml deleted file mode 100644 index ae443f1833c3b248941bd0cdbae2e0a058625d4a..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_sot.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: sot -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: cot_yaml -task: afrimgsm_en_cot_sot diff --git 
a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_swa.yaml b/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_swa.yaml deleted file mode 100644 index 1aa2d07d0e132e0cf2787d75ab6e7281b4302f97..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_swa.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: swa -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: cot_yaml -task: afrimgsm_en_cot_swa diff --git a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_twi.yaml b/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_twi.yaml deleted file mode 100644 index 2957cb378e5ec6b27f0911eeab048aa91bf40e43..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_twi.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: twi -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: cot_yaml -task: afrimgsm_en_cot_twi diff --git a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_wol.yaml b/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_wol.yaml deleted file mode 100644 index 6ecf4c44eff8d04d081a15062272ba168bab7ded..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_wol.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: wol -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' 
-doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: cot_yaml -task: afrimgsm_en_cot_wol diff --git a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_xho.yaml b/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_xho.yaml deleted file mode 100644 index 9dc6691bdee31264bcba551b0288980de24b6e7f..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_xho.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: xho -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: cot_yaml -task: afrimgsm_en_cot_xho diff --git a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_yor.yaml b/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_yor.yaml deleted file mode 100644 index 8ef29830fa23b3fa561276bf6472a453c7e80384..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_yor.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: yor -doc_to_target: '{% if answer is not none %}{{answer[16:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: cot_yaml -task: afrimgsm_en_cot_yor diff --git a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_zul.yaml b/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_zul.yaml deleted file mode 100644 index 
24f486e0af03eda4a290eee0881da5a3b07dd96c..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/en_cot/afrimgsm_en_cot_zul.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: zul -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: cot_yaml -task: afrimgsm_en_cot_zul diff --git a/lm_eval/tasks/afrimgsm/en_cot/cot_yaml b/lm_eval/tasks/afrimgsm/en_cot/cot_yaml deleted file mode 100644 index b4a0071d0e35ecc03d0899541c2fa3a1af9a32a9..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/en_cot/cot_yaml +++ /dev/null @@ -1,37 +0,0 @@ -# This file will be included in the generated language-specific task configs. -# It doesn't have a yaml file extension as it is not meant to be imported directly by the harness. -tag: - - afrimgsm - - afrimgsm_en_cot -dataset_path: masakhane/afrimgsm -dataset_name: null # Overridden by language-specific config. 
-output_type: generate_until -training_split: train -test_split: test -generation_kwargs: - until: - - "\n\n" - - "\n" - do_sample: false - temperature: 0.0 -target_delimiter: " " -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true -filter_list: - - name: "strict-match" - filter: - - function: "regex" - regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)" - - function: "take_first" - - filter: - - function: regex - group_select: -1 - regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+) - - function: take_first - name: flexible-extract -metadata: - version: 2.0 diff --git a/lm_eval/tasks/afrimgsm/gen_utils.py b/lm_eval/tasks/afrimgsm/gen_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ecef389f3a4051e57b652f617b19ddd15d3c26ca --- /dev/null +++ b/lm_eval/tasks/afrimgsm/gen_utils.py @@ -0,0 +1,122 @@ +import argparse +import os + +import yaml + + +class FunctionTag: + def __init__(self, value): + self.value = value + + +def prompt_func(mode, lang): + prompt_map = { + "prompt_4": "Answer the given question with the step by step solution appropriate numerical value, ensuring that the response is " + "clear and without any supplementary information. \n\nQuestion: {{question}} \nStep by step answer: ", + "prompt_5": f"For mathematical questions provided in {lang} language. Supply the accurate step by step answer to the " + "provided question. \n\nQuestion: {{question}} \nStep by step answer: ", + } + return prompt_map[mode] + + +def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: + """ + Generate a yaml file for each language. + + :param output_dir: The directory to output the files to. + :param overwrite: Whether to overwrite files if they already exist. 
+ """ + err = [] + languages = { + "eng": "English", + "amh": "Amharic", + "ibo": "Igbo", + "fra": "French", + "sna": "chiShona", + "wol": "Wolof", + "ewe": "Ewe", + "lin": "Lingala", + "lug": "Luganda", + "xho": "isiXhosa", + "kin": "Kinyarwanda", + "twi": "Twi", + "zul": "Zulu", + "orm": "Oromo", + "yor": "Yoruba", + "hau": "Hausa", + "sot": "Sesotho", + "swa": "Swahili", + "vai": "Vai", + } + + for lang in languages.keys(): + try: + file_name = f"afrimgsm_cot_{lang}.yaml" + task_name = f"afrimgsm_cot_{lang}_{mode}" + yaml_template = "afrimgsm_cot_yaml" + if "translate" in output_dir.split("/")[-1]: + file_name = f"afrimgsm_cot_translate_{lang}.yaml" + task_name = f"afrimgsm_cot_translate_{lang}_{mode}" + yaml_template = "afrimgsm_cot_translate_yaml" + if int(mode.split("_")[-1]) > 3: + yaml_details = { + "include": yaml_template, + "task": task_name, + "dataset_name": lang, + "doc_to_text": prompt_func(mode, languages[lang]), + } + else: + yaml_details = { + "include": yaml_template, + "task": task_name, + "dataset_name": lang, + } + os.makedirs(f"{output_dir}/{mode}", exist_ok=True) + with open( + f"{output_dir}/{mode}/{file_name}", + "w" if overwrite else "x", + encoding="utf8", + ) as f: + f.write("# Generated by utils.py\n") + yaml.dump( + yaml_details, + f, + allow_unicode=True, + ) + except FileExistsError: + err.append(file_name) + + if len(err) > 0: + raise FileExistsError( + "Files were not created because they already exist (use --overwrite flag):" + f" {', '.join(err)}" + ) + + +def main() -> None: + """Parse CLI args and generate language-specific yaml files.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--overwrite", + default=True, + action="store_true", + help="Overwrite files if they already exist", + ) + parser.add_argument( + "--output-dir", + default="./translate_cot", + help="Directory to write yaml files to", + ) + parser.add_argument( + "--mode", + default="prompt_5", + choices=["prompt_1", "prompt_2", "prompt_3", 
"prompt_4", "prompt_5"], + help="Prompt number", + ) + args = parser.parse_args() + + gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite, mode=args.mode) + + +if __name__ == "__main__": + main() diff --git a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_amh.yaml b/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_amh.yaml deleted file mode 100644 index 55fbe4bfdb590b6d352b71c16eebefef3cbb3399..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_amh.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: amh -doc_to_target: '{% if answer is not none %}{{answer[15:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: translate_direct_yaml -task: afrimgsm_translate_direct_amh diff --git a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_eng.yaml b/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_eng.yaml deleted file mode 100644 index 1d729a5cab74ddeb5b3e03f97eadef54a5be3a3c..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_eng.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: eng -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: translate_direct_yaml -task: afrimgsm_translate_direct_eng diff --git a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_ewe.yaml b/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_ewe.yaml deleted file mode 100644 index 
26191dc815bc0747c05af177e38662e4c4581bfb..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_ewe.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: ewe -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: translate_direct_yaml -task: afrimgsm_translate_direct_ewe diff --git a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_fra.yaml b/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_fra.yaml deleted file mode 100644 index 9f0331ee8f3f730372c3eaecb0defe0887bd6502..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_fra.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: fra -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: translate_direct_yaml -task: afrimgsm_translate_direct_fra diff --git a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_hau.yaml b/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_hau.yaml deleted file mode 100644 index 850dad6351a693c2a738a0a570e15da8b412a63a..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_hau.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: hau -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: 
"+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: translate_direct_yaml -task: afrimgsm_translate_direct_hau diff --git a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_ibo.yaml b/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_ibo.yaml deleted file mode 100644 index 8b81178cc719c44419e24b5e14fc5c3e61b73a7a..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_ibo.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: ibo -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: translate_direct_yaml -task: afrimgsm_translate_direct_ibo diff --git a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_kin.yaml b/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_kin.yaml deleted file mode 100644 index 5a8f53e2e7e7449b1db465062bfb8524b94d3c85..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_kin.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: kin -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: translate_direct_yaml -task: afrimgsm_translate_direct_kin diff --git a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_lin.yaml b/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_lin.yaml deleted file mode 100644 index 58044ee2b887d3a83f9004e303da6c2bc048703f..0000000000000000000000000000000000000000 --- 
a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_lin.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: lin -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: translate_direct_yaml -task: afrimgsm_translate_direct_lin diff --git a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_lug.yaml b/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_lug.yaml deleted file mode 100644 index 87013c146f2ef8bddee0a82c2c21949bcac549b0..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_lug.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: lug -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: translate_direct_yaml -task: afrimgsm_translate_direct_lug diff --git a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_orm.yaml b/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_orm.yaml deleted file mode 100644 index 1dd19325a57022df444f04eba5eb1b3ced117b61..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_orm.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: orm -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - 
<|im_end|> -include: translate_direct_yaml -task: afrimgsm_translate_direct_orm diff --git a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_sna.yaml b/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_sna.yaml deleted file mode 100644 index d710b1da339ca0012239993417f83c946a7c3e09..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_sna.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: sna -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: translate_direct_yaml -task: afrimgsm_translate_direct_sna diff --git a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_sot.yaml b/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_sot.yaml deleted file mode 100644 index 643eaaeef10a1f70b3b7f13b58cb606dd6ae3f73..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_sot.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: sot -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: translate_direct_yaml -task: afrimgsm_translate_direct_sot diff --git a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_swa.yaml b/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_swa.yaml deleted file mode 100644 index b882e89c24a75ce06a1790791a084e1c087acc1b..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_swa.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by 
utils.py -dataset_name: swa -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: translate_direct_yaml -task: afrimgsm_translate_direct_swa diff --git a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_twi.yaml b/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_twi.yaml deleted file mode 100644 index ac946eb7f413d227dfe0fc5b770e0c6c7bc2d159..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_twi.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: twi -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: translate_direct_yaml -task: afrimgsm_translate_direct_twi diff --git a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_wol.yaml b/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_wol.yaml deleted file mode 100644 index dbcc6b2e0e553ebe5353abaebbf6030d68c5b024..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_wol.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: wol -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: translate_direct_yaml -task: afrimgsm_translate_direct_wol diff --git 
a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_xho.yaml b/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_xho.yaml deleted file mode 100644 index dfb3d74f40fac640988e1ffba3caf007d56b66ec..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_xho.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: xho -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: translate_direct_yaml -task: afrimgsm_translate_direct_xho diff --git a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_yor.yaml b/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_yor.yaml deleted file mode 100644 index 6b4c346ffeeacd42de58efab206db84af0168670..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_yor.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: yor -doc_to_target: '{% if answer is not none %}{{answer[16:]}}{% else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: translate_direct_yaml -task: afrimgsm_translate_direct_yor diff --git a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_zul.yaml b/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_zul.yaml deleted file mode 100644 index 5e79edffadafebb8e31c710e854157046d15b10e..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/translate/afrimgsm_translate_zul.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Generated by utils.py -dataset_name: zul -doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% 
else %}{{answer_number|string}}{% endif %}' -doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' -generation_kwargs: - do_sample: false - until: - - 'Question:' - - - - <|im_end|> -include: translate_direct_yaml -task: afrimgsm_translate_direct_zul diff --git a/lm_eval/tasks/afrimgsm/translate/afrimgsm_tt.yaml b/lm_eval/tasks/afrimgsm/translate/afrimgsm_tt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e1cc68abefd07d38ef64c3524337be287a20e779 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/afrimgsm_tt.yaml @@ -0,0 +1,9 @@ +group: afrimgsm_tt-irokobench +task: + - afrimgsm_tt_tasks +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 2 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_amh.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0f067e53f525c95640c43cd94f02dfca0e4702a8 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: amh +include: afrimgsm_translate_yaml +task: afrimgsm_translate_amh_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_ewe.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1420deed028ca24caea7a72b96bcc53494f7e186 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_ewe.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ewe +include: afrimgsm_translate_yaml +task: afrimgsm_translate_ewe_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_fra.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_fra.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..b38e82f252e4379f8332ba7b779617db5d53da24 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_fra.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fra +include: afrimgsm_translate_yaml +task: afrimgsm_translate_fra_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_hau.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..768bcab970c2b6af4edbdaa0897ea4f24e1d0eb5 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hau +include: afrimgsm_translate_yaml +task: afrimgsm_translate_hau_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_ibo.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5333b163698e15aa1b2548395dfa1069e188b6b1 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ibo +include: afrimgsm_translate_yaml +task: afrimgsm_translate_ibo_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_kin.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ae231d6da0ca0de8571eaaf38cb62121fe57d125 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kin +include: afrimgsm_translate_yaml +task: afrimgsm_translate_kin_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_lin.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_lin.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..65349c7e66fbda73b3262d21fba18c69a88a318a --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_lin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lin +include: afrimgsm_translate_yaml +task: afrimgsm_translate_lin_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_lug.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7643fc1223f63f698a8ef70835beccea7de9d7a5 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_lug.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lug +include: afrimgsm_translate_yaml +task: afrimgsm_translate_lug_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_orm.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..55e1992799923fc30ffea628a64b1d4bcab31bf0 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_orm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: orm +include: afrimgsm_translate_yaml +task: afrimgsm_translate_orm_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_sna.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2f8826ab4d61ab2a2803c73fd770db538a4db0f0 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_sna.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sna +include: afrimgsm_translate_yaml +task: afrimgsm_translate_sna_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_sot.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_sot.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..2b206e3fc82f9e519aece099aa2e5d31c783df62 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_sot.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sot +include: afrimgsm_translate_yaml +task: afrimgsm_translate_sot_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_swa.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3aede319a2ad47e5e0e0a65c18ca8a85da812971 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swa +include: afrimgsm_translate_yaml +task: afrimgsm_translate_swa_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_twi.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c8e23103ee7a482a1f84002c3dc40bc10758e659 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: twi +include: afrimgsm_translate_yaml +task: afrimgsm_translate_twi_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_wol.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4b97922fc5d1d051a0a98623a994b695dbf68c86 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_wol.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: wol +include: afrimgsm_translate_yaml +task: afrimgsm_translate_wol_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_xho.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_xho.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..1abdd50bced23bc6de7768b9f2db931c7f35b4ad --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_xho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: xho +include: afrimgsm_translate_yaml +task: afrimgsm_translate_xho_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_yaml b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_yaml new file mode 100644 index 0000000000000000000000000000000000000000..614510895a32f30082ccd6eb5cbdfa87766c4473 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_yaml @@ -0,0 +1,32 @@ +tag: afrimgsm_tt_tasks +dataset_path: masakhane/afrimgsm-translate-test +output_type: generate_until +test_split: test +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}' +target_delimiter: "" +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +filter_list: + - name: remove_whitespace + filter: + - function: remove_whitespace + - function: take_first + - filter: + - function: regex + group_select: -1 + regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+) + - function: take_first + name: flexible-extract +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 2.0 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_yor.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d3927ba8e0369a5220df6d72e4bb474b7e8af7ca --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: yor +include: afrimgsm_translate_yaml +task: 
afrimgsm_translate_yor_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_zul.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a57260d94ac8b83f7371baecab3e822868dc671b --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_1/afrimgsm_translate_zul.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: zul +include: afrimgsm_translate_yaml +task: afrimgsm_translate_zul_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_amh.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..49b559be5c1f62ab67498131d29ac0e0091f622d --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: amh +include: afrimgsm_translate_yaml +task: afrimgsm_translate_amh_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_ewe.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cf82f8624adc0bf9b60194b889ee8ccc7df76c70 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_ewe.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ewe +include: afrimgsm_translate_yaml +task: afrimgsm_translate_ewe_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_fra.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..924ac026e258ab6da6bc0c2be9c55e73f48a3457 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_fra.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fra +include: afrimgsm_translate_yaml +task: afrimgsm_translate_fra_prompt_2 diff --git 
a/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_hau.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..86d8dbbca635d347640c699db7bde0cb6d950722 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hau +include: afrimgsm_translate_yaml +task: afrimgsm_translate_hau_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_ibo.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..466ced5c435be500ff453e9242097050cfcc587c --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ibo +include: afrimgsm_translate_yaml +task: afrimgsm_translate_ibo_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_kin.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..53078341b5822305b3cc5de3652f0e582313aff6 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kin +include: afrimgsm_translate_yaml +task: afrimgsm_translate_kin_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_lin.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..72aa73d209de818f83d596148e756752bc44c754 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_lin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lin +include: afrimgsm_translate_yaml +task: afrimgsm_translate_lin_prompt_2 diff --git 
a/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_lug.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..88ae24a26d652729a5c52696f8c34fcec358dfd8 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_lug.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lug +include: afrimgsm_translate_yaml +task: afrimgsm_translate_lug_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_orm.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7e2ffcc32241ed36d042591921737f9c91deabcc --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_orm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: orm +include: afrimgsm_translate_yaml +task: afrimgsm_translate_orm_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_sna.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..137ccbcd322421d17c4ea369c34f1beac05ce597 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_sna.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sna +include: afrimgsm_translate_yaml +task: afrimgsm_translate_sna_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_sot.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5bd7e53ca6ec65492978eb7e2bc95a0569b496e8 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_sot.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sot +include: afrimgsm_translate_yaml +task: afrimgsm_translate_sot_prompt_2 diff --git 
a/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_swa.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5134b3c423c88832393b6eb7b74338b8f7e97807 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swa +include: afrimgsm_translate_yaml +task: afrimgsm_translate_swa_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_twi.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f6135d99ce5181855cb78f1a0beba3f357c7082c --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: twi +include: afrimgsm_translate_yaml +task: afrimgsm_translate_twi_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_wol.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..db00be881c3170721857cfb3fb685b3adfec4dfc --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_wol.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: wol +include: afrimgsm_translate_yaml +task: afrimgsm_translate_wol_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_xho.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3be8dd64c7e7aaacebb296c8f51397d4c50fe3ec --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_xho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: xho +include: afrimgsm_translate_yaml +task: afrimgsm_translate_xho_prompt_2 diff --git 
a/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_yaml b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_yaml new file mode 100644 index 0000000000000000000000000000000000000000..63766339e6e0cb5bbb3d9f45d1010d00de0aafd4 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_yaml @@ -0,0 +1,34 @@ +tag: afrimgsm_tt_tasks +dataset_path: masakhane/afrimgsm-translate-test +output_type: generate_until +test_split: test +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: "Give direct numerical answers for the question provided. \n\nQuestion: {{question}} \nAnswer: " +target_delimiter: "" +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +should_decontaminate: true +doc_to_decontamination_query: "Answer: " +filter_list: + - name: remove_whitespace + filter: + - function: remove_whitespace + - function: take_first + - filter: + - function: regex + group_select: -1 + regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+) + - function: take_first + name: flexible-extract +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 2.0 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_yor.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..01a54e15eea260f6a4ae5fa28e0ea4c627dc904c --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: yor +include: afrimgsm_translate_yaml +task: afrimgsm_translate_yor_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_zul.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_zul.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..5f7e74df5ba87e25f35e0e13762cd2116797fe65 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_2/afrimgsm_translate_zul.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: zul +include: afrimgsm_translate_yaml +task: afrimgsm_translate_zul_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_amh.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..04a14a1bf6b69e10873e181d46fcd51e2e901126 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: amh +include: afrimgsm_translate_yaml +task: afrimgsm_translate_amh_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_ewe.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3cda09e47c41ecc23f2e15b16c869fd6e3f13d87 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_ewe.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ewe +include: afrimgsm_translate_yaml +task: afrimgsm_translate_ewe_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_fra.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..49c95be2cd94d718bd6cb1754eee6a79ae6f5ae8 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_fra.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fra +include: afrimgsm_translate_yaml +task: afrimgsm_translate_fra_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_hau.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_hau.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..9d16ac8faf84785902baabb92a1a48bcf383a35a --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hau +include: afrimgsm_translate_yaml +task: afrimgsm_translate_hau_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_ibo.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2bbb66ff41f6c55799d77a19e29bfd8e01f0d61d --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ibo +include: afrimgsm_translate_yaml +task: afrimgsm_translate_ibo_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_kin.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..488061a306ebcbc4e9f53e78214e25d5391475fa --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kin +include: afrimgsm_translate_yaml +task: afrimgsm_translate_kin_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_lin.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..928ba457061168ae1524db65a4f03f6ec202349d --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_lin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lin +include: afrimgsm_translate_yaml +task: afrimgsm_translate_lin_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_lug.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_lug.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..bdc0c80788f1e0e0e4db9bc38d77aab7e2f0f8df --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_lug.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lug +include: afrimgsm_translate_yaml +task: afrimgsm_translate_lug_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_orm.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..04ec7565c10592c8ff143d0d2a944986e2870c1f --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_orm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: orm +include: afrimgsm_translate_yaml +task: afrimgsm_translate_orm_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_sna.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..22ab7bde213b00599cee3e97ef8e4995f2ded97a --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_sna.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sna +include: afrimgsm_translate_yaml +task: afrimgsm_translate_sna_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_sot.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..617340d07bdab98b185c8e441e1a2e08bca3930b --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_sot.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sot +include: afrimgsm_translate_yaml +task: afrimgsm_translate_sot_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_swa.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_swa.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..337ad6e470631085a735ee2133211c8e9258600e --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swa +include: afrimgsm_translate_yaml +task: afrimgsm_translate_swa_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_twi.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eb13aba534c504473735b6cb67c90efab5db9095 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: twi +include: afrimgsm_translate_yaml +task: afrimgsm_translate_twi_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_wol.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f759e6aa6aebd9f6c6f82087dc2756d68a71025e --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_wol.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: wol +include: afrimgsm_translate_yaml +task: afrimgsm_translate_wol_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_xho.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..50ab80df1d0589e592b072f71628395fe118ef8b --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_xho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: xho +include: afrimgsm_translate_yaml +task: afrimgsm_translate_xho_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_yaml b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_yaml new file mode 100644 index 
0000000000000000000000000000000000000000..544fa0ccc13fbbeb9eebc4f0eb2cb78b5c68e183 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_yaml @@ -0,0 +1,32 @@ +tag: afrimgsm_tt_tasks +dataset_path: masakhane/afrimgsm-translate-test +output_type: generate_until +test_split: test +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: "Solve the following math question \n\nQuestion: {{question}} \nAnswer: " +target_delimiter: "" +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +filter_list: + - name: remove_whitespace + filter: + - function: remove_whitespace + - function: take_first + - filter: + - function: regex + group_select: -1 + regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+) + - function: take_first + name: flexible-extract +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 2.0 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_yor.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ee1c7917f6356e01b54ad4c545d3f08b2dfcf8a6 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: yor +include: afrimgsm_translate_yaml +task: afrimgsm_translate_yor_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_zul.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f3e21704f20ceba6e34273f8a2d5b1d87b648dda --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_3/afrimgsm_translate_zul.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: zul +include: afrimgsm_translate_yaml +task: afrimgsm_translate_zul_prompt_3 diff 
--git a/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_amh.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..60387afce4fa5801492c968fc97a1519fdbeb5a8 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_amh.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. \n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_amh_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_ewe.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7633bc3efd95541fd6f30aaa8d469fa993a0375e --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_ewe.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. \n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_ewe_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_fra.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a8e16ea929f8d5639f388c339671f1253554fd1d --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_fra.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: fra +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. 
\n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_fra_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_hau.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9828205094c1b447ce422da748ba347d448f3b0b --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_hau.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. \n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_hau_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_ibo.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b8acf8d0fb40630e1c19e54ed5b87687f2bf2897 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_ibo.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. 
\n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_ibo_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_kin.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..74ac117344b5625f1a8671cb938e7d911e849eea --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_kin.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. \n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_kin_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_lin.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6cf113619e461a91f0a4517dae08109055ed743f --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_lin.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lin +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. 
\n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_lin_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_lug.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5dffbdb899140911f0dab979581fc7b0d52674bf --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_lug.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. \n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_lug_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_orm.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..30f776f4b6c6a05a98b79234dbf45f269170dc0b --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_orm.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. 
\n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_orm_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_sna.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..63efa2505e80032a7ef347f1f986f0ec0952c07c --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_sna.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. \n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_sna_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_sot.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..19b86220a5eeb0319c7732add1ed7d969ab72c67 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_sot.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sot +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. 
\n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_sot_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_swa.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..20236ae8e25cbcc8cd43dd54157614518eea26d8 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_swa.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. \n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_swa_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_twi.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5fe7e7475ae49642169275f3aff639ba4d3fbeda --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_twi.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. 
\n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_twi_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_wol.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a8fb5640def2bb261c461a80b66ba50324231317 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_wol.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. \n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_wol_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_xho.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fdb63749c47b38edfb97d20998f4fb8d4479075d --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_xho.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. 
\n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_xho_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_yaml b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_yaml new file mode 100644 index 0000000000000000000000000000000000000000..2d3903948dd4a8ae2c285e33eb2551f554d2310c --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_yaml @@ -0,0 +1,31 @@ +tag: afrimgsm_tt_tasks +dataset_path: masakhane/afrimgsm-translate-test +output_type: generate_until +test_split: test +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +target_delimiter: "" +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +filter_list: + - name: remove_whitespace + filter: + - function: remove_whitespace + - function: take_first + - filter: + - function: regex + group_select: -1 + regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+) + - function: take_first + name: flexible-extract +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 2.0 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_yor.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f5cb74d41b85d71f851f6c3f160edbc85d006151 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_yor.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. 
\n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_yor_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_zul.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6f0a068e9f662d567bb01a208e46a6e0b2d014d2 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_4/afrimgsm_translate_zul.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\ + \ that the response is clear and without any supplementary information. \n\nQuestion:\ + \ {{question}} \nAnswer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_zul_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_amh.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..48ca09aaafafe7360ca9b0c2872d99e63ff16619 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_amh.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: "For mathematical questions provided in Amharic language. Supply the\ + \ accurate numeric answer to the provided question. \n\nQuestion: {{question}} \n\ + Answer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_amh_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_ewe.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a4a254f0a2d766380528dde5d2ad9ac7eb67bb2d --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_ewe.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_text: "For mathematical questions provided in Ewe language. 
Supply the accurate\ + \ numeric answer to the provided question. \n\nQuestion: {{question}} \nAnswer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_ewe_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_fra.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac62304550bb0f5b8705c7a9ba3f5934278be55d --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_fra.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: fra +doc_to_text: "For mathematical questions provided in French language. Supply the accurate\ + \ numeric answer to the provided question. \n\nQuestion: {{question}} \nAnswer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_fra_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_hau.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..695f1f373464fb93811f28644c0b849fe72de9ed --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_hau.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: "For mathematical questions provided in Hausa language. Supply the accurate\ + \ numeric answer to the provided question. \n\nQuestion: {{question}} \nAnswer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_hau_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_ibo.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7fd530e7409124357c091d42cbaf5608473976c0 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_ibo.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: "For mathematical questions provided in Igbo language. 
Supply the accurate\ + \ numeric answer to the provided question. \n\nQuestion: {{question}} \nAnswer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_ibo_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_kin.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..52ea0a78a2053f7958583167d613ca209b43e22a --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_kin.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: "For mathematical questions provided in Kinyarwanda language. Supply\ + \ the accurate numeric answer to the provided question. \n\nQuestion: {{question}}\ + \ \nAnswer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_kin_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_lin.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..07cf6a6b0e871bbf004d809d4fcff2f7f063a2a9 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_lin.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lin +doc_to_text: "For mathematical questions provided in Lingala language. Supply the\ + \ accurate numeric answer to the provided question. 
\n\nQuestion: {{question}} \n\ + Answer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_lin_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_lug.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa3461beb8fc3c9f05dd1a58c5ca7e7de4ac6cb3 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_lug.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: "For mathematical questions provided in Luganda language. Supply the\ + \ accurate numeric answer to the provided question. \n\nQuestion: {{question}} \n\ + Answer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_lug_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_orm.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c1a00385f578feeb5fb5071c7f2835574a68a30e --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_orm.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: "For mathematical questions provided in Oromo language. Supply the accurate\ + \ numeric answer to the provided question. \n\nQuestion: {{question}} \nAnswer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_orm_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_sna.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c7f08a786ecac0379aa660a26e5619055e97063a --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_sna.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: "For mathematical questions provided in chiShona language. 
Supply the\ + \ accurate numeric answer to the provided question. \n\nQuestion: {{question}} \n\ + Answer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_sna_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_sot.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b258204f4eaed7d4889c597189ab910956e9dbb5 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_sot.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sot +doc_to_text: "For mathematical questions provided in Sesotho language. Supply the\ + \ accurate numeric answer to the provided question. \n\nQuestion: {{question}} \n\ + Answer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_sot_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_swa.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a950c84d3c617353fd9492e6a1d2a028fd836881 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_swa.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: "For mathematical questions provided in Swahili language. Supply the\ + \ accurate numeric answer to the provided question. 
\n\nQuestion: {{question}} \n\ + Answer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_swa_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_twi.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0a0488295249513408b1d33de3a246b663cc523a --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_twi.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: "For mathematical questions provided in Twi language. Supply the accurate\ + \ numeric answer to the provided question. \n\nQuestion: {{question}} \nAnswer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_twi_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_wol.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..61ffc3f938f8cf13da5bac35bed1c6d2a9323acf --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_wol.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: "For mathematical questions provided in Wolof language. Supply the accurate\ + \ numeric answer to the provided question. \n\nQuestion: {{question}} \nAnswer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_wol_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_xho.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c308cc7f57b90ddfe959132e75aad3cc5a0b6f01 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_xho.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: "For mathematical questions provided in isiXhosa language. Supply the\ + \ accurate numeric answer to the provided question. 
\n\nQuestion: {{question}} \n\ + Answer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_xho_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_yaml b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_yaml new file mode 100644 index 0000000000000000000000000000000000000000..2d3903948dd4a8ae2c285e33eb2551f554d2310c --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_yaml @@ -0,0 +1,31 @@ +tag: afrimgsm_tt_tasks +dataset_path: masakhane/afrimgsm-translate-test +output_type: generate_until +test_split: test +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +target_delimiter: "" +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +filter_list: + - name: remove_whitespace + filter: + - function: remove_whitespace + - function: take_first + - filter: + - function: regex + group_select: -1 + regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+) + - function: take_first + name: flexible-extract +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 2.0 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_yor.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f2a0a0fd28fc895e25d309a8f5aaf64769e7de6c --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_yor.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: "For mathematical questions provided in Yoruba language. Supply the accurate\ + \ numeric answer to the provided question. 
\n\nQuestion: {{question}} \nAnswer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_yor_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_zul.yaml b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b52cfb72dfe0f89092718c46e6fd4360a5dd3646 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate/prompt_5/afrimgsm_translate_zul.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: "For mathematical questions provided in Zulu language. Supply the accurate\ + \ numeric answer to the provided question. \n\nQuestion: {{question}} \nAnswer: " +include: afrimgsm_translate_yaml +task: afrimgsm_translate_zul_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate/translate_direct_yaml b/lm_eval/tasks/afrimgsm/translate/translate_direct_yaml deleted file mode 100644 index f9f1c866e8f7ef9ac1153b2248d519aac2a9d1b1..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimgsm/translate/translate_direct_yaml +++ /dev/null @@ -1,36 +0,0 @@ -# This file will be included in the generated language-specific task configs. -# It doesn't have a yaml file extension as it is not meant to be imported directly -# by the harness. -tag: - - afrimgsm - - afrimgsm_translate -dataset_path: masakhane/afrimgsm-translate-test -dataset_name: null # Overridden by language-specific config. 
-output_type: generate_until -test_split: test -generation_kwargs: - until: - - "\n\n" - - "\n" - do_sample: false - temperature: 0.0 -target_delimiter: " " -filter_list: - - name: remove_whitespace - filter: - - function: remove_whitespace - - function: take_first - - filter: - - function: regex - group_select: -1 - regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+) - - function: take_first - name: flexible-extract -metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true -metadata: - version: 2.0 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/afrimgsm_tt_cot.yaml b/lm_eval/tasks/afrimgsm/translate_cot/afrimgsm_tt_cot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d43ddd233b785cbfba006785de6db94bb4eb5d97 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/afrimgsm_tt_cot.yaml @@ -0,0 +1,9 @@ +group: afrimgsm_tt_cot-irokobench +task: + - afrimgsm_tt_cot_tasks +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 2 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_amh.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..da7764e81c0665c53c129f42d61629460a74ea1e --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: amh +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_amh_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_ewe.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e65e9298656895f4dab45111420da97559d62023 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_ewe.yaml @@ -0,0 +1,4 @@ +# 
Generated by utils.py +dataset_name: ewe +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_ewe_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_fra.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a16b91ffb250880c9217e63d6e8c1e46c7d4021c --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_fra.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fra +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_fra_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_hau.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3bee8575de4f774c0ae7510e3a074023919dbbe3 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hau +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_hau_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_ibo.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e6f495eaeb6738e0b8b9524341c1bc2453b4c00f --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ibo +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_ibo_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_kin.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..400bf8887718fbe40d6701cb2121a3cd271c1360 --- /dev/null +++ 
b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kin +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_kin_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_lin.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..22599e98c83a4eae8b0c6b103396195af55fbbee --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_lin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lin +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_lin_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_lug.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..83c9565d54a2786fea141b6775681172cf49b592 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_lug.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lug +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_lug_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_orm.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ca19eb14ef29d16affc7efe255e3faa3ae4deb06 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_orm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: orm +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_orm_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_sna.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_sna.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..e99d3aa7b59d92691fc89687fcf951a993487f89 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_sna.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sna +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_sna_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_sot.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9f8fc2ef28d888800670a9f3068f0d58d670d5ec --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_sot.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sot +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_sot_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_swa.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d0545cccda86167036dc321b11d29c1ca1ca2542 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swa +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_swa_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_twi.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a0b4f9716cb900a71167b85a68ac402470aec3f0 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: twi +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_twi_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_vai.yaml 
b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_vai.yaml new file mode 100644 index 0000000000000000000000000000000000000000..76c18a3f9172645111dbcb26e0183b6d48f3fc69 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_vai.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: vai +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_vai_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_wol.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ee0d6fc9babb8f968cdcbbd460bdb83f14e14c06 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_wol.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: wol +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_wol_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_xho.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6f340a46529bd305f4bdc5ea73b7cd148ec6d7d1 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_xho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: xho +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_xho_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_yaml new file mode 100644 index 0000000000000000000000000000000000000000..8ad7f0069cd8b63090b625ef103a37154356782c --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_yaml @@ -0,0 +1,33 @@ +tag: afrimgsm_tt_cot_tasks +dataset_path: masakhane/afrimgsm-translate-test +dataset_name: null # Overridden by language-specific config. 
+output_type: generate_until +test_split: test +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +filter_list: + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)" + - function: "take_first" + - filter: + - function: regex + group_select: -1 + regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+) + - function: take_first + name: flexible-extract +metadata: + version: 2.0 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_yor.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cf093766dc687b5a092d984ab8cda6514ffd56a9 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: yor +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_yor_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_zul.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1bb302a4d0f2268543c228ba13d0744509c4911d --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_1/afrimgsm_cot_translate_zul.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: zul +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_zul_prompt_1 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_amh.yaml 
b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e9f97735373ee9e47f6234b27002f6f12edb1a13 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: amh +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_amh_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_ewe.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1a83764758c3e2540d3a097ed0e0f5ac987604a1 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_ewe.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ewe +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_ewe_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_fra.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b496c775817645ce477d84749de8e71f0badbc22 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_fra.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fra +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_fra_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_hau.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1022ae899b0e2413351e01aafef9de08b00688ba --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hau +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_hau_prompt_2 diff --git 
a/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_ibo.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dd2a2528ef1c104f1714735f1a5b753c10966607 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ibo +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_ibo_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_kin.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4d8a986acd1dd95e9560555b44f2d3f5aed5395d --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kin +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_kin_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_lin.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..70d4032301398b2124ff128ebb9ed1ba4eb0f0ea --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_lin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lin +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_lin_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_lug.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a774c189513e159a0d9cdf034cd0470cc25d8b84 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_lug.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lug +include: 
afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_lug_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_orm.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b325b2ce931cbcf05cee50293845043081097387 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_orm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: orm +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_orm_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_sna.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3e85255881b279d8d4f578bfcbfd96355e8af3cd --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_sna.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sna +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_sna_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_sot.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a298b504439fa6c7d8eab548ecf2a0b997eddc9d --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_sot.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sot +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_sot_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_swa.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e3de9a4c61b8a018007b13e16cd9028d31af92c2 --- /dev/null +++ 
b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swa +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_swa_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_twi.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c2e1ab61ec9af7af947ae656d2c7069230ee02c5 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: twi +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_twi_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_vai.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_vai.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9186f1e00c540dc64bb89ca619439e8b927162d5 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_vai.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: vai +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_vai_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_wol.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..185b406be03d84beef24b6c6fc453a4518d7a66f --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_wol.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: wol +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_wol_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_xho.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_xho.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..52a0e1ca5144bda20184bcc081768098d945a239 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_xho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: xho +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_xho_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_yaml new file mode 100644 index 0000000000000000000000000000000000000000..ad059aead35b933cabaf763d549c59592f006fc7 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_yaml @@ -0,0 +1,33 @@ +tag: afrimgsm_tt_cot_tasks +dataset_path: masakhane/afrimgsm-translate-test +dataset_name: null # Overridden by language-specific config. +output_type: generate_until +test_split: test +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: 'Give direct numerical answers for the question provided. 
\n\nQuestion: {{question}} \Step-by-Step Answer: ' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +filter_list: + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)" + - function: "take_first" + - filter: + - function: regex + group_select: -1 + regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+) + - function: take_first + name: flexible-extract +metadata: + version: 2.0 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_yor.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2452b0fae4a9f939ed115736056091302b9cfa78 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: yor +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_yor_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_zul.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2ce8151b79849bbc85be11e8cd535bf7a1e12ede --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_2/afrimgsm_cot_translate_zul.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: zul +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_zul_prompt_2 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_amh.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b627e57564d6839ec2ffde82c0a125e42a5c5b77 --- /dev/null +++ 
b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: amh +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_amh_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_ewe.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..52dc345f0cc5f2ad617db14a22327d4b2fc298bd --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_ewe.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ewe +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_ewe_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_fra.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d2b7582c34e26f4e6e9cd87aa1c675788a23ccae --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_fra.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fra +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_fra_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_hau.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d57be8c8c4c9dbaebaef523ff4bd9310df1ebc40 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hau +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_hau_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_ibo.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_ibo.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..296ea98fc1dd70f50bb72ddb566d738d11d42f68 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ibo +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_ibo_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_kin.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2b555b3e314cc6b50970f06649e7d1f662370073 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kin +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_kin_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_lin.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0ace69b273060675411488fa639f261b2fb39f8a --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_lin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lin +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_lin_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_lug.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bd25a1661f0720e68e2851701a1a9ed8f0131950 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_lug.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lug +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_lug_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_orm.yaml 
b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..698c1474bd76edf931839d75b3111bafe8b0770c --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_orm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: orm +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_orm_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_sna.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..354df6bfef4ea01a0b40b6361adb5d21470d395e --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_sna.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sna +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_sna_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_sot.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5990be74d1cceb58eff6a4f9648d1f3de0e11d8a --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_sot.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sot +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_sot_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_swa.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d86662980bf7078dde5636aba335bab9897619c4 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swa +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_swa_prompt_3 diff --git 
a/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_twi.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..78ef85fc25c0da4894843624afad970c9ff572b0 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: twi +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_twi_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_vai.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_vai.yaml new file mode 100644 index 0000000000000000000000000000000000000000..25ec4e8fd71e2423370182180efc7b1bc7843e38 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_vai.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: vai +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_vai_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_wol.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7815a0a5f4b77403334b41baebb2e529eda19d1a --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_wol.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: wol +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_wol_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_xho.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e45afd3a0ecc2ce8656e70ebfe29cad1f6ff06ba --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_xho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: xho +include: 
afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_xho_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_yaml new file mode 100644 index 0000000000000000000000000000000000000000..c0bb7d6661f0b78b5e417269c61f0e7fc028848f --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_yaml @@ -0,0 +1,33 @@ +tag: afrimgsm_tt_cot_tasks +dataset_path: masakhane/afrimgsm-translate-test +dataset_name: null # Overridden by language-specific config. +output_type: generate_until +test_split: test +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +doc_to_text: 'Solve the following math question \n\nQuestion: {{question}} \nStep-by-Step Answer: ' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +filter_list: + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)" + - function: "take_first" + - filter: + - function: regex + group_select: -1 + regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+) + - function: take_first + name: flexible-extract +metadata: + version: 2.0 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_yor.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..39e18cb48d1f214a13f5beb5a1e2c4d4f34855af --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: yor +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_yor_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_zul.yaml 
b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..08fbc9e15e1d963da67e5723f33b926701ca9503 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_3/afrimgsm_cot_translate_zul.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: zul +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_zul_prompt_3 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_amh.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4f73f15f9ef2ba8f0d75e0a0a74cbf00e84cf8e5 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_amh.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. \n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_amh_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_ewe.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0d57247b8b512c60f5536cade1bbc7804083b2f5 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_ewe.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. 
\n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_ewe_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_fra.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6a2f70ca6327e79044f7ed1685282ab8803fcc7d --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_fra.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: fra +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. \n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_fra_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_hau.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c5e7903c88c05de0ac942fd0894b523209dad320 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_hau.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. 
\n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_hau_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_ibo.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cf15ed077cd3ff2762deaf532d200e117fb6e9dd --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_ibo.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. \n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_ibo_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_kin.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c1b57c1395efb57ed2d8e31df9f2878cfbad59ad --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_kin.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. 
\n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_kin_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_lin.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..81cdb1ff41ede1fd2ceca5c5ebaf099df5df5d45 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_lin.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lin +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. \n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_lin_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_lug.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a949b0289211c9443a6a73880598c762a8c5a8f9 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_lug.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. 
\n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_lug_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_orm.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d4deb09238913761e594ec4967a4dabd9d188b02 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_orm.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. \n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_orm_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_sna.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5ebac1993f84026c0823e52fb41c6574e816b4c7 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_sna.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. 
\n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_sna_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_sot.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bf83c8f0209f627b01da77a7ab033ab93a276891 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_sot.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sot +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. \n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_sot_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_swa.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..87b581b8a5fdb5e9d2d0eede15f5c98b04f88693 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_swa.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. 
\n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_swa_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_twi.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..223901eb208af30d0a8dc549f7d021d343e17076 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_twi.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. \n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_twi_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_vai.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_vai.yaml new file mode 100644 index 0000000000000000000000000000000000000000..92ce3892451070777235d493be10bbe9811ad05d --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_vai.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: vai +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. 
\n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_vai_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_wol.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c626fde4e599303ff73e490d3cbc38b6335d6755 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_wol.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. \n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_wol_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_xho.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..285b679cc0a7da31dc3d719918ead217d5287b9e --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_xho.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. 
\n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_xho_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_yaml new file mode 100644 index 0000000000000000000000000000000000000000..241787c7aa25a0ac46e2556efdb8db633e7a0719 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_yaml @@ -0,0 +1,32 @@ +tag: afrimgsm_tt_cot_tasks +dataset_path: masakhane/afrimgsm-translate-test +dataset_name: null # Overridden by language-specific config. +output_type: generate_until +test_split: test +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +filter_list: + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)" + - function: "take_first" + - filter: + - function: regex + group_select: -1 + regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+) + - function: take_first + name: flexible-extract +metadata: + version: 2.0 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_yor.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f76f4cd109026c0c94a578674c4d8140201e5bfa --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_yor.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. 
\n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_yor_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_zul.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7023a5540612c3f7a568313f7d0d26e541839c49 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_4/afrimgsm_cot_translate_zul.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: "Answer the given question with the step by step solution appropriate\ + \ numerical value, ensuring that the response is clear and without any supplementary\ + \ information. \n\nQuestion: {{question}} \nStep by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_zul_prompt_4 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_amh.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d64f088f46c537580055f91f3eaa347187531da4 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_amh.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: "For mathematical questions provided in Amharic language. Supply the\ + \ accurate step by step answer to the provided question. 
\n\nQuestion: {{question}}\ + \ \nStep by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_amh_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_ewe.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..de4aa48d48be2c3ea31b03cb497c4b881ab09ead --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_ewe.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_text: "For mathematical questions provided in Ewe language. Supply the accurate\ + \ step by step answer to the provided question. \n\nQuestion: {{question}} \nStep\ + \ by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_ewe_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_fra.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5cf15ea1ad1e7937cb46cf7f5c09014e00e7fea7 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_fra.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: fra +doc_to_text: "For mathematical questions provided in French language. Supply the accurate\ + \ step by step answer to the provided question. 
\n\nQuestion: {{question}} \nStep\ + \ by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_fra_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_hau.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0dfa643c4cf14b0bbd871f6293f5b142a6a0337a --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_hau.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: "For mathematical questions provided in Hausa language. Supply the accurate\ + \ step by step answer to the provided question. \n\nQuestion: {{question}} \nStep\ + \ by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_hau_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_ibo.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..959f389070977606a0d330573c4c4015754b80d7 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_ibo.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: "For mathematical questions provided in Igbo language. Supply the accurate\ + \ step by step answer to the provided question. 
\n\nQuestion: {{question}} \nStep\ + \ by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_ibo_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_kin.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..85ff4196d50b187779c8c39b62cfcb5448ddb258 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_kin.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: "For mathematical questions provided in Kinyarwanda language. Supply\ + \ the accurate step by step answer to the provided question. \n\nQuestion: {{question}}\ + \ \nStep by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_kin_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_lin.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..87db46c9761e033a42739d1aaaf1d14e51989d14 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_lin.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lin +doc_to_text: "For mathematical questions provided in Lingala language. Supply the\ + \ accurate step by step answer to the provided question. 
\n\nQuestion: {{question}}\ + \ \nStep by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_lin_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_lug.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac0fde85c1607b570abb2c0fa602d0f680ef3fe1 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_lug.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: "For mathematical questions provided in Luganda language. Supply the\ + \ accurate step by step answer to the provided question. \n\nQuestion: {{question}}\ + \ \nStep by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_lug_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_orm.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bcf34106d924c07a92e63eaacdd3f112e860c25d --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_orm.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: "For mathematical questions provided in Oromo language. Supply the accurate\ + \ step by step answer to the provided question. 
\n\nQuestion: {{question}} \nStep\ + \ by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_orm_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_sna.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d5eac98d4e45e5c199cc5a19286c7d0edcd04a9f --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_sna.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: "For mathematical questions provided in chiShona language. Supply the\ + \ accurate step by step answer to the provided question. \n\nQuestion: {{question}}\ + \ \nStep by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_sna_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_sot.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9fc015cdf75b34a6147183cfd3ad2f8bbe4d4660 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_sot.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sot +doc_to_text: "For mathematical questions provided in Sesotho language. Supply the\ + \ accurate step by step answer to the provided question. 
\n\nQuestion: {{question}}\ + \ \nStep by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_sot_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_swa.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..179af86738feaf0656c65e260cf465284c9bc3e5 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_swa.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: "For mathematical questions provided in Swahili language. Supply the\ + \ accurate step by step answer to the provided question. \n\nQuestion: {{question}}\ + \ \nStep by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_swa_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_twi.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ebb680a6f5e54727adabbdd517819e90ca9c2b96 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_twi.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: "For mathematical questions provided in Twi language. Supply the accurate\ + \ step by step answer to the provided question. 
\n\nQuestion: {{question}} \nStep\ + \ by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_twi_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_vai.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_vai.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3d2848648e5a752973ca5a21a38b0a2fb82b4127 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_vai.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: vai +doc_to_text: "For mathematical questions provided in Vai language. Supply the accurate\ + \ step by step answer to the provided question. \n\nQuestion: {{question}} \nStep\ + \ by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_vai_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_wol.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..799cc29fbbca2a4a328cec6bf679cc68ad51139f --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_wol.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: "For mathematical questions provided in Wolof language. Supply the accurate\ + \ step by step answer to the provided question. 
\n\nQuestion: {{question}} \nStep\ + \ by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_wol_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_xho.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7969fdbabd911f8fe4ffdfb9f7e47364c3a80857 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_xho.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: "For mathematical questions provided in isiXhosa language. Supply the\ + \ accurate step by step answer to the provided question. \n\nQuestion: {{question}}\ + \ \nStep by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_xho_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_yaml new file mode 100644 index 0000000000000000000000000000000000000000..241787c7aa25a0ac46e2556efdb8db633e7a0719 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_yaml @@ -0,0 +1,32 @@ +tag: afrimgsm_tt_cot_tasks +dataset_path: masakhane/afrimgsm-translate-test +dataset_name: null # Overridden by language-specific config. 
+output_type: generate_until +test_split: test +doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}' +generation_kwargs: + do_sample: false + until: + - 'Question:' + - + - <|im_end|> +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +filter_list: + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)" + - function: "take_first" + - filter: + - function: regex + group_select: -1 + regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+) + - function: take_first + name: flexible-extract +metadata: + version: 2.0 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_yor.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0d05de223110d6e434bcf75bb2f9cf71957b76d3 --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_yor.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: "For mathematical questions provided in Yoruba language. Supply the accurate\ + \ step by step answer to the provided question. \n\nQuestion: {{question}} \nStep\ + \ by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_yor_prompt_5 diff --git a/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_zul.yaml b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..68329068941f34bf7e53739334e8101c46a0ecfe --- /dev/null +++ b/lm_eval/tasks/afrimgsm/translate_cot/prompt_5/afrimgsm_cot_translate_zul.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: "For mathematical questions provided in Zulu language. Supply the accurate\ + \ step by step answer to the provided question. 
\n\nQuestion: {{question}} \nStep\ + \ by step answer: " +include: afrimgsm_cot_translate_yaml +task: afrimgsm_cot_translate_zul_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..202c31825bfcdaa8ea974e8f51444bc864ed4306 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/afrimmlu.yaml @@ -0,0 +1,13 @@ +group: afrimmlu-irokobench +task: + - afrimmlu_tasks_prompt_1 + - afrimmlu_tasks_prompt_2 + - afrimmlu_tasks_prompt_3 + - afrimmlu_tasks_prompt_4 + - afrimmlu_tasks_prompt_5 +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 2 diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_common_yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_common_yaml deleted file mode 100644 index 53acc4c83206969667d4792eb35ddf8645fcf5ae..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_common_yaml +++ /dev/null @@ -1,37 +0,0 @@ -tag: - - afrimmlu - - afrimmlu_direct -task: null -dataset_path: masakhane/afrimmlu -dataset_name: null -output_type: multiple_choice -validation_split: validation -test_split: test -fewshot_split: validation -doc_to_text: !function utils.doc_to_text -doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}" -doc_to_choice: !function utils.doc_to_choice -should_decontaminate: true -doc_to_decontamination_query: "Question: {{question}}\nAnswer:" -metric_list: - - metric: f1 - aggregation: !function utils.weighted_f1_score - # aggregation: mean - average: weighted - hf_evaluate: true - higher_is_better: True - ignore_case: true - ignore_punctuation: true - regexes_to_ignore: - - "," - - "\\$" - - metric: acc - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - regexes_to_ignore: - - "," - - "\\$" -metadata: - version: 1.0 diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_amh.yaml 
b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_amh.yaml deleted file mode 100644 index aa60c668fd9b2879f020f990655e7eedce2b3a81..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_amh.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: amh -include: afrimmlu_common_yaml -task: afrimmlu_direct_amh diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_eng.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_eng.yaml deleted file mode 100644 index a1e647cdf1d0278c73744288fa61cd7709550231..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_eng.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: eng -include: afrimmlu_common_yaml -task: afrimmlu_direct_eng diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ewe.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ewe.yaml deleted file mode 100644 index 1cc45ddc0e50d1bb4992aecdb4f5208dbb77881b..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ewe.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: ewe -include: afrimmlu_common_yaml -task: afrimmlu_direct_ewe diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_fra.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_fra.yaml deleted file mode 100644 index e6adb6c8aa4e50c6efca737792907cb658c30627..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_fra.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: fra -include: afrimmlu_common_yaml -task: afrimmlu_direct_fra diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_hau.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_hau.yaml deleted file mode 100644 index 9cc9a1ae7acc7318faf68a241f68b0d5cba93978..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_hau.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: hau -include: afrimmlu_common_yaml -task: afrimmlu_direct_hau diff --git 
a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ibo.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ibo.yaml deleted file mode 100644 index 6abb2c4a467986751376679b31ec5db8a7af0886..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ibo.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: ibo -include: afrimmlu_common_yaml -task: afrimmlu_direct_ibo diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_kin.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_kin.yaml deleted file mode 100644 index 2f81f709c4812db3ecfa71bbb9cfb74099a10aab..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_kin.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: kin -include: afrimmlu_common_yaml -task: afrimmlu_direct_kin diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lin.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lin.yaml deleted file mode 100644 index 55363ed93772284fc54386592ae827c03246d681..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lin.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: lin -include: afrimmlu_common_yaml -task: afrimmlu_direct_lin diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lug.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lug.yaml deleted file mode 100644 index 0d484427eda8fcd4b645b3f90b191f075cb88ce9..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lug.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: lug -include: afrimmlu_common_yaml -task: afrimmlu_direct_lug diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_orm.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_orm.yaml deleted file mode 100644 index 763eb8a75f894797185436d3a83c9fd57393f4ac..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_orm.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: orm -include: afrimmlu_common_yaml 
-task: afrimmlu_direct_orm diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sna.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sna.yaml deleted file mode 100644 index ed9e69af392838290bac14d08259585c56daace8..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sna.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: sna -include: afrimmlu_common_yaml -task: afrimmlu_direct_sna diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sot.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sot.yaml deleted file mode 100644 index acdba0fdccf12f73004669dbed1b7cbee9ded24f..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sot.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: sot -include: afrimmlu_common_yaml -task: afrimmlu_direct_sot diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_swa.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_swa.yaml deleted file mode 100644 index c1aa82b0b1d44314c337b904c346806cb3c720a4..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_swa.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: swa -include: afrimmlu_common_yaml -task: afrimmlu_direct_swa diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_twi.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_twi.yaml deleted file mode 100644 index 2695d4a156d4b59dbb2c483ebdbbc16e01c7a415..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_twi.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: twi -include: afrimmlu_common_yaml -task: afrimmlu_direct_twi diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_wol.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_wol.yaml deleted file mode 100644 index 027f837637fb061d227d33e925d3030af51c3cbe..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_wol.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: 
wol -include: afrimmlu_common_yaml -task: afrimmlu_direct_wol diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_xho.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_xho.yaml deleted file mode 100644 index 8e0c12972d01be342a6838b0eab4c1f609d6dc48..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_xho.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: xho -include: afrimmlu_common_yaml -task: afrimmlu_direct_xho diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_yor.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_yor.yaml deleted file mode 100644 index 2a9f7645c2259a607f871e54b07c14ab962ed04c..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_yor.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: yor -include: afrimmlu_common_yaml -task: afrimmlu_direct_yor diff --git a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_zul.yaml b/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_zul.yaml deleted file mode 100644 index 9d8d3b415b44ef4ab0b762f411006c7b00d54226..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_zul.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: zul -include: afrimmlu_common_yaml -task: afrimmlu_direct_zul diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct new file mode 100644 index 0000000000000000000000000000000000000000..a3e17f711f6eac83c52fad1d3f0314a01f08d169 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct @@ -0,0 +1,37 @@ +tag: + - afrimmlu_tasks + - afrimmlu_tasks_prompt_1 + - afrobench_mmlu_tasks +dataset_path: masakhane/afrimmlu +dataset_name: null +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_text: !function utils.doc_to_text +doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}" +doc_to_choice: !function utils.doc_to_choice 
+should_decontaminate: true +doc_to_decontamination_query: "Question: {{question}}\nAnswer:" +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_amh.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8a26369b36ee47ed6ac21c448c316acaf90af749 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: amh +include: afrimmlu_direct +task: afrimmlu_direct_amh_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_eng.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..18a34c7b719cdef3254e2472399b7fdd3121d543 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_eng.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eng +include: afrimmlu_direct +task: afrimmlu_direct_eng_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_ewe.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e85bd7dc7dcc4fa2f5ea90f4f540a0a75b160dbd --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_ewe.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ewe +include: afrimmlu_direct +task: afrimmlu_direct_ewe_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_fra.yaml 
b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4e8a2875e71f43dbdd148331d24e6440f92ad71f --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_fra.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fra +include: afrimmlu_direct +task: afrimmlu_direct_fra_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_hau.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b438ea3198a18caf74d44a97f2d4752337edc082 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hau +include: afrimmlu_direct +task: afrimmlu_direct_hau_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_ibo.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8b08d48e0a3c42e123c081935d5ffcc71e1c56c7 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ibo +include: afrimmlu_direct +task: afrimmlu_direct_ibo_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_kin.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..00d82dfa57e6a77d52f476ae78c54edbc677628d --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kin +include: afrimmlu_direct +task: afrimmlu_direct_kin_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_lin.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_lin.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..7059c941d2dffd27c8eda15dc1fc087a626455a2 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_lin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lin +include: afrimmlu_direct +task: afrimmlu_direct_lin_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_lug.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3301647298d216de664ff07e2c8a10e134afe388 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_lug.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lug +include: afrimmlu_direct +task: afrimmlu_direct_lug_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_orm.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5047ae98abd0d3ea8ebeb98233ae4fb1ebb42dab --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_orm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: orm +include: afrimmlu_direct +task: afrimmlu_direct_orm_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_sna.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..17222f95253270fdcff74177fbd0474cb75660b3 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_sna.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sna +include: afrimmlu_direct +task: afrimmlu_direct_sna_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_sot.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c62ce9bf4957545dc39f96d7bd6dc60ce60e868a --- /dev/null +++ 
b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_sot.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sot +include: afrimmlu_direct +task: afrimmlu_direct_sot_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_swa.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c5ebed9f96e857356adf2ccaa2de2cf818874e71 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swa +include: afrimmlu_direct +task: afrimmlu_direct_swa_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_twi.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fb270c949a7a250e4f4810a5086a80ff22716f1f --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: twi +include: afrimmlu_direct +task: afrimmlu_direct_twi_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_wol.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4ccbc47cd02c5de0c886deed9f4549141884eac8 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_wol.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: wol +include: afrimmlu_direct +task: afrimmlu_direct_wol_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_xho.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3e30d2017740585b5c675ec898fa9d9512e4ac52 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_xho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: xho +include: afrimmlu_direct +task: 
afrimmlu_direct_xho_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_yor.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e3de56f8d3c9fe1f283712ea359ead734a413d93 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: yor +include: afrimmlu_direct +task: afrimmlu_direct_yor_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_zul.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..86c56fec097dc4c636070f6c0ab0750a23bb2435 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_1/afrimmlu_direct_zul.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: zul +include: afrimmlu_direct +task: afrimmlu_direct_zul_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/direct/utils.py b/lm_eval/tasks/afrimmlu/direct/prompt_1/utils.py similarity index 100% rename from lm_eval/tasks/afrimmlu/direct/utils.py rename to lm_eval/tasks/afrimmlu/direct/prompt_1/utils.py diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct new file mode 100644 index 0000000000000000000000000000000000000000..fefabf7e0b52e644d1e9d922c8f899607eab6075 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct @@ -0,0 +1,37 @@ +tag: + - afrimmlu_tasks + - afrimmlu_tasks_prompt_2 + - afrobench_mmlu_tasks +dataset_path: masakhane/afrimmlu +dataset_name: null +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_text: !function utils.doc_to_text +doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}" +doc_to_choice: !function utils.doc_to_choice +should_decontaminate: true +doc_to_decontamination_query: "Question: {{question}}\nAnswer:" +metric_list: + - 
metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_amh.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..85d85171bfe9d84205a9ab218ed496aed1eecf73 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: amh +include: afrimmlu_direct +task: afrimmlu_direct_amh_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_eng.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c46eca5e68356372fc43c1b1908e45667ff05d12 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_eng.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eng +include: afrimmlu_direct +task: afrimmlu_direct_eng_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_ewe.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..26acfcfa93b3019a746a5e9a78e4cdb48871c978 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_ewe.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ewe +include: afrimmlu_direct +task: afrimmlu_direct_ewe_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_fra.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_fra.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..47f0bfb14b6cd7eaf34618eb0709f9f3f0c9b666 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_fra.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fra +include: afrimmlu_direct +task: afrimmlu_direct_fra_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_hau.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..29b4a4d2029e945c4bf52654a819dcb89b898431 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hau +include: afrimmlu_direct +task: afrimmlu_direct_hau_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_ibo.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0cf7db0e4c0585c2cde8f0645e64438546ba5818 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ibo +include: afrimmlu_direct +task: afrimmlu_direct_ibo_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_kin.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ce7c2e896509b874459deb156f2ba34287a908c9 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kin +include: afrimmlu_direct +task: afrimmlu_direct_kin_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_lin.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..51fcea62af8ca4fa64643ef4ea170444ce25beef --- /dev/null +++ 
b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_lin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lin +include: afrimmlu_direct +task: afrimmlu_direct_lin_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_lug.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f4c57ae36fbad9a75737562b6cb619b53e933c34 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_lug.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lug +include: afrimmlu_direct +task: afrimmlu_direct_lug_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_orm.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..494d4240693fe9907f965cd8ad5ccc71fcfc2868 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_orm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: orm +include: afrimmlu_direct +task: afrimmlu_direct_orm_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_sna.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7706ad64ccc0baef8fc4964e61871805187db548 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_sna.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sna +include: afrimmlu_direct +task: afrimmlu_direct_sna_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_sot.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..353bd2574657f0b7f49d0be77edd777893ac549b --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_sot.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sot +include: afrimmlu_direct +task: 
afrimmlu_direct_sot_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_swa.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..54a16c6c2f5839c6aec545c74f6a5df3293938df --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swa +include: afrimmlu_direct +task: afrimmlu_direct_swa_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_twi.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8bb35bd5f9ad784fbf2101e3ce14e82710c75858 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: twi +include: afrimmlu_direct +task: afrimmlu_direct_twi_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_wol.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..963f7cd2cdee47fda381af3cbe1a63c56601b709 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_wol.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: wol +include: afrimmlu_direct +task: afrimmlu_direct_wol_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_xho.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9da0589a8bfe5791ff3764fd26e1f375a836b701 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_xho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: xho +include: afrimmlu_direct +task: afrimmlu_direct_xho_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_yor.yaml 
b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..39b365418eda1be6e4c344d4e32717045d2eafda --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: yor +include: afrimmlu_direct +task: afrimmlu_direct_yor_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_zul.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8766392a0dc7e8abd5b8145f37bcd13f424a8b6a --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_2/afrimmlu_direct_zul.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: zul +include: afrimmlu_direct +task: afrimmlu_direct_zul_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_2/utils.py b/lm_eval/tasks/afrimmlu/direct/prompt_2/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e0cfb334c27cfe4c5bbb1ff7126215c0ea9130c9 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_2/utils.py @@ -0,0 +1,30 @@ +from lm_eval.utils import weighted_f1_score + + +def doc_to_choice(doc): + choices = eval(doc["choices"]) + return choices + + +def doc_to_text(doc): + output = """As an expert in {subject}, choose the most accurate answer to the question below. +Your goal is to select the correct option 'A', 'B', 'C', or 'D' by understanding the nuances of the topic. 
+ +Question: {question} +Choices: + A: {choice1} + B: {choice2} + C: {choice3} + D: {choice4} +Answer: """ + + choices = eval(doc["choices"]) + text = output.format( + subject=doc["subject"], + question=doc["question"], + choice1=choices[0], + choice2=choices[1], + choice3=choices[2], + choice4=choices[3], + ) + return text diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct new file mode 100644 index 0000000000000000000000000000000000000000..fb2fd165fcba0457c82e825afa5d8252546dc09c --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct @@ -0,0 +1,37 @@ +tag: + - afrimmlu_tasks + - afrimmlu_tasks_prompt_3 + - afrobench_mmlu_tasks +dataset_path: masakhane/afrimmlu +dataset_name: null +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_text: !function utils.doc_to_text +doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}" +doc_to_choice: !function utils.doc_to_choice +should_decontaminate: true +doc_to_decontamination_query: "Question: {{question}}\nAnswer:" +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_amh.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c7c28f20b09193f8a0a5c1c0f4ffd8ae59312a08 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: amh +include: afrimmlu_direct +task: 
afrimmlu_direct_amh_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_eng.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..83f7cfcb32c1d85061a3d9b6e1cca169a61d4ff0 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_eng.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eng +include: afrimmlu_direct +task: afrimmlu_direct_eng_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_ewe.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..351bdf330c4b30d85448e45b9233aaf6cb704c4b --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_ewe.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ewe +include: afrimmlu_direct +task: afrimmlu_direct_ewe_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_fra.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..691978187578805d95bc215c8d678273f02d343d --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_fra.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fra +include: afrimmlu_direct +task: afrimmlu_direct_fra_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_hau.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..90521523bef4afe8420fc579108dd2353afb49f3 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hau +include: afrimmlu_direct +task: afrimmlu_direct_hau_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_ibo.yaml 
b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..43a88fe6c13d804531aa7e251fe86c0102562bc9 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ibo +include: afrimmlu_direct +task: afrimmlu_direct_ibo_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_kin.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..977f3ab259efac5e6afcccdfd44e04279651ad18 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kin +include: afrimmlu_direct +task: afrimmlu_direct_kin_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_lin.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2d25584a3fe0d9c7a73e13ab7a3f1b8652616efd --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_lin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lin +include: afrimmlu_direct +task: afrimmlu_direct_lin_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_lug.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2b4da1a7f550f66c1b3f084879413d2d9fc13641 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_lug.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lug +include: afrimmlu_direct +task: afrimmlu_direct_lug_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_orm.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_orm.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..2738f980d41cea65fa790aa16232a0f6a7584226 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_orm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: orm +include: afrimmlu_direct +task: afrimmlu_direct_orm_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_sna.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..063d111ac4aa0c8625d8615e7b13c1d10ac906fb --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_sna.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sna +include: afrimmlu_direct +task: afrimmlu_direct_sna_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_sot.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6cf6e66d4fb91e3e98ed5b0fe955379bcb31bf26 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_sot.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sot +include: afrimmlu_direct +task: afrimmlu_direct_sot_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_swa.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e90204d40f2fef22b46b190b552cd9f9fcb777b0 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swa +include: afrimmlu_direct +task: afrimmlu_direct_swa_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_twi.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..719ebe9002cc9fab19dfa793153d779ad8ffbee6 --- /dev/null +++ 
b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: twi +include: afrimmlu_direct +task: afrimmlu_direct_twi_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_wol.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8f0f1d0d709b8f0dddd5708b66f6d27a122984d0 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_wol.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: wol +include: afrimmlu_direct +task: afrimmlu_direct_wol_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_xho.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8fc1af4d171021226243c69a59b44363b4a16639 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_xho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: xho +include: afrimmlu_direct +task: afrimmlu_direct_xho_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_yor.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a641b03ae173f2342e8d7178119e68fea2e5f000 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: yor +include: afrimmlu_direct +task: afrimmlu_direct_yor_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_zul.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8c6b493d34a99a4250676c2f0130ceff6b4ea4f8 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_3/afrimmlu_direct_zul.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: zul +include: afrimmlu_direct +task: 
afrimmlu_direct_zul_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_3/utils.py b/lm_eval/tasks/afrimmlu/direct/prompt_3/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..bc3da2e29667b4b25f68757e2169a5c8aa0c8dea --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_3/utils.py @@ -0,0 +1,32 @@ +from lm_eval.utils import weighted_f1_score + + +def doc_to_choice(doc): + choices = eval(doc["choices"]) + return choices + + +def doc_to_text(doc): + output = """You are a subject matter expert in {subject}. + + Utilizing your expertise in {subject}, answer the following multiple-choice question + by picking 'A', 'B', 'C', or 'D'. + +Question: {question} +Choices: + A: {choice1} + B: {choice2} + C: {choice3} + D: {choice4} +Answer: """ + + choices = eval(doc["choices"]) + text = output.format( + subject=doc["subject"], + question=doc["question"], + choice1=choices[0], + choice2=choices[1], + choice3=choices[2], + choice4=choices[3], + ) + return text diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct new file mode 100644 index 0000000000000000000000000000000000000000..c15b7b2fc3991517b15f2c370a246adb907f2e52 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct @@ -0,0 +1,37 @@ +tag: + - afrimmlu_tasks + - afrimmlu_tasks_prompt_4 + - afrobench_mmlu_tasks +dataset_path: masakhane/afrimmlu +dataset_name: null +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_text: !function utils.doc_to_text +doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}" +doc_to_choice: !function utils.doc_to_choice +should_decontaminate: true +doc_to_decontamination_query: "Question: {{question}}\nAnswer:" +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + 
ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_amh.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cc862dc2327d23f394742ee51031e37fc7c97ff1 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: amh +include: afrimmlu_direct +task: afrimmlu_direct_amh_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_eng.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..69baef502b78bce307b77a7c44c8c4323ebc1102 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_eng.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eng +include: afrimmlu_direct +task: afrimmlu_direct_eng_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_ewe.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f5af1074f4b993daa4f2468f60baf83b48bfc470 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_ewe.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ewe +include: afrimmlu_direct +task: afrimmlu_direct_ewe_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_fra.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d1f94eea1e47bd9442dbc76181069bf47ac29da1 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_fra.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py 
+dataset_name: fra +include: afrimmlu_direct +task: afrimmlu_direct_fra_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_hau.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ca8f7c5ed0ae15bc5a5e96c776f2251f2cff06fa --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hau +include: afrimmlu_direct +task: afrimmlu_direct_hau_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_ibo.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8a181d07cc6a81aaf42308fc137c2241a0d8d444 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ibo +include: afrimmlu_direct +task: afrimmlu_direct_ibo_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_kin.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8f86122466a7db1fdc06a2352385fdc1fc78bd69 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kin +include: afrimmlu_direct +task: afrimmlu_direct_kin_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_lin.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3c7d3ecf7a86a68248a49d5fc97947ca8da69b0b --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_lin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lin +include: afrimmlu_direct +task: afrimmlu_direct_lin_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_lug.yaml 
b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..467201319f12257549de2fa3c260591dee13f311 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_lug.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lug +include: afrimmlu_direct +task: afrimmlu_direct_lug_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_orm.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e52668253d495c2583d9f5e964dc73c6850a5729 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_orm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: orm +include: afrimmlu_direct +task: afrimmlu_direct_orm_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_sna.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..af29225a1ca8c63c042d661ce5dc5331a44ed28a --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_sna.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sna +include: afrimmlu_direct +task: afrimmlu_direct_sna_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_sot.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0342dc10b71f69880b3a3352f9d9def10f9815c8 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_sot.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sot +include: afrimmlu_direct +task: afrimmlu_direct_sot_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_swa.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_swa.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..ec9a3525f534ac5ea7ed6817d8f52bc67b57c445 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swa +include: afrimmlu_direct +task: afrimmlu_direct_swa_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_twi.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..83dc916c68120a6a28234dfaad48b71c9cbfdba3 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: twi +include: afrimmlu_direct +task: afrimmlu_direct_twi_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_wol.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e656af2c6697004f7ea94ff638b8b8d4f9f8d549 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_wol.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: wol +include: afrimmlu_direct +task: afrimmlu_direct_wol_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_xho.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ab23d9346400d5ec0eedb7f91f530a04499f163c --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_xho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: xho +include: afrimmlu_direct +task: afrimmlu_direct_xho_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_yor.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0dd0254819a9ef2b6b2b795bbcfad7ed8ef1c314 --- /dev/null +++ 
b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: yor +include: afrimmlu_direct +task: afrimmlu_direct_yor_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_zul.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..98a0937fc74a67a0fae7faa39164f10c288aa3f7 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_4/afrimmlu_direct_zul.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: zul +include: afrimmlu_direct +task: afrimmlu_direct_zul_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_4/utils.py b/lm_eval/tasks/afrimmlu/direct/prompt_4/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..29c23b7f856b2ab4ead359cafbbf404241e53ffb --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_4/utils.py @@ -0,0 +1,28 @@ +from lm_eval.utils import weighted_f1_score + + +def doc_to_choice(doc): + choices = eval(doc["choices"]) + return choices + + +def doc_to_text(doc): + output = """Analyze each question critically and determine the most correct option based on your understanding of the subject matter + +Question: {question} +Choices: + A: {choice1} + B: {choice2} + C: {choice3} + D: {choice4} +Answer: """ + + choices = eval(doc["choices"]) + text = output.format( + question=doc["question"], + choice1=choices[0], + choice2=choices[1], + choice3=choices[2], + choice4=choices[3], + ) + return text diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct new file mode 100644 index 0000000000000000000000000000000000000000..3da1eb827af65c9bcb69dd4af7eab06df848ade2 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct @@ -0,0 +1,37 @@ +tag: + - afrimmlu_tasks + - afrimmlu_tasks_prompt_5 + - afrobench_mmlu_tasks +dataset_path: masakhane/afrimmlu +dataset_name: null 
+output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_text: !function utils.doc_to_text +doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}" +doc_to_choice: !function utils.doc_to_choice +should_decontaminate: true +doc_to_decontamination_query: "Question: {{question}}\nAnswer:" +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_amh.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cff031d7936a5b82bacf175d8d17e54a51d7fe92 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: amh +include: afrimmlu_direct +task: afrimmlu_direct_amh_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_eng.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..52f317982b39dd21a3e11ce6695b95af9ef8f8df --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_eng.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eng +include: afrimmlu_direct +task: afrimmlu_direct_eng_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_ewe.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cef2f86599b589c38e7fb30723a3024b3fcfeffe --- /dev/null +++ 
b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_ewe.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ewe +include: afrimmlu_direct +task: afrimmlu_direct_ewe_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_fra.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..042c0bbbf0c5afc60776127770b88d15c7c7c5ea --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_fra.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fra +include: afrimmlu_direct +task: afrimmlu_direct_fra_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_hau.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cd507182558a8884859713d4fbcf356898d9176c --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hau +include: afrimmlu_direct +task: afrimmlu_direct_hau_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_ibo.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9e9839001ed95c56ae13b1fa97466fbbb39c4acb --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ibo +include: afrimmlu_direct +task: afrimmlu_direct_ibo_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_kin.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1d157559f8c8ea7172bdf64a78506e02830ee633 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kin +include: afrimmlu_direct +task: 
afrimmlu_direct_kin_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_lin.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6eca1f8e7ce1b442be6b62ec924143d736f97c62 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_lin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lin +include: afrimmlu_direct +task: afrimmlu_direct_lin_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_lug.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..854b160dc4dc6bc4192ab6c2f73a0e8286da6376 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_lug.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lug +include: afrimmlu_direct +task: afrimmlu_direct_lug_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_orm.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9592e585bbe1c62b6b4f2eb25ab02facda7bc242 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_orm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: orm +include: afrimmlu_direct +task: afrimmlu_direct_orm_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_sna.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..51d05c686db303d55507c16f8a31c56ec3222f29 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_sna.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sna +include: afrimmlu_direct +task: afrimmlu_direct_sna_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_sot.yaml 
b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cce0e4607d45d5e302f7642b2343a7b1a1dcf991 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_sot.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sot +include: afrimmlu_direct +task: afrimmlu_direct_sot_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_swa.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c1cd2672b09792c6fd0cf6a9c3c77b83ac5cdbcb --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swa +include: afrimmlu_direct +task: afrimmlu_direct_swa_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_twi.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1e2e258c6d020e6599fe3d8fd92388958fce14b1 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: twi +include: afrimmlu_direct +task: afrimmlu_direct_twi_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_wol.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d721871b35a08b564c04af09ca32349a4433bf93 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_wol.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: wol +include: afrimmlu_direct +task: afrimmlu_direct_wol_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_xho.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_xho.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..1c1507260fd7c79cfde49981a52fffa5b5cc89d2 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_xho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: xho +include: afrimmlu_direct +task: afrimmlu_direct_xho_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_yor.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8f528abb1018753945b954945138084b2d7327ce --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: yor +include: afrimmlu_direct +task: afrimmlu_direct_yor_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_zul.yaml b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ec83abebdb65535e345a1c488c2e2999f798d373 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_5/afrimmlu_direct_zul.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: zul +include: afrimmlu_direct +task: afrimmlu_direct_zul_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/direct/prompt_5/utils.py b/lm_eval/tasks/afrimmlu/direct/prompt_5/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a47ceca967c136c7df7132d826ac51af26039722 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/direct/prompt_5/utils.py @@ -0,0 +1,29 @@ +from lm_eval.utils import weighted_f1_score + + +def doc_to_choice(doc): + choices = eval(doc["choices"]) + return choices + + +def doc_to_text(doc): + output = """Given your proficiency in {subject}, please answer the subsequent multiple-choice question with 'A', 'B', 'C', or 'D'. 
+ +Question: {question} +Choices: + A: {choice1} + B: {choice2} + C: {choice3} + D: {choice4} +Answer: """ + + choices = eval(doc["choices"]) + text = output.format( + subject=doc["subject"], + question=doc["question"], + choice1=choices[0], + choice2=choices[1], + choice3=choices[2], + choice4=choices[3], + ) + return text diff --git a/lm_eval/tasks/afrimmlu/gen_utils.py b/lm_eval/tasks/afrimmlu/gen_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a195b6b5852d35042c14632597762a3965faae07 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/gen_utils.py @@ -0,0 +1,103 @@ +import argparse +import os + +import yaml + + +class FunctionTag: + def __init__(self, value): + self.value = value + + +def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: + """ + Generate a yaml file for each language. + + :param output_dir: The directory to output the files to. + :param overwrite: Whether to overwrite files if they already exist. + """ + err = [] + languages = { + "eng": "English", + "amh": "Amharic", + "ibo": "Igbo", + "fra": "French", + "sna": "chiShona", + "wol": "Wolof", + "ewe": "Ewe", + "lin": "Lingala", + "lug": "Luganda", + "xho": "isiXhosa", + "kin": "Kinyarwanda", + "twi": "Twi", + "zul": "Zulu", + "orm": "Oromo", + "yor": "Yoruba", + "hau": "Hausa", + "sot": "Sesotho", + "swa": "Swahili", + } + + for lang in languages.keys(): + try: + file_name = f"afrimmlu_direct_{lang}.yaml" + task_name = f"afrimmlu_direct_{lang}_{mode}" + yaml_template = "afrimmlu_direct" + if output_dir.split("/")[-1] == "translate": + file_name = f"afrimmlu_translate_{lang}.yaml" + task_name = f"afrimmlu_translate_{lang}_{mode}" + yaml_template = "afrimmlu_translate" + yaml_details = { + "include": yaml_template, + "task": task_name, + "dataset_name": lang, + } + os.makedirs(f"{output_dir}/{mode}", exist_ok=True) + with open( + f"{output_dir}/{mode}/{file_name}", + "w" if overwrite else "x", + encoding="utf8", + ) as f: + f.write("# Generated by 
utils.py\n") + yaml.dump( + yaml_details, + f, + allow_unicode=True, + ) + except FileExistsError: + err.append(file_name) + + if len(err) > 0: + raise FileExistsError( + "Files were not created because they already exist (use --overwrite flag):" + f" {', '.join(err)}" + ) + + +def main() -> None: + """Parse CLI args and generate language-specific yaml files.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--overwrite", + default=True, + action="store_true", + help="Overwrite files if they already exist", + ) + parser.add_argument( + "--output-dir", + default="./direct", + help="Directory to write yaml files to", + ) + parser.add_argument( + "--mode", + default="prompt_4", + choices=["prompt_1", "prompt_2", "prompt_3", "prompt_4", "prompt_5"], + help="Prompt number", + ) + args = parser.parse_args() + + gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite, mode=args.mode) + + +if __name__ == "__main__": + main() diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_common_translate_yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_common_translate_yaml deleted file mode 100644 index fad9467833b401251480e02d8449ff97c1280a3a..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_common_translate_yaml +++ /dev/null @@ -1,34 +0,0 @@ -tag: - - afrimmlu_translate -task: null -dataset_path: masakhane/afrimmlu-translate-test -dataset_name: null -output_type: multiple_choice -test_split: test -doc_to_text: !function utils.doc_to_text -doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}" -doc_to_choice: !function utils.doc_to_choice -should_decontaminate: true -doc_to_decontamination_query: "Question: {{question}}\nAnswer:" -metric_list: - - metric: f1 - aggregation: !function utils.weighted_f1_score - # aggregation: mean - average: weighted - hf_evaluate: true - higher_is_better: True - ignore_case: true - ignore_punctuation: true - regexes_to_ignore: - - "," - - "\\$" - - metric: acc - aggregation: mean 
- higher_is_better: true - ignore_case: true - ignore_punctuation: true - regexes_to_ignore: - - "," - - "\\$" -metadata: - version: 1.0 diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_amh.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_amh.yaml deleted file mode 100644 index ac88ffa9500701e8bbb2b5c64d1f4c9f2ec856bc..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_amh.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: amh -include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_amh diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_eng.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_eng.yaml deleted file mode 100644 index 0be98beedd86223dd14c1abbf51dbe93c7ff658a..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_eng.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: eng -include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_eng diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_ewe.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_ewe.yaml deleted file mode 100644 index 624342b91f383479c7ef340bfb80ce305608cf61..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_ewe.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: ewe -include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_ewe diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_fra.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_fra.yaml deleted file mode 100644 index c4fd7e1fc774b6dd987e6c35d3a3fadbf6d577c4..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_fra.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: fra -include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_fra diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_hau.yaml 
b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_hau.yaml deleted file mode 100644 index aaeb415fa2a00516ea3a84133066b7eae009f017..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_hau.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: hau -include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_hau diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_ibo.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_ibo.yaml deleted file mode 100644 index 93fb24e8c3fa799a41c022a708748bb5e7341631..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_ibo.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: ibo -include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_ibo diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_kin.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_kin.yaml deleted file mode 100644 index f39f666840626dcf6ea61a196be702ec1c3e3308..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_kin.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: kin -include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_kin diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_lin.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_lin.yaml deleted file mode 100644 index c935ee47382973e3dbe833987ea083bd3023b5cd..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_lin.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: lin -include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_lin diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_lug.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_lug.yaml deleted file mode 100644 index 72e4bce0113c8473eabf68a7d2e43ba2eabc965c..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_lug.yaml +++ 
/dev/null @@ -1,3 +0,0 @@ -dataset_name: lug -include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_lug diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_orm.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_orm.yaml deleted file mode 100644 index 3ff902499480d35576cb84453406a5d484349816..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_orm.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: orm -include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_orm diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_sna.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_sna.yaml deleted file mode 100644 index 9979740a9bf6194d9a9c4db0f0b4845312f1aed7..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_sna.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: sna -include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_sna diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_sot.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_sot.yaml deleted file mode 100644 index deb2b9b81d544140bfa7e720d0b544089b39bfcd..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_sot.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: sot -include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_sot diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_swa.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_swa.yaml deleted file mode 100644 index e58d90bc69357a3b9c166e8f29000894daa8b108..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_swa.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: swa -include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_swa diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_twi.yaml 
b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_twi.yaml deleted file mode 100644 index 51a2d26ae0563acda4972b272de4c0d6de81146f..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_twi.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: twi -include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_twi diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_wol.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_wol.yaml deleted file mode 100644 index 006b684782c853a432d9e694abe525aaeb9609ca..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_wol.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: wol -include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_wol diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_xho.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_xho.yaml deleted file mode 100644 index c0bdf4471b2178c67d7f6e1ae9c5fba16b3b7710..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_xho.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: xho -include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_xho diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_yor.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_yor.yaml deleted file mode 100644 index 0e7ba6005b591141dc84efa454196458c1261e8c..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_yor.yaml +++ /dev/null @@ -1,3 +0,0 @@ -dataset_name: yor -include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_yor diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_zul.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_zul.yaml deleted file mode 100644 index a18d251cc8f838fa2578019475b089c4b61ecf65..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_zul.yaml +++ 
/dev/null @@ -1,3 +0,0 @@ -dataset_name: zul -include: afrimmlu_common_translate_yaml -task: afrimmlu_translate_zul diff --git a/lm_eval/tasks/afrimmlu/translate/afrimmlu_tt.yaml b/lm_eval/tasks/afrimmlu/translate/afrimmlu_tt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bbbf9387e5563914e7b89a06540d73156c8fb1b9 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/afrimmlu_tt.yaml @@ -0,0 +1,9 @@ +group: afrimmlu_tt-irokobench +task: + - afrimmlu_tt_tasks +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 2 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate new file mode 100644 index 0000000000000000000000000000000000000000..7a974279a3918de90369c391b09de818cb1b483d --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate @@ -0,0 +1,32 @@ +tag: afrimmlu_tt_tasks +dataset_path: masakhane/afrimmlu-translate-test +dataset_name: null +output_type: multiple_choice +test_split: test +doc_to_text: !function utils.doc_to_text +doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}" +doc_to_choice: !function utils.doc_to_choice +should_decontaminate: true +doc_to_decontamination_query: "Question: {{question}}\nAnswer:" +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_amh.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_amh.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..aaaaa6b8b653a2093459dfc1d7649c932b1e1957 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: amh +include: afrimmlu_translate +task: afrimmlu_translate_amh_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_ewe.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45298a1f4ed4c0f101f816b7adc46115da2b3aba --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_ewe.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ewe +include: afrimmlu_translate +task: afrimmlu_translate_ewe_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_fra.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6ac43a8060bf6e3dff94cd141c2bcb02014e8abd --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_fra.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fra +include: afrimmlu_translate +task: afrimmlu_translate_fra_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_hau.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c09424d3812e26e2c8ff8a2bc08eecec7690f9fd --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hau +include: afrimmlu_translate +task: afrimmlu_translate_hau_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_ibo.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_ibo.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..6fe139910f24c12612a64c9633fd2dd625c580cc --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ibo +include: afrimmlu_translate +task: afrimmlu_translate_ibo_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_kin.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c689952307fc07588af39a3c9db587aac74c4389 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kin +include: afrimmlu_translate +task: afrimmlu_translate_kin_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_lin.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e245d7bdd3b96e16b84da7db505d0b005cb165fc --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_lin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lin +include: afrimmlu_translate +task: afrimmlu_translate_lin_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_lug.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bcbac5f65455134c60b873710d7c2e66cf0ac7ca --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_lug.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lug +include: afrimmlu_translate +task: afrimmlu_translate_lug_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_orm.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_orm.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..84b3d2c35042fa836f7450fd48e4654876c0473e --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_orm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: orm +include: afrimmlu_translate +task: afrimmlu_translate_orm_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_sna.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..722ee9526102ab230cab4472842fda9e902f9119 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_sna.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sna +include: afrimmlu_translate +task: afrimmlu_translate_sna_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_sot.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4e8893aa9d996552eb8a830dbaca3c8ad8988ee3 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_sot.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sot +include: afrimmlu_translate +task: afrimmlu_translate_sot_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_swa.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eb89697c67df29132bfb5db2c4d381f24f133e92 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swa +include: afrimmlu_translate +task: afrimmlu_translate_swa_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_twi.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_twi.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..d672f6c768b823615c67ee46c9b48ae87d8177a9 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: twi +include: afrimmlu_translate +task: afrimmlu_translate_twi_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_wol.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9fb5f3709d4bfb2702903eb6b50f36bd9b02c34a --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_wol.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: wol +include: afrimmlu_translate +task: afrimmlu_translate_wol_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_xho.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1a06af041863759d796db54f8276c7722d4abf92 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_xho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: xho +include: afrimmlu_translate +task: afrimmlu_translate_xho_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_yor.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0f5eb7de392b44c98f4d415dff01eb0dda357247 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: yor +include: afrimmlu_translate +task: afrimmlu_translate_yor_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_zul.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_zul.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..ae04b652fe227cecb10f25f8dd998a9ccb50dba3 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_1/afrimmlu_translate_zul.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: zul +include: afrimmlu_translate +task: afrimmlu_translate_zul_prompt_1 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_1/utils.py b/lm_eval/tasks/afrimmlu/translate/prompt_1/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f1bb9162f0fbc68807db68134970ae2636980cbf --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_1/utils.py @@ -0,0 +1,32 @@ +from lm_eval.utils import weighted_f1_score + + +def doc_to_choice(doc): + choices = eval(doc["choices"]) + return choices + + +def doc_to_text(doc): + output = """You are a highly knowledgeable and intelligent artificial intelligence + model answers multiple-choice questions about {subject} + + Question: {question} + + Choices: + A: {choice1} + B: {choice2} + C: {choice3} + D: {choice4} + + Answer: """ + + choices = eval(doc["choices"]) + text = output.format( + subject=doc["subject"], + question=doc["question"], + choice1=choices[0], + choice2=choices[1], + choice3=choices[2], + choice4=choices[3], + ) + return text diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate new file mode 100644 index 0000000000000000000000000000000000000000..7a974279a3918de90369c391b09de818cb1b483d --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate @@ -0,0 +1,32 @@ +tag: afrimmlu_tt_tasks +dataset_path: masakhane/afrimmlu-translate-test +dataset_name: null +output_type: multiple_choice +test_split: test +doc_to_text: !function utils.doc_to_text +doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}" +doc_to_choice: !function utils.doc_to_choice +should_decontaminate: true +doc_to_decontamination_query: "Question: {{question}}\nAnswer:" +metric_list: + - metric: f1 + 
aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_amh.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..283b6ff1798eba1b331edc46036b5cf38e482443 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: amh +include: afrimmlu_translate +task: afrimmlu_translate_amh_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_ewe.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..39e9f7355ace85592c87ad626303f3d4cc89a16b --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_ewe.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ewe +include: afrimmlu_translate +task: afrimmlu_translate_ewe_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_fra.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ced80282a8addde381a1b719e10e871adb2cf533 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_fra.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fra +include: afrimmlu_translate +task: afrimmlu_translate_fra_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_hau.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_hau.yaml 
new file mode 100644 index 0000000000000000000000000000000000000000..0d687cac47605f507f67c4a30d39e975a2a794cb --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hau +include: afrimmlu_translate +task: afrimmlu_translate_hau_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_ibo.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d1edfaa3999a264b903295abfc34d39cdcece572 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ibo +include: afrimmlu_translate +task: afrimmlu_translate_ibo_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_kin.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..48fa15181a1c6e031581cb7e80da907788b90750 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kin +include: afrimmlu_translate +task: afrimmlu_translate_kin_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_lin.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..54163d5ccfad9c3973265750ca52b522cf685c7d --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_lin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lin +include: afrimmlu_translate +task: afrimmlu_translate_lin_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_lug.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_lug.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..d6b709ec0b5d967579097b2251f16e278db940e5 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_lug.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lug +include: afrimmlu_translate +task: afrimmlu_translate_lug_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_orm.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a58330c01ee2da2d2b76b3f700ccdad665450fa --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_orm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: orm +include: afrimmlu_translate +task: afrimmlu_translate_orm_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_sna.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0edd502ec7d909dcde2026f7e2bc0d7747032728 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_sna.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sna +include: afrimmlu_translate +task: afrimmlu_translate_sna_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_sot.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2f499691fb172c8ddc1a89abc1d3ac64c4303286 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_sot.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sot +include: afrimmlu_translate +task: afrimmlu_translate_sot_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_swa.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_swa.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..f267b6d087b15b710a44e8015f2d77070c35853d --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swa +include: afrimmlu_translate +task: afrimmlu_translate_swa_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_twi.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b145669ebb274b39da4826cf7a62c4dd0d72dd35 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: twi +include: afrimmlu_translate +task: afrimmlu_translate_twi_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_wol.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c87a8d61d88ab5109759ac57cc18c1935f9596b0 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_wol.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: wol +include: afrimmlu_translate +task: afrimmlu_translate_wol_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_xho.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7f55271270b73c6c10eb29765ae8852383a2c832 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_xho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: xho +include: afrimmlu_translate +task: afrimmlu_translate_xho_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_yor.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_yor.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..b5cdbc6b28f26ffa7d227618e1adf04aa13f0d8d --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: yor +include: afrimmlu_translate +task: afrimmlu_translate_yor_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_zul.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2ff80402b4d66972d6cbde5aec2eb63163aa35d5 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_2/afrimmlu_translate_zul.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: zul +include: afrimmlu_translate +task: afrimmlu_translate_zul_prompt_2 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_2/utils.py b/lm_eval/tasks/afrimmlu/translate/prompt_2/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e0cfb334c27cfe4c5bbb1ff7126215c0ea9130c9 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_2/utils.py @@ -0,0 +1,30 @@ +from lm_eval.utils import weighted_f1_score + + +def doc_to_choice(doc): + choices = eval(doc["choices"]) + return choices + + +def doc_to_text(doc): + output = """As an expert in {subject}, choose the most accurate answer to the question below. +Your goal is to select the correct option 'A', 'B', 'C', or 'D' by understanding the nuances of the topic. 
+ +Question: {question} +Choices: + A: {choice1} + B: {choice2} + C: {choice3} + D: {choice4} +Answer: """ + + choices = eval(doc["choices"]) + text = output.format( + subject=doc["subject"], + question=doc["question"], + choice1=choices[0], + choice2=choices[1], + choice3=choices[2], + choice4=choices[3], + ) + return text diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate new file mode 100644 index 0000000000000000000000000000000000000000..7a974279a3918de90369c391b09de818cb1b483d --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate @@ -0,0 +1,32 @@ +tag: afrimmlu_tt_tasks +dataset_path: masakhane/afrimmlu-translate-test +dataset_name: null +output_type: multiple_choice +test_split: test +doc_to_text: !function utils.doc_to_text +doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}" +doc_to_choice: !function utils.doc_to_choice +should_decontaminate: true +doc_to_decontamination_query: "Question: {{question}}\nAnswer:" +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_amh.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0377257387ded41038b9d77532536cd5383a02b0 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: amh +include: afrimmlu_translate +task: afrimmlu_translate_amh_prompt_3 diff --git 
a/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_ewe.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aa924284fc298a2171f6cd13e69684d1e26cc166 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_ewe.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ewe +include: afrimmlu_translate +task: afrimmlu_translate_ewe_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_fra.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7bef6e78de5ed0f69cda1e26383e05cc5735c23d --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_fra.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fra +include: afrimmlu_translate +task: afrimmlu_translate_fra_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_hau.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d9841db93cb047f4b5a21118a1c7ce5f036ea1d4 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hau +include: afrimmlu_translate +task: afrimmlu_translate_hau_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_ibo.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..20c58b27ee0e4c3d73affc18d621a8db348f278e --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ibo +include: afrimmlu_translate +task: afrimmlu_translate_ibo_prompt_3 diff --git 
a/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_kin.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ed3cfd16498db252b6c48eda56079b253595cfa7 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kin +include: afrimmlu_translate +task: afrimmlu_translate_kin_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_lin.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9eeb66eaadf15cbeec8cfcbf6664fa6eb74da889 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_lin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lin +include: afrimmlu_translate +task: afrimmlu_translate_lin_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_lug.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..46f722b757ee9acb3520868d428bb51735343687 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_lug.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lug +include: afrimmlu_translate +task: afrimmlu_translate_lug_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_orm.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6f95b375edb041fd68f907a0b00411beb4808e50 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_orm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: orm +include: afrimmlu_translate +task: afrimmlu_translate_orm_prompt_3 diff --git 
a/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_sna.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aa031e5c8a875c5fe75f11efa01dc044a003b1a4 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_sna.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sna +include: afrimmlu_translate +task: afrimmlu_translate_sna_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_sot.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e55ce671f15df0c1478357f4f8dad2fa4a4f09ed --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_sot.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sot +include: afrimmlu_translate +task: afrimmlu_translate_sot_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_swa.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9f507772fb75616c8fe9a44ccd4a9549125046b8 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swa +include: afrimmlu_translate +task: afrimmlu_translate_swa_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_twi.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1ea25d8b7f3ff1a3ce70f0a4ed68d2e999ff2143 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: twi +include: afrimmlu_translate +task: afrimmlu_translate_twi_prompt_3 diff --git 
a/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_wol.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a077d853ed9ff74956449be43bd5f5fc36fac6b --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_wol.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: wol +include: afrimmlu_translate +task: afrimmlu_translate_wol_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_xho.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..18678da825da9e065ed6825d3d7955e30e9c7fd0 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_xho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: xho +include: afrimmlu_translate +task: afrimmlu_translate_xho_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_yor.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..815d219ba75659cf564b691a6c859042b6a397b6 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: yor +include: afrimmlu_translate +task: afrimmlu_translate_yor_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_zul.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f294a873804351a29f63276958d10a963e461519 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_3/afrimmlu_translate_zul.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: zul +include: afrimmlu_translate +task: afrimmlu_translate_zul_prompt_3 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_3/utils.py 
b/lm_eval/tasks/afrimmlu/translate/prompt_3/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a4609d97afdbe27cb5b55da9f057a08a2f73d649 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_3/utils.py @@ -0,0 +1,32 @@ +from lm_eval.utils import weighted_f1_score + + +def doc_to_choice(doc): + choices = eval(doc["choices"]) + return choices + + +def doc_to_text(doc): + output = """You are a subject matter expert in {subject}. + + Utilizing your expertise in {subject}, answer the following multiple-choice question + by picking ''A'', ''B'', ''C'', or ''D''. + +Question: {question} +Choices: + A: {choice1} + B: {choice2} + C: {choice3} + D: {choice4} +Answer: """ + + choices = eval(doc["choices"]) + text = output.format( + subject=doc["subject"], + question=doc["question"], + choice1=choices[0], + choice2=choices[1], + choice3=choices[2], + choice4=choices[3], + ) + return text diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate new file mode 100644 index 0000000000000000000000000000000000000000..7a974279a3918de90369c391b09de818cb1b483d --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate @@ -0,0 +1,32 @@ +tag: afrimmlu_tt_tasks +dataset_path: masakhane/afrimmlu-translate-test +dataset_name: null +output_type: multiple_choice +test_split: test +doc_to_text: !function utils.doc_to_text +doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}" +doc_to_choice: !function utils.doc_to_choice +should_decontaminate: true +doc_to_decontamination_query: "Question: {{question}}\nAnswer:" +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: 
true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_amh.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..90c880241df63265e7c8e7a60228163024394e9c --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: amh +include: afrimmlu_translate +task: afrimmlu_translate_amh_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_ewe.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c63ccfb49c04b4bfbbbf54c2d6a85d3308a64708 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_ewe.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ewe +include: afrimmlu_translate +task: afrimmlu_translate_ewe_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_fra.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..740f87adbe66021a281f516836558a767ab91c68 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_fra.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fra +include: afrimmlu_translate +task: afrimmlu_translate_fra_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_hau.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4f73a2edd89efa810435fa4363fe69a7b0855425 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hau +include: afrimmlu_translate +task: 
afrimmlu_translate_hau_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_ibo.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..424fbab9ad78b940e8c8f252d774ab474a606878 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ibo +include: afrimmlu_translate +task: afrimmlu_translate_ibo_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_kin.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cafcae600b0f4f74160353609f68c6fecf055068 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kin +include: afrimmlu_translate +task: afrimmlu_translate_kin_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_lin.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..618f4aaf77e7694b2265bb1084630c7927460daa --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_lin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lin +include: afrimmlu_translate +task: afrimmlu_translate_lin_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_lug.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fe59cfd265493fb8d54c46793ac69a7d6f5e7279 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_lug.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lug +include: afrimmlu_translate +task: afrimmlu_translate_lug_prompt_4 diff --git 
a/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_orm.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4f25d96c45abc75c963effa1a0dcba504fdb81ce --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_orm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: orm +include: afrimmlu_translate +task: afrimmlu_translate_orm_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_sna.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0943eec102650255d5955fde8b995eea2274c83a --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_sna.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sna +include: afrimmlu_translate +task: afrimmlu_translate_sna_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_sot.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..882117e4c7c10e6b0eabd80531d136189891188f --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_sot.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sot +include: afrimmlu_translate +task: afrimmlu_translate_sot_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_swa.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..92928aae81f1bc2f27e99b6e5ad20401990a102f --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swa +include: afrimmlu_translate +task: afrimmlu_translate_swa_prompt_4 diff --git 
a/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_twi.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8d093262712cefd637aa1861887ff92ddd077270 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: twi +include: afrimmlu_translate +task: afrimmlu_translate_twi_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_wol.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..66161c7dd6a149ab713ae6f92f595b097c6ba794 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_wol.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: wol +include: afrimmlu_translate +task: afrimmlu_translate_wol_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_xho.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..25ff91f0004a504f48b1ed9c0ea3b000a7c7fcf2 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_xho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: xho +include: afrimmlu_translate +task: afrimmlu_translate_xho_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_yor.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..54743fddbc83195064abc456b8dca0ff750c6fe5 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: yor +include: afrimmlu_translate +task: afrimmlu_translate_yor_prompt_4 diff --git 
a/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_zul.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ebd80f22645d4cb5dfbd584b6fa60a45edaf3f44 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_4/afrimmlu_translate_zul.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: zul +include: afrimmlu_translate +task: afrimmlu_translate_zul_prompt_4 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_4/utils.py b/lm_eval/tasks/afrimmlu/translate/prompt_4/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..29c23b7f856b2ab4ead359cafbbf404241e53ffb --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_4/utils.py @@ -0,0 +1,28 @@ +from lm_eval.utils import weighted_f1_score + + +def doc_to_choice(doc): + choices = eval(doc["choices"]) + return choices + + +def doc_to_text(doc): + output = """Analyze each question critically and determine the most correct option based on your understanding of the subject matter + +Question: {question} +Choices: + A: {choice1} + B: {choice2} + C: {choice3} + D: {choice4} +Answer: """ + + choices = eval(doc["choices"]) + text = output.format( + question=doc["question"], + choice1=choices[0], + choice2=choices[1], + choice3=choices[2], + choice4=choices[3], + ) + return text diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate new file mode 100644 index 0000000000000000000000000000000000000000..7a974279a3918de90369c391b09de818cb1b483d --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate @@ -0,0 +1,32 @@ +tag: afrimmlu_tt_tasks +dataset_path: masakhane/afrimmlu-translate-test +dataset_name: null +output_type: multiple_choice +test_split: test +doc_to_text: !function utils.doc_to_text +doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}" +doc_to_choice: !function 
utils.doc_to_choice +should_decontaminate: true +doc_to_decontamination_query: "Question: {{question}}\nAnswer:" +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_amh.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4b5ebb387b40e7a12a103a72ee43b3f711de9a7f --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: amh +include: afrimmlu_translate +task: afrimmlu_translate_amh_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_ewe.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4860b79c529b23f7827c6b25527e0070b32c36bc --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_ewe.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ewe +include: afrimmlu_translate +task: afrimmlu_translate_ewe_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_fra.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..192f3423797a807c355fc21bb4c1e9137cc31b72 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_fra.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fra +include: afrimmlu_translate +task: afrimmlu_translate_fra_prompt_5 diff --git 
a/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_hau.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1fc6aafbb4d14111f18e31957c888fcd773acdef --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hau +include: afrimmlu_translate +task: afrimmlu_translate_hau_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_ibo.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a6cca83cd7eb6b765b5512827044547e01810813 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ibo +include: afrimmlu_translate +task: afrimmlu_translate_ibo_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_kin.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6acd743a77e5b04b5df5ddf8861984f206697542 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kin +include: afrimmlu_translate +task: afrimmlu_translate_kin_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_lin.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f62c43f601a2d0e48b462fbf3cc4c1ce09760c50 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_lin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lin +include: afrimmlu_translate +task: afrimmlu_translate_lin_prompt_5 diff --git 
a/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_lug.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9b8a97f30edee493743ed39a4ff2e8ede1b1ab4c --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_lug.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lug +include: afrimmlu_translate +task: afrimmlu_translate_lug_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_orm.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0eebc1bdf290e775f6e2a7bba8601a7d980ed884 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_orm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: orm +include: afrimmlu_translate +task: afrimmlu_translate_orm_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_sna.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..60d4d57f5bb6d793467316ac4fe2c2c97d055289 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_sna.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sna +include: afrimmlu_translate +task: afrimmlu_translate_sna_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_sot.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eeddbb7b71328018a6ecdc1317c37c288a4f1806 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_sot.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sot +include: afrimmlu_translate +task: afrimmlu_translate_sot_prompt_5 diff --git 
a/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_swa.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..24a3b78e2d1c5b1323a335cbe20034a7a7e4b0b7 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swa +include: afrimmlu_translate +task: afrimmlu_translate_swa_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_twi.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ab3ea80c0610b333c11b67712e6bc5284994bbed --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: twi +include: afrimmlu_translate +task: afrimmlu_translate_twi_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_wol.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2cd495e81df06383612c1278b18faeb0ac5c567f --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_wol.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: wol +include: afrimmlu_translate +task: afrimmlu_translate_wol_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_xho.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a9af97c083d1230687a6ea6a01c29a3024a79760 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_xho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: xho +include: afrimmlu_translate +task: afrimmlu_translate_xho_prompt_5 diff --git 
a/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_yor.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6e227431566f0c5187e0baeeaeb1e82838db0469 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: yor +include: afrimmlu_translate +task: afrimmlu_translate_yor_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_zul.yaml b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a08884b8f40dc91798bdbba0ddf066862e8a2d76 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_5/afrimmlu_translate_zul.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: zul +include: afrimmlu_translate +task: afrimmlu_translate_zul_prompt_5 diff --git a/lm_eval/tasks/afrimmlu/translate/prompt_5/utils.py b/lm_eval/tasks/afrimmlu/translate/prompt_5/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..147225bb70653d67663ac1762a7cd6246c4e9f22 --- /dev/null +++ b/lm_eval/tasks/afrimmlu/translate/prompt_5/utils.py @@ -0,0 +1,28 @@ +from lm_eval.utils import weighted_f1_score + + +def doc_to_choice(doc): + choices = eval(doc["choices"]) + return choices + + +def doc_to_text(doc): + output = """Given your proficiency in {subject}, please answer the subsequent multiple-choice question with 'A', 'B', 'C', or 'D'. 
+Question: {question} +Choices: + A: {choice1} + B: {choice2} + C: {choice3} + D: {choice4} +Answer: """ + + choices = eval(doc["choices"]) + text = output.format( + subject=doc["subject"], + question=doc["question"], + choice1=choices[0], + choice2=choices[1], + choice3=choices[2], + choice4=choices[3], + ) + return text diff --git a/lm_eval/tasks/afrimmlu/translate/utils.py b/lm_eval/tasks/afrimmlu/translate/utils.py deleted file mode 100644 index 9d02b342b2e3c9f3d3bd66d3f62330aa53c9159c..0000000000000000000000000000000000000000 --- a/lm_eval/tasks/afrimmlu/translate/utils.py +++ /dev/null @@ -1,32 +0,0 @@ -from lm_eval.utils import weighted_f1_score - - -def doc_to_choice(doc): - choices = eval(doc["choices"]) - return choices - - -def doc_to_text(doc): - output = """You are a highly knowledgeable and intelligent artificial intelligence - model answers multiple-choice questions about '{subject}' - - Question: '''{question}''' - - Choices: - A: ''{choice1}''' - B: ''{choice2}''' - C: ''{choice3}''' - D: ''{choice4}''' - - Answer: """ - - choices = eval(doc["choices"]) - text = output.format( - subject=doc["subject"], - question=doc["question"], - choice1=choices[0], - choice2=choices[1], - choice3=choices[2], - choice4=choices[3], - ) - return text diff --git a/lm_eval/tasks/afrixnli/direct/afrixnli.yaml b/lm_eval/tasks/afrixnli/direct/afrixnli.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d85ccd128f752f7a1ab566aa28e90d5bbf545b66 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/afrixnli.yaml @@ -0,0 +1,13 @@ +group: afrixnli-irokobench +task: + - afrixnli_tasks_prompt_1 + - afrixnli_tasks_prompt_2 + - afrixnli_tasks_prompt_3 + - afrixnli_tasks_prompt_4 + - afrixnli_tasks_prompt_5 +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 2 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_amh.yaml b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_amh.yaml new file mode 100644 
index 0000000000000000000000000000000000000000..39f727b4ccb7c07eb0b2f6b8d2472764446767d4 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_amh.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' +include: afrixnli_yaml +task: afrixnli_amh_prompt_1 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_eng.yaml b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..593c57a34ec0f01d3c03e447acda48cd1644231b --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_eng.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: eng +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' +include: afrixnli_yaml +task: afrixnli_eng_prompt_1 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_ewe.yaml b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b6a10baae753575ebb228a6df34e4faf364efea1 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_ewe.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. 
+ + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' +include: afrixnli_yaml +task: afrixnli_ewe_prompt_1 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_fra.yaml b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..08b2b5243633f276487d8d5595382211870eedf9 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_fra.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: fra +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' +include: afrixnli_yaml +task: afrixnli_fra_prompt_1 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_hau.yaml b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fe234b72694fdfde8474863d81265e851a350368 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_hau.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' 
+include: afrixnli_yaml +task: afrixnli_hau_prompt_1 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_ibo.yaml b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d282e0e5f84433b77f8954acaa4e65c9ccbf5ba4 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_ibo.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' +include: afrixnli_yaml +task: afrixnli_ibo_prompt_1 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_kin.yaml b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cfdff6c8c64e6a91a869386f71b6f6024c0ac156 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_kin.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' 
+include: afrixnli_yaml +task: afrixnli_kin_prompt_1 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_lin.yaml b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..410cb29f80366d78d0ec4fb9e240a9df0ec20372 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_lin.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: lin +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' +include: afrixnli_yaml +task: afrixnli_lin_prompt_1 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_lug.yaml b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5665e37cce68d072cf8b64be5a787aed23fd70b --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_lug.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' 
+include: afrixnli_yaml +task: afrixnli_lug_prompt_1 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_orm.yaml b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..12751c7f93de40a8bc91431de573be9f99868ab4 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_orm.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' +include: afrixnli_yaml +task: afrixnli_orm_prompt_1 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_sna.yaml b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d00bbb6f9d146effdf5de3112db4c56f79002166 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_sna.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' 
+include: afrixnli_yaml +task: afrixnli_sna_prompt_1 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_sot.yaml b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2ae346aed8ff8e53b94252385443b54fe4364595 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_sot.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: sot +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' +include: afrixnli_yaml +task: afrixnli_sot_prompt_1 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_swa.yaml b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ca6729bf80cad6e95027c7c0e994cd1da14d0d6d --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_swa.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' 
+include: afrixnli_yaml +task: afrixnli_swa_prompt_1 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_twi.yaml b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7dc85428dab2ed5da0cb6fa17b0a428088f346f1 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_twi.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' +include: afrixnli_yaml +task: afrixnli_twi_prompt_1 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_wol.yaml b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..78ef254aeed1d393efca6390fa574aa41eac5f21 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_wol.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' 
+include: afrixnli_yaml +task: afrixnli_wol_prompt_1 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_xho.yaml b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cb0a8527741f4b8f3ac1fb7c2741a6cf5e2c64ae --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_xho.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' +include: afrixnli_yaml +task: afrixnli_xho_prompt_1 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_yaml b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_yaml new file mode 100644 index 0000000000000000000000000000000000000000..81c9eeaa5af0740cc32122519f671c4d0425c080 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_yaml @@ -0,0 +1,30 @@ +tag: + - afrixnli_tasks + - afrixnli_tasks_prompt_1 +dataset_path: masakhane/afrixnli +dataset_name: null +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_target: !function utils.doc_to_target +doc_to_choice: + - "entailment" + - "neutral" + - "contradiction" +should_decontaminate: true +doc_to_decontamination_query: premise +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + average: weighted + higher_is_better: True + ignore_case: true + ignore_punctuation: true + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_yor.yaml b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_yor.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..473aea37a7b036d7ef219eca756482cd2bac754b --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_yor.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' +include: afrixnli_yaml +task: afrixnli_yor_prompt_1 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_zul.yaml b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa07a8c991d56a0cef3dc8453017649952715f8a --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_1/afrixnli_zul.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' +include: afrixnli_yaml +task: afrixnli_zul_prompt_1 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_1/utils.py b/lm_eval/tasks/afrixnli/direct/prompt_1/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d97a0a288508e817ab695e637fb157a08c813808 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_1/utils.py @@ -0,0 +1,19 @@ +from lm_eval.utils import weighted_f1_score + + +def doc_to_text(doc): + output = """Please identify whether the premise entails or contradicts the hypothesis in the following premise + and hypothesis. The answer should be exact entailment, contradiction, or neutral. 
+ + Premise: {premise} + Hypothesis: {hypothesis} + + Is it entailment, contradiction, or neutral?""" + + text = output.format(premise=doc["premise"], hypothesis=doc["hypothesis"]) + return text + + +def doc_to_target(doc): + replacements = {0: "entailment", 1: "neutral", 2: "contradiction"} + return replacements[doc["label"]] diff --git a/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_amh.yaml b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fbf916b25d43db2fdc476c27fe5f2e8e02c45625 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: amh +include: afrixnli_yaml +task: afrixnli_amh_prompt_2 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_eng.yaml b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dfa8ebfe8815a58c1a043b328a22f762a739b9d2 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_eng.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: eng +include: afrixnli_yaml +task: afrixnli_eng_prompt_2 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_ewe.yaml b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..995ef3e65894548266a72f45417222e2760e30fa --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_ewe.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ewe +include: afrixnli_yaml +task: afrixnli_ewe_prompt_2 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_fra.yaml b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ce72588c19901c04ee82479206f54816fa358915 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_fra.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fra 
+include: afrixnli_yaml +task: afrixnli_fra_prompt_2 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_hau.yaml b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..369ee58bedfd98fd95c63510c3a84eec10238df0 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hau +include: afrixnli_yaml +task: afrixnli_hau_prompt_2 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_ibo.yaml b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e118c613ebf2d0388298fe6ba750923816ba4af6 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ibo +include: afrixnli_yaml +task: afrixnli_ibo_prompt_2 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_kin.yaml b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..81f6d803d6762f5a6b86dae00ec0b26040a943ac --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kin +include: afrixnli_yaml +task: afrixnli_kin_prompt_2 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_lin.yaml b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2d99c2eb57aedcce988f37415c414882d5bb4186 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_lin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lin +include: afrixnli_yaml +task: afrixnli_lin_prompt_2 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_lug.yaml b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_lug.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..31325539e1dd778da5e057436aeb3b60d7531a58 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_lug.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lug +include: afrixnli_yaml +task: afrixnli_lug_prompt_2 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_orm.yaml b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c4ad555afafe2b99470d706e0eb46dc8256037fb --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_orm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: orm +include: afrixnli_yaml +task: afrixnli_orm_prompt_2 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_sna.yaml b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a780b0c428c822c08d5bb16dd909cb883da494a2 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_sna.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sna +include: afrixnli_yaml +task: afrixnli_sna_prompt_2 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_sot.yaml b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..94e78880d31b48ce8dc4e562a9d9cc3643208535 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_sot.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sot +include: afrixnli_yaml +task: afrixnli_sot_prompt_2 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_swa.yaml b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8622e2833c24290007405dc90043f0c5b6ced7ad --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swa +include: afrixnli_yaml +task: afrixnli_swa_prompt_2 diff --git 
a/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_twi.yaml b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4219b81ee8a1de24535cf2cd6eae4643e660d0de --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: twi +include: afrixnli_yaml +task: afrixnli_twi_prompt_2 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_wol.yaml b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..546b17904959bd83c1f26617a9a13b79fc654a55 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_wol.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: wol +include: afrixnli_yaml +task: afrixnli_wol_prompt_2 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_xho.yaml b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..649c61df93eef6b6828d7da9c5a672bc5fce9611 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_xho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: xho +include: afrixnli_yaml +task: afrixnli_xho_prompt_2 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_yaml b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_yaml new file mode 100644 index 0000000000000000000000000000000000000000..cfab642bf9175d3066879680618e95f097a609a2 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_yaml @@ -0,0 +1,34 @@ +tag: + - afrixnli_tasks + - afrixnli_tasks_prompt_2 +dataset_path: masakhane/afrixnli +dataset_name: null +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_text: "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither?\nAnswer:" +# True = entailment +# False = contradiction +# Neither = neutral +doc_to_target: !function 
utils.doc_to_target +doc_to_choice: + - "True" + - "Neither" + - "False" +should_decontaminate: true +doc_to_decontamination_query: premise +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + average: weighted + higher_is_better: True + ignore_case: true + ignore_punctuation: true + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_yor.yaml b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..53f23ace6bbb25c6457f7bd4e5b760b7ebb8b298 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: yor +include: afrixnli_yaml +task: afrixnli_yor_prompt_2 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_zul.yaml b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dd89fe131e26f4b0d1dedb1b86aeac72f8f706d6 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_2/afrixnli_zul.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: zul +include: afrixnli_yaml +task: afrixnli_zul_prompt_2 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_2/utils.py b/lm_eval/tasks/afrixnli/direct/prompt_2/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5d1ac19e19b2e855c957e75f1c778366dfbc7e55 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_2/utils.py @@ -0,0 +1,6 @@ +from lm_eval.utils import weighted_f1_score + + +def doc_to_target(doc): + replacements = {0: "True", 1: "Neither", 2: "False"} + return replacements[doc["label"]] diff --git a/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_amh.yaml b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_amh.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..3ff9f99c187a914fde7514c7c4caf49cb63c4186 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_amh.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: "Given the following premise and hypothesis in Amharic, identify if the\ + \ premise entails, contradicts, or is neutral towards the hypothesis. Please respond\ + \ with exact 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_amh_prompt_3 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_eng.yaml b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a53aea6dbd9c3fedd9a812fc8f698b5f16d41bf3 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_eng.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: eng +doc_to_text: "Given the following premise and hypothesis in English, identify if the\ + \ premise entails, contradicts, or is neutral towards the hypothesis. Please respond\ + \ with exact 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_eng_prompt_3 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_ewe.yaml b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..54b58ae6e972774be55be9988a20d6962a7e56ff --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_ewe.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_text: "Given the following premise and hypothesis in Ewe, identify if the premise\ + \ entails, contradicts, or is neutral towards the hypothesis. Please respond with\ + \ exact 'entailment', 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}} \n\ + Hypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_ewe_prompt_3 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_fra.yaml b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fedb519ec8421d77aedb24215de643544614bf70 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_fra.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: fra +doc_to_text: "Given the following premise and hypothesis in French, identify if the\ + \ premise entails, contradicts, or is neutral towards the hypothesis. Please respond\ + \ with exact 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_fra_prompt_3 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_hau.yaml b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3a9ebb95181426ab8a9138267a8400c37825e76e --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_hau.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: "Given the following premise and hypothesis in Hausa, identify if the\ + \ premise entails, contradicts, or is neutral towards the hypothesis. Please respond\ + \ with exact 'entailment', 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_hau_prompt_3 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_ibo.yaml b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6b61f7678a3ab596bbda1b6039129e2e71b6bda6 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_ibo.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: "Given the following premise and hypothesis in Igbo, identify if the\ + \ premise entails, contradicts, or is neutral towards the hypothesis. Please respond\ + \ with exact 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_ibo_prompt_3 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_kin.yaml b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1bd0829bfaf89354c5814eeffe1d4de8432fa540 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_kin.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: "Given the following premise and hypothesis in Kinyarwanda, identify\ + \ if the premise entails, contradicts, or is neutral towards the hypothesis. Please\ + \ respond with exact 'entailment', 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_kin_prompt_3 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_lin.yaml b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..697c439fa18a1ecd80e41fca14bc0836d956cfde --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_lin.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: lin +doc_to_text: "Given the following premise and hypothesis in Lingala, identify if the\ + \ premise entails, contradicts, or is neutral towards the hypothesis. Please respond\ + \ with exact 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_lin_prompt_3 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_lug.yaml b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1b5667c0720473f0ab7703b17777b5ced0459381 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_lug.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: "Given the following premise and hypothesis in Luganda, identify if the\ + \ premise entails, contradicts, or is neutral towards the hypothesis. Please respond\ + \ with exact 'entailment', 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_lug_prompt_3 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_orm.yaml b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..37a6d843e51870f6ad2845e1f385b6aacef2fab2 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_orm.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: "Given the following premise and hypothesis in Oromo, identify if the\ + \ premise entails, contradicts, or is neutral towards the hypothesis. Please respond\ + \ with exact 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_orm_prompt_3 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_sna.yaml b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c7e0f0b05000c40fbebca19921a2486a57d8054a --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_sna.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: "Given the following premise and hypothesis in chiShona, identify if\ + \ the premise entails, contradicts, or is neutral towards the hypothesis. Please\ + \ respond with exact 'entailment', 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_sna_prompt_3 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_sot.yaml b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0c0ccd9e64ee0b807fef742c873126666327f276 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_sot.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: sot +doc_to_text: "Given the following premise and hypothesis in Sesotho, identify if the\ + \ premise entails, contradicts, or is neutral towards the hypothesis. Please respond\ + \ with exact 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_sot_prompt_3 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_swa.yaml b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dabd96ef2e1cbf6df83432cc057382d40e448ff7 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_swa.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: "Given the following premise and hypothesis in Swahili, identify if the\ + \ premise entails, contradicts, or is neutral towards the hypothesis. Please respond\ + \ with exact 'entailment', 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_swa_prompt_3 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_twi.yaml b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4d3158d45209fe96a7e0b7520d065c9108a0798b --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_twi.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: "Given the following premise and hypothesis in Twi, identify if the premise\ + \ entails, contradicts, or is neutral towards the hypothesis. Please respond with\ + \ exact 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}} \n\ + Hypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_twi_prompt_3 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_wol.yaml b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..51fbdc79b0987386839599a3521bb2d564256e83 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_wol.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: "Given the following premise and hypothesis in Wolof, identify if the\ + \ premise entails, contradicts, or is neutral towards the hypothesis. Please respond\ + \ with exact 'entailment', 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_wol_prompt_3 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_xho.yaml b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..00ca9d17934256c4169ba0da95f7a604c60ac037 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_xho.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: "Given the following premise and hypothesis in isiXhosa, identify if\ + \ the premise entails, contradicts, or is neutral towards the hypothesis. Please\ + \ respond with exact 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_xho_prompt_3 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_yaml b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_yaml new file mode 100644 index 0000000000000000000000000000000000000000..04609ac3c424b323858858fdbffbd83ccec52b7e --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_yaml @@ -0,0 +1,30 @@ +tag: + - afrixnli_tasks + - afrixnli_tasks_prompt_3 +dataset_path: masakhane/afrixnli +dataset_name: null +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_target: !function utils.doc_to_target +doc_to_choice: + - "entailment" + - "neutral" + - "contradiction" +should_decontaminate: true +doc_to_decontamination_query: premise +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + average: weighted + higher_is_better: True + ignore_case: true + ignore_punctuation: true + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_yor.yaml b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_yor.yaml new file mode 
100644 index 0000000000000000000000000000000000000000..6d8b2f847f473ad2d1857e924495e98fffbc9edd --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_yor.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: "Given the following premise and hypothesis in Yoruba, identify if the\ + \ premise entails, contradicts, or is neutral towards the hypothesis. Please respond\ + \ with exact 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_yor_prompt_3 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_zul.yaml b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..83b87141b4021baad581be7d6b60375c40ef73c5 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_3/afrixnli_zul.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: "Given the following premise and hypothesis in Zulu, identify if the\ + \ premise entails, contradicts, or is neutral towards the hypothesis. Please respond\ + \ with exact 'entailment', 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_zul_prompt_3 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_3/utils.py b/lm_eval/tasks/afrixnli/direct/prompt_3/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..422ed169bffa2777cd91307c3ab097619e4d5399 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_3/utils.py @@ -0,0 +1,6 @@ +from lm_eval.utils import weighted_f1_score + + +def doc_to_target(doc): + replacements = {0: "entailment", 1: "neutral", 2: "contradiction"} + return replacements[doc["label"]] diff --git a/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_amh.yaml b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..63b05465144af310263939fea2b8335672dbb7ae --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_amh.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Amharic language.\nAnalyze the premise and hypothesis given in Amharic, and\ + \ determine the relationship between them.\n Respond with one of the following options:\ + \ 'entailment', 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}} \nHypothesis:\ + \ {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_amh_prompt_4 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_eng.yaml b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1ecb06d10497274dde56ab73302525add553254a --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_eng.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: eng +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the English language.\nAnalyze the premise and hypothesis given in English, and\ + \ determine the relationship between them.\n Respond with one of the following options:\ + \ 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}} \nHypothesis:\ + \ {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_eng_prompt_4 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_ewe.yaml b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..64157b549956f932b3ad5bd2f67610378d683596 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_ewe.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Ewe language.\nAnalyze the premise and hypothesis given in Ewe, and determine\ + \ the relationship between them.\n Respond with one of the following options: 'entailment',\ + \ 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}} \nHypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_ewe_prompt_4 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_fra.yaml b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..78da10cf7e04482eef5c0f9517a176196681c103 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_fra.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: fra +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the French language.\nAnalyze the premise and hypothesis given in French, and\ + \ determine the relationship between them.\n Respond with one of the following options:\ + \ 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}} \nHypothesis:\ + \ {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_fra_prompt_4 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_hau.yaml b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..811a0fca1364f55e5ba3dfe37e7d9c99e7090e6a --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_hau.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Hausa language.\nAnalyze the premise and hypothesis given in Hausa, and determine\ + \ the relationship between them.\n Respond with one of the following options: 'entailment',\ + \ 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}} \nHypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_hau_prompt_4 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_ibo.yaml b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..73fdba2fba55cc9af3bb802e50562de8ceb9a97e --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_ibo.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Igbo language.\nAnalyze the premise and hypothesis given in Igbo, and determine\ + \ the relationship between them.\n Respond with one of the following options: 'entailment',\ + \ 'contradiction', or 'neutral'. \n\nPremise: {{premise}} \nHypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_ibo_prompt_4 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_kin.yaml b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f975d82b4a3b2ee0898aa8cc9aec225b3bb26e2d --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_kin.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Kinyarwanda language.\nAnalyze the premise and hypothesis given in Kinyarwanda,\ + \ and determine the relationship between them.\n Respond with one of the following\ + \ options: 'entailment', 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_kin_prompt_4 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_lin.yaml b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..63062ac444bcd71951152416384ae5852510decb --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_lin.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: lin +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Lingala language.\nAnalyze the premise and hypothesis given in Lingala, and\ + \ determine the relationship between them.\n Respond with one of the following options:\ + \ 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}} \nHypothesis:\ + \ {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_lin_prompt_4 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_lug.yaml b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1553c620009ec7374378e584e5f7523ff6d57306 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_lug.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Luganda language.\nAnalyze the premise and hypothesis given in Luganda, and\ + \ determine the relationship between them.\n Respond with one of the following options:\ + \ 'entailment', 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}} \nHypothesis:\ + \ {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_lug_prompt_4 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_orm.yaml b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ba2a377b7fb03f6cd1546fe8f1b65549d2133d6c --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_orm.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Oromo language.\nAnalyze the premise and hypothesis given in Oromo, and determine\ + \ the relationship between them.\n Respond with one of the following options: 'entailment',\ + \ 'contradiction', or 'neutral'. \n\nPremise: {{premise}} \nHypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_orm_prompt_4 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_sna.yaml b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..afce6e955b21eb514b4dfd024d7a8d115a3377ba --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_sna.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the chiShona language.\nAnalyze the premise and hypothesis given in chiShona,\ + \ and determine the relationship between them.\n Respond with one of the following\ + \ options: 'entailment', 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_sna_prompt_4 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_sot.yaml b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..40c7cf8476813d29347fdd8e14785cb61a48c172 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_sot.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: sot +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Sesotho language.\nAnalyze the premise and hypothesis given in Sesotho, and\ + \ determine the relationship between them.\n Respond with one of the following options:\ + \ 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}} \nHypothesis:\ + \ {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_sot_prompt_4 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_swa.yaml b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1c28aaae79accde1d796fb5dff56f8998130df0b --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_swa.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Swahili language.\nAnalyze the premise and hypothesis given in Swahili, and\ + \ determine the relationship between them.\n Respond with one of the following options:\ + \ 'entailment', 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}} \nHypothesis:\ + \ {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_swa_prompt_4 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_twi.yaml b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f9835314e7fc979010c80e6be80ebd616eb3abff --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_twi.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Twi language.\nAnalyze the premise and hypothesis given in Twi, and determine\ + \ the relationship between them.\n Respond with one of the following options: 'entailment',\ + \ 'contradiction', or 'neutral'. \n\nPremise: {{premise}} \nHypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_twi_prompt_4 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_wol.yaml b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6b535bc2d45555821810abb91755fb2afbae9bd1 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_wol.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Wolof language.\nAnalyze the premise and hypothesis given in Wolof, and determine\ + \ the relationship between them.\n Respond with one of the following options: 'entailment',\ + \ 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}} \nHypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_wol_prompt_4 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_xho.yaml b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45f55e0e1441fe40f0a6fcecec0309d9c3013dc3 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_xho.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the isiXhosa language.\nAnalyze the premise and hypothesis given in isiXhosa,\ + \ and determine the relationship between them.\n Respond with one of the following\ + \ options: 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_xho_prompt_4 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_yaml b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_yaml new file mode 100644 index 0000000000000000000000000000000000000000..fe5de1a6dd271c23b14712b09ab070ec848b753b --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_yaml @@ -0,0 +1,30 @@ +tag: + - afrixnli_tasks + - afrixnli_tasks_prompt_4 +dataset_path: masakhane/afrixnli +dataset_name: null +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_target: !function utils.doc_to_target +doc_to_choice: + - "entailment" + - "neutral" + - "contradiction" +should_decontaminate: true +doc_to_decontamination_query: premise +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + average: weighted + higher_is_better: True + ignore_case: true + ignore_punctuation: true + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_yor.yaml 
b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..63d4f60642c71a1d881af27291a73e04b4abca34 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_yor.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Yoruba language.\nAnalyze the premise and hypothesis given in Yoruba, and\ + \ determine the relationship between them.\n Respond with one of the following options:\ + \ 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}} \nHypothesis:\ + \ {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_yor_prompt_4 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_zul.yaml b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1b4a232e395e36f6180a81247b23b624efcdbd05 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_4/afrixnli_zul.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Zulu language.\nAnalyze the premise and hypothesis given in Zulu, and determine\ + \ the relationship between them.\n Respond with one of the following options: 'entailment',\ + \ 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}} \nHypothesis: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_zul_prompt_4 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_4/utils.py b/lm_eval/tasks/afrixnli/direct/prompt_4/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d97a0a288508e817ab695e637fb157a08c813808 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_4/utils.py @@ -0,0 +1,19 @@ +from lm_eval.utils import weighted_f1_score + + +def doc_to_text(doc): + output = """Please identify whether the premise entails or contradicts the hypothesis in the following premise + and hypothesis. The answer should be exact entailment, contradiction, or neutral. + + Premise: {premise} + Hypothesis: {hypothesis} + + Is it entailment, contradiction, or neutral?""" + + text = output.format(premise=doc["premise"], hypothesis=doc["hypothesis"]) + return text + + +def doc_to_target(doc): + replacements = {0: "entailment", 1: "neutral", 2: "contradiction"} + return replacements[doc["label"]] diff --git a/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_amh.yaml b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..70873a211527bec45adf7c689deef653eb3cfe07 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_amh.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. 
\nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_amh_prompt_5 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_eng.yaml b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..675264a8dc0da305ec2d92ff16a8393fd9bd0729 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_eng.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. \nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_eng_prompt_5 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_ewe.yaml b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7f60db0bffdd4ed7c367a10c6e365707a02348a4 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_ewe.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. \nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_ewe_prompt_5 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_fra.yaml b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2bb558dabcb62d4e7c49c54c100986703fdc88ad --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_fra.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: fra +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. 
\nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_fra_prompt_5 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_hau.yaml b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..728ae1b805f2ac9f014200ad59c82b6e822ca884 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_hau.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. \nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_hau_prompt_5 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_ibo.yaml b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3086b9b4f3c31121e089a8f0bc4c9e9ee4c1cc4a --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_ibo.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. \nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_ibo_prompt_5 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_kin.yaml b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..13a8845cf1ca22f4a25c79931d526a3305a2172c --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_kin.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. 
\nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_kin_prompt_5 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_lin.yaml b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a0250f29f300f3f0d312744b0c7a83bfbcc1bc55 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_lin.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: lin +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. \nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_lin_prompt_5 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_lug.yaml b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..928b74ce4fce73f952ac71999b4dbfc83c9632cb --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_lug.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. \nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_lug_prompt_5 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_orm.yaml b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f7f555db795996ac482deaae924db8af58e5c123 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_orm.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. 
\nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_orm_prompt_5 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_sna.yaml b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac0ef3007edbcdcaf6a705202565ec1d842889a0 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_sna.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. \nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_sna_prompt_5 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_sot.yaml b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..21fcdde5b66733a9f488c12b207545241b7ee7e6 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_sot.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: sot +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. \nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_sot_prompt_5 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_swa.yaml b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5d5824adcf3373864aa3ecf660952f667c648ea8 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_swa.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. 
\nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_swa_prompt_5 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_twi.yaml b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b519ef71eec7ddf91fc2e247021779edfec29145 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_twi.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. \nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_twi_prompt_5 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_wol.yaml b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a865c8b166b19e458c3ff68138f74e48f8ce6b60 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_wol.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. \nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_wol_prompt_5 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_xho.yaml b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1891bfd0592a8d3c6e2f0e98619bbeee234a852f --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_xho.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. 
\nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_xho_prompt_5 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_yaml b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_yaml new file mode 100644 index 0000000000000000000000000000000000000000..13e2b6ef7244d2689d7c56146aa15328e792c2fc --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_yaml @@ -0,0 +1,30 @@ +tag: + - afrixnli_tasks + - afrixnli_tasks_prompt_5 +dataset_path: masakhane/afrixnli +dataset_name: null +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_target: !function utils.doc_to_target +doc_to_choice: + - "true" + - "inconclusive" + - "false" +should_decontaminate: true +doc_to_decontamination_query: premise +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + average: weighted + higher_is_better: True + ignore_case: true + ignore_punctuation: true + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_yor.yaml b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4510441b606a8dbc0e635a00c3a009c4f891bd23 --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_yor.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. 
\nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_yor_prompt_5 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_zul.yaml b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2aa872b0410f56ea9e7ea19c4fb3d5adf93d323d --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_5/afrixnli_zul.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. \nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_yaml +task: afrixnli_zul_prompt_5 diff --git a/lm_eval/tasks/afrixnli/direct/prompt_5/utils.py b/lm_eval/tasks/afrixnli/direct/prompt_5/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6b9cb312b25a4c21bdd3d6a5e0a4e8e160451e4a --- /dev/null +++ b/lm_eval/tasks/afrixnli/direct/prompt_5/utils.py @@ -0,0 +1,6 @@ +from lm_eval.utils import weighted_f1_score + + +def doc_to_target(doc): + replacements = {0: "true", 1: "false", 2: "inconclusive"} + return replacements[doc["label"]] diff --git a/lm_eval/tasks/afrixnli/gen_utils.py b/lm_eval/tasks/afrixnli/gen_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..338b4f9daf9e0724b0d87a7f2182fac956e245ca --- /dev/null +++ b/lm_eval/tasks/afrixnli/gen_utils.py @@ -0,0 +1,129 @@ +import argparse +import os + +import yaml + + +class FunctionTag: + def __init__(self, value): + self.value = value + + +def prompt_func(mode, lang): + prompt_map = { + "prompt_1": "Please identify whether the premise entails or contradicts the hypothesis in the following premise " + "and hypothesis. 
The answer should be exact entailment, contradiction, or neutral.\n\nPremise: {premise}\nHypothesis: {hypothesis}\n\n" + "Is it entailment, contradiction, or neutral?", + "prompt_3": f"Given the following premise and hypothesis in {lang}, identify if the premise entails, contradicts, " + f"or is neutral towards the hypothesis. Please respond with exact 'entailment', 'contradiction', or 'neutral'. \n\n" + "Premise: {{premise}} \nHypothesis: {{hypothesis}}", + "prompt_4": f"You are an expert in Natural Language Inference (NLI) specializing in the {lang} language.\n" + f"Analyze the premise and hypothesis given in {lang}, and determine the relationship between them.\n " + f"Respond with one of the following options: 'entailment', 'contradiction', or 'neutral'. \n\n" + "Premise: {{premise}} \nHypothesis: {{hypothesis}}", + "prompt_5": "Based on the given statement, is the following claim 'true', 'false', or 'inconclusive'. \n" + "Statement: {{premise}} \nClaim: {{hypothesis}}", + } + return prompt_map[mode] + + +def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: + """ + Generate a yaml file for each language. + + :param output_dir: The directory to output the files to. + :param overwrite: Whether to overwrite files if they already exist. 
+ """ + err = [] + languages = { + "eng": "English", + "amh": "Amharic", + "ibo": "Igbo", + "fra": "French", + "sna": "chiShona", + "wol": "Wolof", + "ewe": "Ewe", + "lin": "Lingala", + "lug": "Luganda", + "xho": "isiXhosa", + "kin": "Kinyarwanda", + "twi": "Twi", + "zul": "Zulu", + "orm": "Oromo", + "yor": "Yoruba", + "hau": "Hausa", + "sot": "Sesotho", + "swa": "Swahili", + } + + for lang in languages.keys(): + try: + file_name = f"afrixnli_{lang}.yaml" + task_name = f"afrixnli_{lang}_{mode}" + yaml_template = "afrixnli_yaml" + if output_dir.split("/")[-1] == "translate": + file_name = f"afrixnli_translate_{lang}.yaml" + task_name = f"afrixnli_translate_{lang}_{mode}" + yaml_template = "afrixnli_translate_yaml" + if int(mode.split("_")[-1]) == 1 or int(mode.split("_")[-1]) > 2: + yaml_details = { + "include": yaml_template, + "task": task_name, + "dataset_name": lang, + "doc_to_text": prompt_func(mode, languages[lang]), + } + else: + yaml_details = { + "include": yaml_template, + "task": task_name, + "dataset_name": lang, + } + os.makedirs(f"{output_dir}/{mode}", exist_ok=True) + with open( + f"{output_dir}/{mode}/{file_name}", + "w" if overwrite else "x", + encoding="utf8", + ) as f: + f.write("# Generated by utils.py\n") + yaml.dump( + yaml_details, + f, + allow_unicode=True, + ) + except FileExistsError: + err.append(file_name) + + if len(err) > 0: + raise FileExistsError( + "Files were not created because they already exist (use --overwrite flag):" + f" {', '.join(err)}" + ) + + +def main() -> None: + """Parse CLI args and generate language-specific yaml files.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--overwrite", + default=True, + action="store_true", + help="Overwrite files if they already exist", + ) + parser.add_argument( + "--output-dir", + default="./translate", + help="Directory to write yaml files to", + ) + parser.add_argument( + "--mode", + default="prompt_5", + choices=["prompt_1", "prompt_2", "prompt_3", "prompt_4", 
"prompt_5"], + help="Prompt number", + ) + args = parser.parse_args() + + gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite, mode=args.mode) + + +if __name__ == "__main__": + main() diff --git a/lm_eval/tasks/afrixnli/translate/afrixnli_tt.yaml b/lm_eval/tasks/afrixnli/translate/afrixnli_tt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ba507b39d0320bc6307062fc3158a2f1d9212c84 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/afrixnli_tt.yaml @@ -0,0 +1,9 @@ +group: afrixnli_tt-irokobench +task: + - afrixnli_tt_tasks +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 2 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_amh.yaml b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..92ef8df7270120b2ead5d3ece0d9cffc2bfc1741 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_amh.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' 
+include: afrixnli_translate_yaml +task: afrixnli_translate_amh_prompt_1 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_ewe.yaml b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa32dd72a6bd7412eae5fb94ccb3d5af06402a1c --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_ewe.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' +include: afrixnli_translate_yaml +task: afrixnli_translate_ewe_prompt_1 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_fra.yaml b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3dc72af6f3dbcc41ab83a2adcf384b952e97a4d5 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_fra.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: fra +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' 
+include: afrixnli_translate_yaml +task: afrixnli_translate_fra_prompt_1 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_hau.yaml b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..77f22faf2789d56de00b4a226832e2cb3d401362 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_hau.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' +include: afrixnli_translate_yaml +task: afrixnli_translate_hau_prompt_1 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_ibo.yaml b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a7ac8793976d7722375f01f43f587b74e9654ec2 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_ibo.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' 
+include: afrixnli_translate_yaml +task: afrixnli_translate_ibo_prompt_1 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_kin.yaml b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3a52861402a8c3d65ccbe025fa62807d86e89b14 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_kin.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' +include: afrixnli_translate_yaml +task: afrixnli_translate_kin_prompt_1 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_lin.yaml b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eb2a667e864afb856ec85ecd6b300378b44f8050 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_lin.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: lin +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' 
+include: afrixnli_translate_yaml +task: afrixnli_translate_lin_prompt_1 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_lug.yaml b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bf45d957a3d4f169630b3c3405b348a0ceefe1b1 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_lug.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' +include: afrixnli_translate_yaml +task: afrixnli_translate_lug_prompt_1 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_orm.yaml b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..14b20a1c35ae63adddddbc4a0b8d4e1fba2c90b7 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_orm.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' 
+include: afrixnli_translate_yaml +task: afrixnli_translate_orm_prompt_1 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_sna.yaml b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..13103dd7a2f339f02b280dd3c67d8ec27807c86a --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_sna.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' +include: afrixnli_translate_yaml +task: afrixnli_translate_sna_prompt_1 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_sot.yaml b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..97cf3cba569f25219a7cdccb2be7a1c6effae0c9 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_sot.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: sot +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' 
+include: afrixnli_translate_yaml +task: afrixnli_translate_sot_prompt_1 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_swa.yaml b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..824bb17aa23f3a1a546d369a6d3118272ee05c54 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_swa.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' +include: afrixnli_translate_yaml +task: afrixnli_translate_swa_prompt_1 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_twi.yaml b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9d971c3e5fe6ca65e7568da81b647d2ff8f20696 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_twi.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' 
+include: afrixnli_translate_yaml +task: afrixnli_translate_twi_prompt_1 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_wol.yaml b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..877787a88c0e8ee0c2d7d87d97aeff2d91e1330f --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_wol.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' +include: afrixnli_translate_yaml +task: afrixnli_translate_wol_prompt_1 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_xho.yaml b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c907a2bf453d99c048822ad93feb12781004d2d0 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_xho.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' 
+include: afrixnli_translate_yaml +task: afrixnli_translate_xho_prompt_1 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_yaml b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_yaml new file mode 100644 index 0000000000000000000000000000000000000000..832b51493a0f17996ebca680f7151e38b59168d3 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_yaml @@ -0,0 +1,27 @@ +tag: afrixnli_tt_tasks +dataset_path: masakhane/afrixnli-translate-test +dataset_name: null +output_type: multiple_choice +test_split: test +fewshot_split: test +doc_to_target: !function utils.doc_to_target +doc_to_choice: + - "entailment" + - "neutral" + - "contradiction" +should_decontaminate: true +doc_to_decontamination_query: premise +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + average: weighted + higher_is_better: True + ignore_case: true + ignore_punctuation: true + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_yor.yaml b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c0ec7c91ff7d81a500bb1d335fe06406a3c94e0 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_yor.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' 
+include: afrixnli_translate_yaml +task: afrixnli_translate_yor_prompt_1 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_zul.yaml b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..78753d1fe3ef143c547780041e70cc03d20289f1 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_1/afrixnli_translate_zul.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: 'Please identify whether the premise entails or contradicts the hypothesis + in the following premise and hypothesis. The answer should be exact entailment, + contradiction, or neutral. + + + Premise: {premise} + + Hypothesis: {hypothesis} + + + Is it entailment, contradiction, or neutral?' +include: afrixnli_translate_yaml +task: afrixnli_translate_zul_prompt_1 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_1/utils.py b/lm_eval/tasks/afrixnli/translate/prompt_1/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d97a0a288508e817ab695e637fb157a08c813808 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_1/utils.py @@ -0,0 +1,19 @@ +from lm_eval.utils import weighted_f1_score + + +def doc_to_text(doc): + output = """Please identify whether the premise entails or contradicts the hypothesis in the following premise + and hypothesis. The answer should be exact entailment, contradiction, or neutral. 
+ + Premise: {premise} + Hypothesis: {hypothesis} + + Is it entailment, contradiction, or neutral?""" + + text = output.format(premise=doc["premise"], hypothesis=doc["hypothesis"]) + return text + + +def doc_to_target(doc): + replacements = {0: "entailment", 1: "neutral", 2: "contradiction"} + return replacements[doc["label"]] diff --git a/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_amh.yaml b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0810f6b37b9c83815fab4de50d3bc42b2c01624e --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: amh +include: afrixnli_translate_yaml +task: afrixnli_translate_amh_prompt_2 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_ewe.yaml b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d7aec16a61ab04415327addd832893ab6e4e531c --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_ewe.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ewe +include: afrixnli_translate_yaml +task: afrixnli_translate_ewe_prompt_2 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_fra.yaml b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f17a4ecf0a3fe044c04fff53ef61f5946bf744b1 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_fra.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fra +include: afrixnli_translate_yaml +task: afrixnli_translate_fra_prompt_2 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_hau.yaml b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_hau.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..688778c3195c54868f0f3d1d9f56e17c167205f9 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hau +include: afrixnli_translate_yaml +task: afrixnli_translate_hau_prompt_2 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_ibo.yaml b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5667b3d0624c569bf9e392a67780a80fdf5aee3b --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ibo +include: afrixnli_translate_yaml +task: afrixnli_translate_ibo_prompt_2 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_kin.yaml b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a74950cc34121f63d3dd944a310b7656bc2ff894 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kin +include: afrixnli_translate_yaml +task: afrixnli_translate_kin_prompt_2 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_lin.yaml b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..27e88a5b7664a515e7d6cc901962f201913670d6 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_lin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lin +include: afrixnli_translate_yaml +task: afrixnli_translate_lin_prompt_2 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_lug.yaml b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_lug.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..63ff988c3aea7178e06d85ca20fea40c87e9dbcc --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_lug.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: lug +include: afrixnli_translate_yaml +task: afrixnli_translate_lug_prompt_2 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_orm.yaml b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..db1a3ea1cbeff8ddd8c5b2ba49d3d553410c1b06 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_orm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: orm +include: afrixnli_translate_yaml +task: afrixnli_translate_orm_prompt_2 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_sna.yaml b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa110774a6fa86e29253307163e22c82f35a9ac0 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_sna.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sna +include: afrixnli_translate_yaml +task: afrixnli_translate_sna_prompt_2 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_sot.yaml b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3133308b2810a3a61e1037ebe58c02ba0da0a2ef --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_sot.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: sot +include: afrixnli_translate_yaml +task: afrixnli_translate_sot_prompt_2 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_swa.yaml b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_swa.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..926f91321cfcade46ca52492cba83daa348bc746 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swa +include: afrixnli_translate_yaml +task: afrixnli_translate_swa_prompt_2 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_twi.yaml b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c229de3dd2b40b902e7ef93de0cc06a6da04d8af --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: twi +include: afrixnli_translate_yaml +task: afrixnli_translate_twi_prompt_2 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_wol.yaml b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..87844c49639fa350e52877d967f64d5ea95cf28e --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_wol.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: wol +include: afrixnli_translate_yaml +task: afrixnli_translate_wol_prompt_2 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_xho.yaml b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..63fa3ffc1c69d89637dca73da5050c840593aa4a --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_xho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: xho +include: afrixnli_translate_yaml +task: afrixnli_translate_xho_prompt_2 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_yaml b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_yaml new file mode 100644 index 
0000000000000000000000000000000000000000..8ad87afcd99a5c875ccf1f7d2761afdea2a118aa --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_yaml @@ -0,0 +1,31 @@ +tag: afrixnli_tt_tasks +dataset_path: masakhane/afrixnli-translate-test +dataset_name: null +output_type: multiple_choice +test_split: test +fewshot_split: test +doc_to_text: "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither?\nAnswer:" +# True = entailment +# False = contradiction +# Neither = neutral +doc_to_target: !function utils.doc_to_target +doc_to_choice: + - "True" + - "Neither" + - "False" +should_decontaminate: true +doc_to_decontamination_query: premise +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + average: weighted + higher_is_better: True + ignore_case: true + ignore_punctuation: true + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_yor.yaml b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7dfc9bd6d46fb8982402dd85531d1d312a8a07b8 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: yor +include: afrixnli_translate_yaml +task: afrixnli_translate_yor_prompt_2 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_zul.yaml b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0878c4e03e985eced95c234fa079c3d92999a982 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_2/afrixnli_translate_zul.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: zul +include: afrixnli_translate_yaml +task: afrixnli_translate_zul_prompt_2 diff --git 
a/lm_eval/tasks/afrixnli/translate/prompt_2/utils.py b/lm_eval/tasks/afrixnli/translate/prompt_2/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5d1ac19e19b2e855c957e75f1c778366dfbc7e55 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_2/utils.py @@ -0,0 +1,6 @@ +from lm_eval.utils import weighted_f1_score + + +def doc_to_target(doc): + replacements = {0: "True", 1: "Neither", 2: "False"} + return replacements[doc["label"]] diff --git a/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_amh.yaml b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6fb06d0f1f7dc3c62e7c51a6395b1a79f1759878 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_amh.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: "Given the following premise and hypothesis in Amharic, identify if the\ + \ premise entails, contradicts, or is neutral towards the hypothesis. Please respond\ + \ with exact 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_amh_prompt_3 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_ewe.yaml b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0d550f9daff83651b4099d3ad4228de0afab6ac4 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_ewe.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_text: "Given the following premise and hypothesis in Ewe, identify if the premise\ + \ entails, contradicts, or is neutral towards the hypothesis. Please respond with\ + \ exact 'entailment', 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}} \n\ + Hypothesis: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_ewe_prompt_3 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_fra.yaml b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3156466c388362d06414b613e963f0f9fcb1465f --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_fra.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: fra +doc_to_text: "Given the following premise and hypothesis in French, identify if the\ + \ premise entails, contradicts, or is neutral towards the hypothesis. Please respond\ + \ with exact 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_fra_prompt_3 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_hau.yaml b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9ae3c21e2100b4b07428f67c23070d51d32761b0 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_hau.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: "Given the following premise and hypothesis in Hausa, identify if the\ + \ premise entails, contradicts, or is neutral towards the hypothesis. Please respond\ + \ with exact 'entailment', 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_hau_prompt_3 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_ibo.yaml b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..28696f337841ae738354ae11bf537f07743c49f5 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_ibo.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: "Given the following premise and hypothesis in Igbo, identify if the\ + \ premise entails, contradicts, or is neutral towards the hypothesis. Please respond\ + \ with exact 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_ibo_prompt_3 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_kin.yaml b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6981da83291b927f5fd5908f62e039227ded3a9e --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_kin.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: "Given the following premise and hypothesis in Kinyarwanda, identify\ + \ if the premise entails, contradicts, or is neutral towards the hypothesis. Please\ + \ respond with exact 'entailment', 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_kin_prompt_3 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_lin.yaml b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1984416f0a828871d5e29ca46bde91c300440feb --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_lin.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: lin +doc_to_text: "Given the following premise and hypothesis in Lingala, identify if the\ + \ premise entails, contradicts, or is neutral towards the hypothesis. Please respond\ + \ with exact 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_lin_prompt_3 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_lug.yaml b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..32a7ad2a161818c6d84b9002d6690b95cc86af3c --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_lug.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: "Given the following premise and hypothesis in Luganda, identify if the\ + \ premise entails, contradicts, or is neutral towards the hypothesis. Please respond\ + \ with exact 'entailment', 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_lug_prompt_3 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_orm.yaml b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d3923a80cdc78c59dabb63bb3fe9dda6e8e572a9 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_orm.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: "Given the following premise and hypothesis in Oromo, identify if the\ + \ premise entails, contradicts, or is neutral towards the hypothesis. Please respond\ + \ with exact 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_orm_prompt_3 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_sna.yaml b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d7dbf17e8b0520fc15bc6d6b337c959f828268ef --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_sna.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: "Given the following premise and hypothesis in chiShona, identify if\ + \ the premise entails, contradicts, or is neutral towards the hypothesis. Please\ + \ respond with exact 'entailment', 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_sna_prompt_3 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_sot.yaml b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c4e89ec90608180f281765d321c6c1b0220171e0 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_sot.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: sot +doc_to_text: "Given the following premise and hypothesis in Sesotho, identify if the\ + \ premise entails, contradicts, or is neutral towards the hypothesis. Please respond\ + \ with exact 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_sot_prompt_3 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_swa.yaml b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a3c5243b35264d5b26065a63c9c3babb8c714ec7 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_swa.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: "Given the following premise and hypothesis in Swahili, identify if the\ + \ premise entails, contradicts, or is neutral towards the hypothesis. Please respond\ + \ with exact 'entailment', 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_swa_prompt_3 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_twi.yaml b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e7e8568701be5958b2da080d5d6c0885e83bb370 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_twi.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: "Given the following premise and hypothesis in Twi, identify if the premise\ + \ entails, contradicts, or is neutral towards the hypothesis. Please respond with\ + \ exact 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}} \n\ + Hypothesis: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_twi_prompt_3 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_wol.yaml b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2cf0b08eff851ad0832adad093f6338222f3bc74 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_wol.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: "Given the following premise and hypothesis in Wolof, identify if the\ + \ premise entails, contradicts, or is neutral towards the hypothesis. Please respond\ + \ with exact 'entailment', 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_wol_prompt_3 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_xho.yaml b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a4dafa34f6cd5b69e115de88e04cd24b3473c5fd --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_xho.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: "Given the following premise and hypothesis in isiXhosa, identify if\ + \ the premise entails, contradicts, or is neutral towards the hypothesis. Please\ + \ respond with exact 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_xho_prompt_3 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_yaml b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_yaml new file mode 100644 index 0000000000000000000000000000000000000000..832b51493a0f17996ebca680f7151e38b59168d3 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_yaml @@ -0,0 +1,27 @@ +tag: afrixnli_tt_tasks +dataset_path: masakhane/afrixnli-translate-test +dataset_name: null +output_type: multiple_choice +test_split: test +fewshot_split: test +doc_to_target: !function utils.doc_to_target +doc_to_choice: + - "entailment" + - "neutral" + - "contradiction" +should_decontaminate: true +doc_to_decontamination_query: premise +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + average: weighted + higher_is_better: True + ignore_case: true + ignore_punctuation: true + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 1.0 diff --git 
a/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_yor.yaml b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f5c01ca54eaba74fd968ce7847f43b2fe4b373fd --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_yor.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: "Given the following premise and hypothesis in Yoruba, identify if the\ + \ premise entails, contradicts, or is neutral towards the hypothesis. Please respond\ + \ with exact 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_yor_prompt_3 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_zul.yaml b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fdbbec809a0b21cabc58defd6cbac15d0ea29ff2 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_3/afrixnli_translate_zul.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: "Given the following premise and hypothesis in Zulu, identify if the\ + \ premise entails, contradicts, or is neutral towards the hypothesis. Please respond\ + \ with exact 'entailment', 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_zul_prompt_3 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_3/utils.py b/lm_eval/tasks/afrixnli/translate/prompt_3/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c455a3045a9be8b7318b96e23d9f061add6a342e --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_3/utils.py @@ -0,0 +1,21 @@ +from lm_eval.utils import weighted_f1_score + + +def doc_to_text(doc): + output = """You are an NLP assistant whose purpose is to solve Natural Language Inference (NLI) problems + + Please identify whether the premise entails or contradicts the hypothesis in the following premise + and hypothesis. The answer should be exact entailment, contradiction, or neutral. + + Premise: {premise} + Hypothesis: {hypothesis} + + Is it entailment, contradiction, or neutral?""" + + text = output.format(premise=doc["premise"], hypothesis=doc["hypothesis"]) + return text + + +def doc_to_target(doc): + replacements = {0: "entailment", 1: "neutral", 2: "contradiction"} + return replacements[doc["label"]] diff --git a/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_amh.yaml b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5f972e7070ac1ed50b7ed177daa706d520e3a4f --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_amh.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Amharic language.\nAnalyze the premise and hypothesis given in Amharic, and\ + \ determine the relationship between them.\n Respond with one of the following options:\ + \ 'entailment', 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}} \nHypothesis:\ + \ {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_amh_prompt_4 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_ewe.yaml b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ebc775dd705e572f68835e194b6c3c8d745f6e1b --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_ewe.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Ewe language.\nAnalyze the premise and hypothesis given in Ewe, and determine\ + \ the relationship between them.\n Respond with one of the following options: 'entailment',\ + \ 'contradiction', or 'neutral'. \n\nPremise: {{premise}} \nHypothesis: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_ewe_prompt_4 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_fra.yaml b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2ad718c708ddb9c82c3e0ac35c7b805c0c364a64 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_fra.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: fra +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the French language.\nAnalyze the premise and hypothesis given in French, and\ + \ determine the relationship between them.\n Respond with one of the following options:\ + \ 'entailment', 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}} \nHypothesis:\ + \ {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_fra_prompt_4 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_hau.yaml b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dd65f366bc2de430404434c7918e3a0bb69aba03 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_hau.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Hausa language.\nAnalyze the premise and hypothesis given in Hausa, and determine\ + \ the relationship between them.\n Respond with one of the following options: 'entailment',\ + \ 'contradiction', or 'neutral'. \n\nPremise: {{premise}} \nHypothesis: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_hau_prompt_4 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_ibo.yaml b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..13df12642e743ceb4867a4d515ed2d20edce7486 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_ibo.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Igbo language.\nAnalyze the premise and hypothesis given in Igbo, and determine\ + \ the relationship between them.\n Respond with one of the following options: 'entailment',\ + \ 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}} \nHypothesis: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_ibo_prompt_4 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_kin.yaml b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..198d88750287ee0fc28507ddcdab0146b1c2f734 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_kin.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Kinyarwanda language.\nAnalyze the premise and hypothesis given in Kinyarwanda,\ + \ and determine the relationship between them.\n Respond with one of the following\ + \ options: 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_kin_prompt_4 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_lin.yaml b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b25856cfdcbb7b0c0df2aba81322d650585e9d9a --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_lin.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: lin +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Lingala language.\nAnalyze the premise and hypothesis given in Lingala, and\ + \ determine the relationship between them.\n Respond with one of the following options:\ + \ 'entailment', 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}} \nHypothesis:\ + \ {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_lin_prompt_4 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_lug.yaml b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..633c173c0b75903c81934391e1e9f07a8de9b7f7 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_lug.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Luganda language.\nAnalyze the premise and hypothesis given in Luganda, and\ + \ determine the relationship between them.\n Respond with one of the following options:\ + \ 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}} \nHypothesis:\ + \ {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_lug_prompt_4 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_orm.yaml b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e63f93eb89351af076593eff1623123301bbda9c --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_orm.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Oromo language.\nAnalyze the premise and hypothesis given in Oromo, and determine\ + \ the relationship between them.\n Respond with one of the following options: 'entailment',\ + \ 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}} \nHypothesis: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_orm_prompt_4 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_sna.yaml b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6fcb4e063159ab48e9e73423b895658a74cafa9e --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_sna.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the chiShona language.\nAnalyze the premise and hypothesis given in chiShona,\ + \ and determine the relationship between them.\n Respond with one of the following\ + \ options: 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_sna_prompt_4 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_sot.yaml b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..358e4b353eb73065bf91b5471127faf5b2f1675f --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_sot.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: sot +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Sesotho language.\nAnalyze the premise and hypothesis given in Sesotho, and\ + \ determine the relationship between them.\n Respond with one of the following options:\ + \ 'entailment', 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}} \nHypothesis:\ + \ {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_sot_prompt_4 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_swa.yaml b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8ce271ed77c52dbce949a6a83ccd1313d26c9b25 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_swa.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Swahili language.\nAnalyze the premise and hypothesis given in Swahili, and\ + \ determine the relationship between them.\n Respond with one of the following options:\ + \ 'entailment', 'contradiction', or 'neutral'. \n\nPremise: {{premise}} \nHypothesis:\ + \ {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_swa_prompt_4 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_twi.yaml b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8171e0daa98aa93614800f3c77a42d2b699ee2a7 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_twi.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Twi language.\nAnalyze the premise and hypothesis given in Twi, and determine\ + \ the relationship between them.\n Respond with one of the following options: 'entailment',\ + \ 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}} \nHypothesis: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_twi_prompt_4 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_wol.yaml b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b2662662dff2263023dd2bce1afeb86bb09ce262 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_wol.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Wolof language.\nAnalyze the premise and hypothesis given in Wolof, and determine\ + \ the relationship between them.\n Respond with one of the following options: 'entailment',\ + \ 'contradiction', or 'neutral'. \n\nPremise: {{premise}} \nHypothesis: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_wol_prompt_4 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_xho.yaml b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5aa3a9d171a4dabe831d5a6126ca61384d218d81 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_xho.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the isiXhosa language.\nAnalyze the premise and hypothesis given in isiXhosa,\ + \ and determine the relationship between them.\n Respond with one of the following\ + \ options: 'entailment', 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}}\ + \ \nHypothesis: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_xho_prompt_4 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_yaml b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_yaml new file mode 100644 index 0000000000000000000000000000000000000000..832b51493a0f17996ebca680f7151e38b59168d3 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_yaml @@ -0,0 +1,27 @@ +tag: afrixnli_tt_tasks +dataset_path: masakhane/afrixnli-translate-test +dataset_name: null +output_type: multiple_choice +test_split: test +fewshot_split: test +doc_to_target: !function utils.doc_to_target +doc_to_choice: + - "entailment" + - "neutral" + - "contradiction" +should_decontaminate: true +doc_to_decontamination_query: premise +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + average: weighted + higher_is_better: True + ignore_case: true + ignore_punctuation: true + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_yor.yaml b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..478e5043431c73fe2448474b593ae02002d6a722 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_yor.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Yoruba language.\nAnalyze the premise and hypothesis given in Yoruba, and\ + \ determine the relationship between them.\n Respond with one of the following options:\ + \ 'entailment', 'contradiction', or 'neutral'. 
\n\nPremise: {{premise}} \nHypothesis:\ + \ {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_yor_prompt_4 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_zul.yaml b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c0dc06e6bce0aae18769cd2259aee6699be3b0bd --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_4/afrixnli_translate_zul.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: "You are an expert in Natural Language Inference (NLI) specializing in\ + \ the Zulu language.\nAnalyze the premise and hypothesis given in Zulu, and determine\ + \ the relationship between them.\n Respond with one of the following options: 'entailment',\ + \ 'contradiction', or 'neutral'. \n\nPremise: {{premise}} \nHypothesis: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_zul_prompt_4 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_4/utils.py b/lm_eval/tasks/afrixnli/translate/prompt_4/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d97a0a288508e817ab695e637fb157a08c813808 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_4/utils.py @@ -0,0 +1,19 @@ +from lm_eval.utils import weighted_f1_score + + +def doc_to_text(doc): + output = """Please identify whether the premise entails or contradicts the hypothesis in the following premise + and hypothesis. The answer should be exact entailment, contradiction, or neutral. 
+ + Premise: {premise} + Hypothesis: {hypothesis} + + Is it entailment, contradiction, or neutral?""" + + text = output.format(premise=doc["premise"], hypothesis=doc["hypothesis"]) + return text + + +def doc_to_target(doc): + replacements = {0: "entailment", 1: "neutral", 2: "contradiction"} + return replacements[doc["label"]] diff --git a/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_amh.yaml b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3079712ce60b0b9b7e5846d3e1d9b16383c8cf97 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_amh.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. \nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_amh_prompt_5 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_ewe.yaml b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6eb452db53361c5c048e75a807de20b3528414ec --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_ewe.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. 
\nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_ewe_prompt_5 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_fra.yaml b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d6ddf49332ba4ec1671a7a20e036e7d4906c2097 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_fra.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: fra +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. \nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_fra_prompt_5 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_hau.yaml b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..09d182f7f12e9debba53ed9bd5b1249c38b63a53 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_hau.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. \nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_hau_prompt_5 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_ibo.yaml b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5bf1555a454879018ee16c3ed60dd1f71cbdbbe --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_ibo.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. 
\nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_ibo_prompt_5 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_kin.yaml b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f0cbe9c2c78612111af3ce29e4a8b846879f3060 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_kin.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. \nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_kin_prompt_5 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_lin.yaml b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..159116be57762adddff7368d1731a867a8daa152 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_lin.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: lin +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. \nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_lin_prompt_5 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_lug.yaml b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9448fa28cb2561631dfa78fbeccb4bc054c867f6 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_lug.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. 
\nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_lug_prompt_5 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_orm.yaml b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..64621cb4b784c2e4c16bfe220acb69d7ea17cb8f --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_orm.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. \nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_orm_prompt_5 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_sna.yaml b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..788bae3068b2b40ed1ae4419a8ddbfc9094fe71c --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_sna.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. \nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_sna_prompt_5 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_sot.yaml b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..617dd9f88db6d283eef224126b36a0fcf2e35158 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_sot.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: sot +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. 
\nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_sot_prompt_5 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_swa.yaml b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..81a159252ca45302bf5c88c448489a76bd342270 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_swa.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. \nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_swa_prompt_5 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_twi.yaml b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cb9f115fb3c01ea8a509a7347f6a369ae1f9c819 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_twi.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. \nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_twi_prompt_5 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_wol.yaml b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a5f4eb0c2eeaf10d04b2978cf3ffb821a7123889 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_wol.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. 
\nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_wol_prompt_5 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_xho.yaml b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d085919b777d18fff97397bdd32d1c0cf2c1c316 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_xho.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. \nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_xho_prompt_5 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_yaml b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_yaml new file mode 100644 index 0000000000000000000000000000000000000000..3047238439e371f99664aed214a6589b33528e66 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_yaml @@ -0,0 +1,27 @@ +tag: afrixnli_tt_tasks +dataset_path: masakhane/afrixnli-translate-test +dataset_name: null +output_type: multiple_choice +test_split: test +fewshot_split: test +doc_to_target: !function utils.doc_to_target +doc_to_choice: + - "true" + - "inconclusive" + - "false" +should_decontaminate: true +doc_to_decontamination_query: premise +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + average: weighted + higher_is_better: True + ignore_case: true + ignore_punctuation: true + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_yor.yaml b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_yor.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..107c663428d39e3eaa565315a40e4aa5f4b53201 --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_yor.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. \nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_yor_prompt_5 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_zul.yaml b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d963646034069b77a78fe5284b106b6f74718a6a --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_5/afrixnli_translate_zul.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: "Based on the given statement, is the following claim 'true', 'false',\ + \ or 'inconclusive'. \nStatement: {{premise}} \nClaim: {{hypothesis}}" +include: afrixnli_translate_yaml +task: afrixnli_translate_zul_prompt_5 diff --git a/lm_eval/tasks/afrixnli/translate/prompt_5/utils.py b/lm_eval/tasks/afrixnli/translate/prompt_5/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6b9cb312b25a4c21bdd3d6a5e0a4e8e160451e4a --- /dev/null +++ b/lm_eval/tasks/afrixnli/translate/prompt_5/utils.py @@ -0,0 +1,6 @@ +from lm_eval.utils import weighted_f1_score + + +def doc_to_target(doc): + replacements = {0: "true", 1: "false", 2: "inconclusive"} + return replacements[doc["label"]] diff --git a/lm_eval/tasks/afrobench/README.md b/lm_eval/tasks/afrobench/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a6ab3ceef1b37e94f1c191d1931648b7b669a49e --- /dev/null +++ b/lm_eval/tasks/afrobench/README.md @@ -0,0 +1,72 @@ +# AfroBench + +### Paper + +Title: `AfroBench: How Good are Large Language Models on African Languages?` + +Paper Link: 
https://arxiv.org/abs/2311.07978 + +## Abstract +> Large-scale multilingual evaluations, such as MEGA, often include only a handful of African languages due to the scarcity of high-quality evaluation data and the limited discoverability of existing African datasets. This lack of representation hinders comprehensive LLM evaluation across a diverse range of languages and tasks. To address these challenges, we introduce AfroBench -- a multi-task benchmark for evaluating the performance of LLMs across 64 African languages, 15 tasks and 22 datasets. AfroBench consists of nine natural language understanding datasets, six text generation datasets, six knowledge and question answering tasks, and one mathematical reasoning task. We present results comparing the performance of prompting LLMs to fine-tuned baselines based on BERT and T5-style models. Our results suggest large gaps in performance between high-resource languages, such as English, and African languages across most tasks; but performance also varies based on the availability of monolingual data resources. Our findings confirm that performance on African languages continues to remain a hurdle for current LLMs, underscoring the need for additional efforts to close this gap. + +HomePage: https://mcgill-nlp.github.io/AfroBench/ + +### Groups, and Tasks +#### Groups +* `afrobench` : Runs all that tasks, datasets and prompts in this benchmark +* `afrobench_lite`: Runs the lite version of the benchmark which includes; afrimgsm, afrimmlu, afrixnli, sib, intent, adr and flores + +Dataset specific grouping that listing all prompts, allowing users to review or edit them. 
+* `adr` `afrihate` `afrisenti` `belebele` `african_flores` `injongointent` `mafand` `masakhaner` `masakhapos` `naijarc` `nollysenti` `african_ntrex` `openai_mmlu` `salt` `sib` `uhura` `xlsum` + + +#### Task Tags +* `adr_tasks`: all datasets in this benchmark relating to Automatic Diacritics Restoration task +* `afrihate_tasks`: all datasets in this benchmark relating to Hate Speech detection task +* `afrimgsm_tasks`: all datasets in this benchmark relating to Mathematical reasoning task +* `afrixnli_tasks`: all datasets in this benchmark relating to Natural Language Inference task +* `afrobench_xqa_tasks`: all datasets in this benchmark relating to Crosslingual QA (XQA) task +* `afrobench_sentiment_tasks`: all datasets in this benchmark relating to Sentiment Classification task +* `afrobench_MT_tasks`: all datasets in this benchmark relating to Machine Translation task +* `afrobench_TC_tasks`: all datasets in this benchmark relating to Topic Classification task +* `afrobench_mmlu_tasks`: all datasets in this benchmark relating to MMLU task +* `injongointent_tasks`: all datasets in this benchmark relating to Intent Detection task +* `masakhaner_tasks`: all datasets in this benchmark relating to Named Entity Recognition (NER) task +* `masakhapos_tasks`: all datasets in this benchmark relating to Part of Speech Tagging (POS) task +* `RC_tasks`: all datasets in this benchmark relating to Reading Comprehension task +* `uhura_arc_easy_tasks`: all datasets in this benchmark relating to Arc-Easy (XQA) task +* `xlsum_tasks`: all datasets in this benchmark relating to Summarization task + + +We've included sample run scripts for easier integration with the benchmark: [sample run scripts](./sample_run_scripts) + +For better understanding of the run interface see [interface.md](../../../docs/interface.md) + +All dataset used in this benchmark are available at [huggingface](https://huggingface.co/collections/masakhane/afrobench-67dbf553ebf5701c2207f883) + +### Citation + +``` 
+@misc{ojo2025afrobenchgoodlargelanguage, + title={AfroBench: How Good are Large Language Models on African Languages?}, + author={Jessica Ojo and Odunayo Ogundepo and Akintunde Oladipo and Kelechi Ogueji and Jimmy Lin and Pontus Stenetorp and David Ifeoluwa Adelani}, + year={2025}, + eprint={2311.07978}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2311.07978}, +} +``` +Please cite datasets used. Citations for individual datasets are included in their respective repository readme files within this benchmark. +### Checklist + +For adding novel benchmarks/datasets to the library: +* [ ] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? The original paper doesn't have an associated implementation, but there is an official entry in [BigBench](https://github.com/google/BIG-bench/tree/main/bigbench/benchmark_tasks/social_iqa). I use the same prompting format as BigBench. + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/lm_eval/tasks/afrobench/adr/README.md b/lm_eval/tasks/afrobench/adr/README.md new file mode 100644 index 0000000000000000000000000000000000000000..cb09567dcd2f461e3adba531bddd570c21ebfdf5 --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/README.md @@ -0,0 +1,7 @@ +# Automatic Diacritics Restoration (ADR) + +Automatic Diacritics Restoration (ADR) is the task of restoring diacritical marks in text where they have been omitted or removed. 
+This process is essential for languages where diacritics alter pronunciation, meaning, or grammatical structure. +ADR requires the model to have a deep understanding of linguistic context, syntax, and semantics to accurately predict and reinsert the appropriate diacritics. + +As part of this benchmark project, we utilise the mafand dataset to curate a dataset specifically for ADR. We focus on five languages: Gbomola, Fon, Igbo, Wolof, and Yoruba. diff --git a/lm_eval/tasks/afrobench/adr/afridiacritics.yaml b/lm_eval/tasks/afrobench/adr/afridiacritics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..34d60eef66acd9829ebb6e60ca6b85e8616a32d8 --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/afridiacritics.yaml @@ -0,0 +1,13 @@ +group: adr +task: + - adr_prompt_1 + - adr_prompt_2 + - adr_prompt_3 + - adr_prompt_4 + - adr_prompt_5 +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 1 diff --git a/lm_eval/tasks/afrobench/adr/gen_utils.py b/lm_eval/tasks/afrobench/adr/gen_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ff6e63e3456abf809a1068f4abeea8ac93b49e94 --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/gen_utils.py @@ -0,0 +1,105 @@ +import argparse +import os + +import yaml + + +class FunctionTag: + def __init__(self, value): + self.value = value + + +def prompt_func(mode, lang): + prompt_map = { + "prompt_1": "Please restore the missing diacritics in the following sentence: {{text}}. Return output sentence only", + "prompt_2": "Given a sentence without diacritics, add the appropriate diacritics to make it grammatically " + "and semantically correct. \nSentence: {{text}}. Return output sentence only", + "prompt_3": f"This text is in {lang}. Restore all diacritical marks to their proper places in the " + "following sentence: {{text}}. Return output sentence only", + "prompt_4": f"You are a linguist specializing in diacritical marks for {lang}. 
" + f"Add the appropriate diacritics to this {lang} sentence: " + "{{text}}. Return output sentence only", + "prompt_5": f"You are a linguist specializing in diacritical marks for {lang}. Diacritics are essential for " + f"proper pronunciation and meaning in {lang}. You are tasked with converting {lang} sentences " + "without diacritics into their correctly accented forms. Here's the input: {{text}}. " + "Return output sentence only", + } + return prompt_map[mode] + + +def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: + """ + Generate a yaml file for each language. + + :param output_dir: The directory to output the files to. + :param overwrite: Whether to overwrite files if they already exist. + """ + err = [] + languages = { + "fon": "Fon", + "bbj": "Gbomala", + "ibo": "Igbo", + "wol": "Wolof", + "yor": "Yoruba", + } + + for lang in languages.keys(): + try: + file_name = f"afridiacritics_{lang}.yaml" + task_name = f"afridiacritics_{lang}_{mode}" + yaml_template = "afridiacritics_yaml" + yaml_details = { + "include": yaml_template, + "task": task_name, + "dataset_name": lang, + "doc_to_text": prompt_func(mode, languages[lang]), + } + os.makedirs(f"{output_dir}/{mode}", exist_ok=True) + with open( + f"{output_dir}/{mode}/{file_name}", + "w" if overwrite else "x", + encoding="utf8", + ) as f: + f.write("# Generated by utils.py\n") + yaml.dump( + yaml_details, + f, + allow_unicode=True, + ) + except FileExistsError: + err.append(file_name) + + if len(err) > 0: + raise FileExistsError( + "Files were not created because they already exist (use --overwrite flag):" + f" {', '.join(err)}" + ) + + +def main() -> None: + """Parse CLI args and generate language-specific yaml files.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--overwrite", + default=True, + action="store_true", + help="Overwrite files if they already exist", + ) + parser.add_argument( + "--output-dir", + default="./", + help="Directory to write yaml files to", + ) + 
parser.add_argument( + "--mode", + default="prompt_1", + choices=["prompt_1", "prompt_2", "prompt_3", "prompt_4", "prompt_5"], + help="Prompt number", + ) + args = parser.parse_args() + + gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite, mode=args.mode) + + +if __name__ == "__main__": + main() diff --git a/lm_eval/tasks/afrobench/adr/prompt_1/afridiacritics_bbj.yaml b/lm_eval/tasks/afrobench/adr/prompt_1/afridiacritics_bbj.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f3eb26ebae6f723c03591aa73eb29f2256fb0e4a --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_1/afridiacritics_bbj.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: bbj +doc_to_text: 'Please restore the missing diacritics in the following sentence: {{text}}. + Return output sentence only' +include: afridiacritics_yaml +task: afridiacritics_bbj_prompt_1 diff --git a/lm_eval/tasks/afrobench/adr/prompt_1/afridiacritics_fon.yaml b/lm_eval/tasks/afrobench/adr/prompt_1/afridiacritics_fon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..874832d5d00799deb9235d2f04960684f2b91770 --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_1/afridiacritics_fon.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: fon +doc_to_text: 'Please restore the missing diacritics in the following sentence: {{text}}. + Return output sentence only' +include: afridiacritics_yaml +task: afridiacritics_fon_prompt_1 diff --git a/lm_eval/tasks/afrobench/adr/prompt_1/afridiacritics_ibo.yaml b/lm_eval/tasks/afrobench/adr/prompt_1/afridiacritics_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..983bc3914c421f46fa1adbdd67c15c433996a584 --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_1/afridiacritics_ibo.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: 'Please restore the missing diacritics in the following sentence: {{text}}. 
+ Return output sentence only' +include: afridiacritics_yaml +task: afridiacritics_ibo_prompt_1 diff --git a/lm_eval/tasks/afrobench/adr/prompt_1/afridiacritics_wol.yaml b/lm_eval/tasks/afrobench/adr/prompt_1/afridiacritics_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9067770a3dac5fc4aab23cdad2d15ede76b82de4 --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_1/afridiacritics_wol.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: 'Please restore the missing diacritics in the following sentence: {{text}}. + Return output sentence only' +include: afridiacritics_yaml +task: afridiacritics_wol_prompt_1 diff --git a/lm_eval/tasks/afrobench/adr/prompt_1/afridiacritics_yaml b/lm_eval/tasks/afrobench/adr/prompt_1/afridiacritics_yaml new file mode 100644 index 0000000000000000000000000000000000000000..53cebaee05c9e7a65779ad12faaa0a9ee40c7c8b --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_1/afridiacritics_yaml @@ -0,0 +1,25 @@ +tag: +- adr_tasks +- adr_prompt_1 +dataset_path: masakhane/diacritics-restoration +dataset_kwargs: {trust_remote_code: True} +doc_to_target: target +output_type: generate_until +fewshot_split: dev +test_split: test +training_split: train +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + do_sample: false + until: + - '' + - + - <|im_end|> +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/adr/prompt_1/afridiacritics_yor.yaml b/lm_eval/tasks/afrobench/adr/prompt_1/afridiacritics_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8e98af10abec2009d32b112694923f45c17473af --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_1/afridiacritics_yor.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: 'Please restore the missing diacritics in the following sentence: {{text}}. 
+ Return output sentence only' +include: afridiacritics_yaml +task: afridiacritics_yor_prompt_1 diff --git a/lm_eval/tasks/afrobench/adr/prompt_2/afridiacritics_bbj.yaml b/lm_eval/tasks/afrobench/adr/prompt_2/afridiacritics_bbj.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0f054eea4c29da978b830d3a5eb2571af364f920 --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_2/afridiacritics_bbj.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: bbj +doc_to_text: "Given a sentence without diacritics, add the appropriate diacritics\ + \ to make it grammatically and semantically correct. \nSentence: {{text}}. Return\ + \ output sentence only" +include: afridiacritics_yaml +task: afridiacritics_bbj_prompt_2 diff --git a/lm_eval/tasks/afrobench/adr/prompt_2/afridiacritics_fon.yaml b/lm_eval/tasks/afrobench/adr/prompt_2/afridiacritics_fon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..07f7114649ff8f362a0a2072995724290c5224bd --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_2/afridiacritics_fon.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: fon +doc_to_text: "Given a sentence without diacritics, add the appropriate diacritics\ + \ to make it grammatically and semantically correct. \nSentence: {{text}}. Return\ + \ output sentence only" +include: afridiacritics_yaml +task: afridiacritics_fon_prompt_2 diff --git a/lm_eval/tasks/afrobench/adr/prompt_2/afridiacritics_ibo.yaml b/lm_eval/tasks/afrobench/adr/prompt_2/afridiacritics_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c1ebac101f9a69722b21bbfe65ccd224f811e8d3 --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_2/afridiacritics_ibo.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: "Given a sentence without diacritics, add the appropriate diacritics\ + \ to make it grammatically and semantically correct. \nSentence: {{text}}. 
Return\ + \ output sentence only" +include: afridiacritics_yaml +task: afridiacritics_ibo_prompt_2 diff --git a/lm_eval/tasks/afrobench/adr/prompt_2/afridiacritics_wol.yaml b/lm_eval/tasks/afrobench/adr/prompt_2/afridiacritics_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b8448d6ffce9f67252621cf6085fc575dace588e --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_2/afridiacritics_wol.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: "Given a sentence without diacritics, add the appropriate diacritics\ + \ to make it grammatically and semantically correct. \nSentence: {{text}}. Return\ + \ output sentence only" +include: afridiacritics_yaml +task: afridiacritics_wol_prompt_2 diff --git a/lm_eval/tasks/afrobench/adr/prompt_2/afridiacritics_yaml b/lm_eval/tasks/afrobench/adr/prompt_2/afridiacritics_yaml new file mode 100644 index 0000000000000000000000000000000000000000..a0cc722d890f6a64939417f39f860532c4cd342b --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_2/afridiacritics_yaml @@ -0,0 +1,25 @@ +tag: +- adr_tasks +- adr_prompt_2 +dataset_path: masakhane/diacritics-restoration +dataset_kwargs: {trust_remote_code: True} +doc_to_target: target +output_type: generate_until +fewshot_split: dev +test_split: test +training_split: train +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + do_sample: false + until: + - '' + - + - <|im_end|> +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/adr/prompt_2/afridiacritics_yor.yaml b/lm_eval/tasks/afrobench/adr/prompt_2/afridiacritics_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eb95f5e234add5c178efc181bddb1fc87f9ce19d --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_2/afridiacritics_yor.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: "Given a sentence without diacritics, 
add the appropriate diacritics\ + \ to make it grammatically and semantically correct. \nSentence: {{text}}. Return\ + \ output sentence only" +include: afridiacritics_yaml +task: afridiacritics_yor_prompt_2 diff --git a/lm_eval/tasks/afrobench/adr/prompt_3/afridiacritics_bbj.yaml b/lm_eval/tasks/afrobench/adr/prompt_3/afridiacritics_bbj.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a50b40c535d778cb4bd564455fbfdcf43415a53d --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_3/afridiacritics_bbj.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: bbj +doc_to_text: 'This text is in Gbomala. Restore all diacritical marks to their proper + places in the following sentence: {{text}}. Return output sentence only' +include: afridiacritics_yaml +task: afridiacritics_bbj_prompt_3 diff --git a/lm_eval/tasks/afrobench/adr/prompt_3/afridiacritics_fon.yaml b/lm_eval/tasks/afrobench/adr/prompt_3/afridiacritics_fon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5b0909ce9dd69f46cdca75ebdc325f452b25a462 --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_3/afridiacritics_fon.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: fon +doc_to_text: 'This text is in Fon. Restore all diacritical marks to their proper places + in the following sentence: {{text}}. Return output sentence only' +include: afridiacritics_yaml +task: afridiacritics_fon_prompt_3 diff --git a/lm_eval/tasks/afrobench/adr/prompt_3/afridiacritics_ibo.yaml b/lm_eval/tasks/afrobench/adr/prompt_3/afridiacritics_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..04d1df0e1f07ac7c082bd75b1ce93959e0e0d56d --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_3/afridiacritics_ibo.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: 'This text is in Igbo. Restore all diacritical marks to their proper + places in the following sentence: {{text}}. 
Return output sentence only' +include: afridiacritics_yaml +task: afridiacritics_ibo_prompt_3 diff --git a/lm_eval/tasks/afrobench/adr/prompt_3/afridiacritics_wol.yaml b/lm_eval/tasks/afrobench/adr/prompt_3/afridiacritics_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..576e0845188b523be7ec2f342a440174aa496263 --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_3/afridiacritics_wol.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: 'This text is in Wolof. Restore all diacritical marks to their proper + places in the following sentence: {{text}}. Return output sentence only' +include: afridiacritics_yaml +task: afridiacritics_wol_prompt_3 diff --git a/lm_eval/tasks/afrobench/adr/prompt_3/afridiacritics_yaml b/lm_eval/tasks/afrobench/adr/prompt_3/afridiacritics_yaml new file mode 100644 index 0000000000000000000000000000000000000000..0a27eeef2d37880527c7b99f1fa9296f843b72a0 --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_3/afridiacritics_yaml @@ -0,0 +1,25 @@ +tag: +- adr_tasks +- adr_prompt_3 +dataset_path: masakhane/diacritics-restoration +dataset_kwargs: {trust_remote_code: True} +doc_to_target: target +output_type: generate_until +fewshot_split: dev +test_split: test +training_split: train +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + do_sample: false + until: + - '' + - + - <|im_end|> +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/adr/prompt_3/afridiacritics_yor.yaml b/lm_eval/tasks/afrobench/adr/prompt_3/afridiacritics_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..169c110872d6fdc2d2d41b6472fe30d93934f5df --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_3/afridiacritics_yor.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: 'This text is in Yoruba. 
Restore all diacritical marks to their proper + places in the following sentence: {{text}}. Return output sentence only' +include: afridiacritics_yaml +task: afridiacritics_yor_prompt_3 diff --git a/lm_eval/tasks/afrobench/adr/prompt_4/afridiacritics_bbj.yaml b/lm_eval/tasks/afrobench/adr/prompt_4/afridiacritics_bbj.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7a807b09ee3000f022374e31e61cdb2f2e091f0e --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_4/afridiacritics_bbj.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: bbj +doc_to_text: 'You are a linguist specializing in diacritical marks for Gbomala. Add + the appropriate diacritics to this Gbomala sentence: {{text}}. Return output sentence + only' +include: afridiacritics_yaml +task: afridiacritics_bbj_prompt_4 diff --git a/lm_eval/tasks/afrobench/adr/prompt_4/afridiacritics_fon.yaml b/lm_eval/tasks/afrobench/adr/prompt_4/afridiacritics_fon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..11076e685ae5f6a4435a486d3f35db269ded8f51 --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_4/afridiacritics_fon.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: fon +doc_to_text: 'You are a linguist specializing in diacritical marks for Fon. Add the + appropriate diacritics to this Fon sentence: {{text}}. Return output sentence only' +include: afridiacritics_yaml +task: afridiacritics_fon_prompt_4 diff --git a/lm_eval/tasks/afrobench/adr/prompt_4/afridiacritics_ibo.yaml b/lm_eval/tasks/afrobench/adr/prompt_4/afridiacritics_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..367e387ae7456f57d604b1fe3ac032084b16fb98 --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_4/afridiacritics_ibo.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: 'You are a linguist specializing in diacritical marks for Igbo. Add the + appropriate diacritics to this Igbo sentence: {{text}}. 
Return output sentence only' +include: afridiacritics_yaml +task: afridiacritics_ibo_prompt_4 diff --git a/lm_eval/tasks/afrobench/adr/prompt_4/afridiacritics_wol.yaml b/lm_eval/tasks/afrobench/adr/prompt_4/afridiacritics_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..23fb81e754445e8745d5d67720707af7d502e3df --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_4/afridiacritics_wol.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: 'You are a linguist specializing in diacritical marks for Wolof. Add + the appropriate diacritics to this Wolof sentence: {{text}}. Return output sentence + only' +include: afridiacritics_yaml +task: afridiacritics_wol_prompt_4 diff --git a/lm_eval/tasks/afrobench/adr/prompt_4/afridiacritics_yaml b/lm_eval/tasks/afrobench/adr/prompt_4/afridiacritics_yaml new file mode 100644 index 0000000000000000000000000000000000000000..6ae62e9d3384d3ee1bff044dbfd1cb23275ae517 --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_4/afridiacritics_yaml @@ -0,0 +1,25 @@ +tag: +- adr_tasks +- adr_prompt_4 +dataset_path: masakhane/diacritics-restoration +dataset_kwargs: {trust_remote_code: True} +doc_to_target: target +output_type: generate_until +fewshot_split: dev +test_split: test +training_split: train +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + do_sample: false + until: + - '' + - + - <|im_end|> +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/adr/prompt_4/afridiacritics_yor.yaml b/lm_eval/tasks/afrobench/adr/prompt_4/afridiacritics_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..21e3a53fefcb4ae41eb00a406a3319f14ed60aba --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_4/afridiacritics_yor.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: 'You are a linguist specializing in diacritical marks for 
Yoruba. Add + the appropriate diacritics to this Yoruba sentence: {{text}}. Return output sentence + only' +include: afridiacritics_yaml +task: afridiacritics_yor_prompt_4 diff --git a/lm_eval/tasks/afrobench/adr/prompt_5/afridiacritics_bbj.yaml b/lm_eval/tasks/afrobench/adr/prompt_5/afridiacritics_bbj.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b1bcc833c73d0a789700c1b50b8636163620ed27 --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_5/afridiacritics_bbj.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: bbj +doc_to_text: 'You are a linguist specializing in diacritical marks for Gbomala. Diacritics + are essential for proper pronunciation and meaning in Gbomala. You are tasked with + converting Gbomala sentences without diacritics into their correctly accented forms. + Here''s the input: {{text}}. Return output sentence only' +include: afridiacritics_yaml +task: afridiacritics_bbj_prompt_5 diff --git a/lm_eval/tasks/afrobench/adr/prompt_5/afridiacritics_fon.yaml b/lm_eval/tasks/afrobench/adr/prompt_5/afridiacritics_fon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3a1c55f813b4c4b7d08daff74cf32040b85e2b35 --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_5/afridiacritics_fon.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: fon +doc_to_text: 'You are a linguist specializing in diacritical marks for Fon. Diacritics + are essential for proper pronunciation and meaning in Fon. You are tasked with converting + Fon sentences without diacritics into their correctly accented forms. Here''s the + input: {{text}}. 
Return output sentence only' +include: afridiacritics_yaml +task: afridiacritics_fon_prompt_5 diff --git a/lm_eval/tasks/afrobench/adr/prompt_5/afridiacritics_ibo.yaml b/lm_eval/tasks/afrobench/adr/prompt_5/afridiacritics_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6cc9865dca7a2d8889df88d73abfb54b615089f4 --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_5/afridiacritics_ibo.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: 'You are a linguist specializing in diacritical marks for Igbo. Diacritics + are essential for proper pronunciation and meaning in Igbo. You are tasked with + converting Igbo sentences without diacritics into their correctly accented forms. + Here''s the input: {{text}}. Return output sentence only' +include: afridiacritics_yaml +task: afridiacritics_ibo_prompt_5 diff --git a/lm_eval/tasks/afrobench/adr/prompt_5/afridiacritics_wol.yaml b/lm_eval/tasks/afrobench/adr/prompt_5/afridiacritics_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fed10a7031ac71a948720b15eff1677df411934c --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_5/afridiacritics_wol.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: 'You are a linguist specializing in diacritical marks for Wolof. Diacritics + are essential for proper pronunciation and meaning in Wolof. You are tasked with + converting Wolof sentences without diacritics into their correctly accented forms. + Here''s the input: {{text}}. 
Return output sentence only' +include: afridiacritics_yaml +task: afridiacritics_wol_prompt_5 diff --git a/lm_eval/tasks/afrobench/adr/prompt_5/afridiacritics_yaml b/lm_eval/tasks/afrobench/adr/prompt_5/afridiacritics_yaml new file mode 100644 index 0000000000000000000000000000000000000000..aaad3306e7270e78cdd2f83dd8ffeb790520134d --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_5/afridiacritics_yaml @@ -0,0 +1,25 @@ +tag: +- adr_tasks +- adr_prompt_5 +dataset_path: masakhane/diacritics-restoration +dataset_kwargs: {trust_remote_code: True} +doc_to_target: target +output_type: generate_until +fewshot_split: dev +test_split: test +training_split: train +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + do_sample: false + until: + - '' + - + - <|im_end|> +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/adr/prompt_5/afridiacritics_yor.yaml b/lm_eval/tasks/afrobench/adr/prompt_5/afridiacritics_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bd1c9007a394de95a19c1a09b398614366537a1f --- /dev/null +++ b/lm_eval/tasks/afrobench/adr/prompt_5/afridiacritics_yor.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: 'You are a linguist specializing in diacritical marks for Yoruba. Diacritics + are essential for proper pronunciation and meaning in Yoruba. You are tasked with + converting Yoruba sentences without diacritics into their correctly accented forms. + Here''s the input: {{text}}. 
Return output sentence only' +include: afridiacritics_yaml +task: afridiacritics_yor_prompt_5 diff --git a/lm_eval/tasks/afrobench/afriqa/README.md b/lm_eval/tasks/afrobench/afriqa/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8730d7c8d8d68b6b83dfad3d4f584534b048d111 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/README.md @@ -0,0 +1,24 @@ +# + +## Paper +Title: `AfriQA: Cross-lingual Open-Retrieval Question Answering for African Languages` + +Paper Link: https://arxiv.org/abs/2305.06897 + +## Abstract +>AfriQA is the first cross-lingual question answering (QA) dataset with a focus on African languages. The dataset includes over 12,000 XOR QA examples across 10 African languages, making it an invaluable resource for developing more equitable QA technology. African languages have historically been underserved in the digital landscape, with far less in-language content available online. This makes it difficult for QA systems to provide accurate information to users in their native language. However, cross-lingual open-retrieval question answering (XOR QA) systems can help fill this gap by retrieving answer content from other languages. AfriQA focuses specifically on African languages where cross-lingual answer content is the only high-coverage source of information. Previous datasets have primarily focused on languages where cross-lingual QA augments coverage from the target language, but AfriQA highlights the importance of African languages as a realistic use case for XOR QA. + +HomePage: https://github.com/masakhane-io/afriqa + +### Citation + +``` +@misc{ogundepo2023afriqa, + title={AfriQA: Cross-lingual Open-Retrieval Question Answering for African Languages}, + author={Odunayo Ogundepo and Tajuddeen R. Gwadabe and Clara E. Rivera and Jonathan H. Clark and Sebastian Ruder and David Ifeoluwa Adelani and Bonaventure F. P. 
Dossou and Abdou Aziz DIOP and Claytone Sikasote and Gilles Hacheme and Happy Buzaaba and Ignatius Ezeani and Rooweither Mabuya and Salomey Osei and Chris Emezue and Albert Njoroge Kahira and Shamsuddeen H. Muhammad and Akintunde Oladipo and Abraham Toluwase Owodunni and Atnafu Lambebo Tonja and Iyanuoluwa Shode and Akari Asai and Tunde Oluwaseyi Ajayi and Clemencia Siro and Steven Arthur and Mofetoluwa Adeyemi and Orevaoghene Ahia and Aremu Anuoluwapo and Oyinkansola Awosan and Chiamaka Chukwuneke and Bernard Opoku and Awokoya Ayodele and Verrah Otiende and Christine Mwase and Boyd Sinkala and Andre Niyongabo Rubungo and Daniel A. Ajisafe and Emeka Felix Onwuegbuzia and Habib Mbow and Emile Niyomutabazi and Eunice Mukonde and Falalu Ibrahim Lawan and Ibrahim Said Ahmad and Jesujoba O. Alabi and Martin Namukombo and Mbonu Chinedu and Mofya Phiri and Neo Putini and Ndumiso Mngoma and Priscilla A. Amuok and Ruqayya Nasir Iro and Sonia Adhiambo}, + year={2023}, + eprint={2305.06897}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` diff --git a/lm_eval/tasks/afrobench/afriqa/afriqa.yaml b/lm_eval/tasks/afrobench/afriqa/afriqa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..80810ca4c1195f281b6eaa9581bf420ff4582291 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/afriqa.yaml @@ -0,0 +1,13 @@ +group: afriqa +task: + - afriqa_prompt_1 + - afriqa_prompt_2 + - afriqa_prompt_3 + - afriqa_prompt_4 + - afriqa_prompt_5 +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 1 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa b/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa new file mode 100644 index 0000000000000000000000000000000000000000..d9b6218e766a57309804e9514cf9d9682cf49131 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa @@ -0,0 +1,42 @@ +tag: + - afrobench_xqa_tasks + - afriqa_prompt_1 +dataset_kwargs: {trust_remote_code: True} +dataset_path: 
masakhane/afriqa-gold-passages +dataset_name: null +output_type: generate_until +test_split: test +fewshot_split: train +doc_to_target: answer_pivot +should_decontaminate: true +doc_to_decontamination_query: question_lang +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +filter_list: + - name: remove_whitespace + filter: + - function: remove_whitespace + - function: take_first +target_delimiter: " " +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + - "." + - "," + - "\\$" + - metric: f1 + aggregation: !function utils.f1 + higher_is_better: true + ignore_case: true + ignore_punctuation: true + - "." + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa_bem.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa_bem.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a3b639a833ee4e8fb32d992538b747ab92b1f360 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa_bem.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: bem +doc_to_text: 'Your task is to answer a qestion given a context.Make sure you respond + with the shortest span containing the answer in the context. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_bem_prompt_1 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa_fon.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa_fon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c51196157f95e96315d0321e4b53859ef8e5ae35 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa_fon.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: fon +doc_to_text: 'Your task is to answer a qestion given a context.Make sure you respond + with the shortest span containing the answer in the context. 
+ + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_fon_prompt_1 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa_hau.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0536590ac3486f815ac35d5333b8a6a268fd851a --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa_hau.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: 'Your task is to answer a qestion given a context.Make sure you respond + with the shortest span containing the answer in the context. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_hau_prompt_1 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa_ibo.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..62eb71160c6cf546b274b8724cd8955c0b0e86c8 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa_ibo.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: 'Your task is to answer a qestion given a context.Make sure you respond + with the shortest span containing the answer in the context. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_ibo_prompt_1 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa_kin.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e632c4beef739a3e299280071229521db78cbf21 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa_kin.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: 'Your task is to answer a qestion given a context.Make sure you respond + with the shortest span containing the answer in the context. 
+ + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_kin_prompt_1 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa_swa.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dbdebe14dccc3eda73ee706e3327a13a43a3aa81 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa_swa.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: 'Your task is to answer a qestion given a context.Make sure you respond + with the shortest span containing the answer in the context. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +fewshot_split: test +fewshot_config: + sampler: first_n +task: afriqa_swa_prompt_1 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa_twi.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..67ba171569e34842e2c5af86875d2495f4715421 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa_twi.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: 'Your task is to answer a qestion given a context.Make sure you respond + with the shortest span containing the answer in the context. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_twi_prompt_1 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa_yor.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..51d20e43e0b4dcb6d5da0e61c22ad827085b253f --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa_yor.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: 'Your task is to answer a qestion given a context.Make sure you respond + with the shortest span containing the answer in the context. 
+ + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_yor_prompt_1 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa_zul.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1c254b96e565e01750c6a838b427f926ed3f40d0 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_1/afriqa_zul.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: 'Your task is to answer a qestion given a context.Make sure you respond + with the shortest span containing the answer in the context. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_zul_prompt_1 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_1/utils.py b/lm_eval/tasks/afrobench/afriqa/prompt_1/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..eae1d885037da14892b39715c49e4d3aac61f06f --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_1/utils.py @@ -0,0 +1,53 @@ +import re +import string +from collections import Counter + + +def normalize_answer(s): + """ + Taken from the official evaluation script for v1.1 of the SQuAD dataset. + Lower text and remove punctuation, articles and extra whitespace. + """ + + def remove_articles(text): + return re.sub(r"\b(a|an|the)\b", " ", text) + + def white_space_fix(text): + return " ".join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return "".join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def f1(items): + """ + Taken from the official evaluation script for v1.1 of the SQuAD dataset. 
+ """ + + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + + f1_list = [] + + for i in range(len(golds)): + prediction_tokens = normalize_answer(preds[i]).split() + references_tokens = normalize_answer(golds[i]).split() + common = Counter(prediction_tokens) & Counter(references_tokens) + num_same = sum(common.values()) + if num_same == 0: + f1_score = 0 + else: + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(references_tokens) + f1_score = (2 * precision * recall) / (precision + recall) + + f1_list.append(f1_score) + + return sum(f1_list) / len(f1_list) diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa b/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa new file mode 100644 index 0000000000000000000000000000000000000000..d53ce05b168b8ffaf1325167aaee6537b9b2dbbe --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa @@ -0,0 +1,42 @@ +tag: + - afrobench_xqa_tasks + - afriqa_prompt_2 +dataset_kwargs: {trust_remote_code: True} +dataset_path: masakhane/afriqa-gold-passages +dataset_name: null +output_type: generate_until +test_split: test +fewshot_split: train +doc_to_target: answer_pivot +should_decontaminate: true +doc_to_decontamination_query: question_lang +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +filter_list: + - name: remove_whitespace + filter: + - function: remove_whitespace + - function: take_first +target_delimiter: " " +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + - "." + - "," + - "\\$" + - metric: f1 + aggregation: !function utils.f1 + higher_is_better: true + ignore_case: true + ignore_punctuation: true + - "." 
+ - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa_bem.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa_bem.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2469c7f434e133f0b94c41940917b17368566dfe --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa_bem.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: bem +doc_to_text: 'Your task is to answer a question given a context. The question is in + Bemba, while the context is in English or French.Make sure you respond with the + shortest span in the context that contains the answer. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_bem_prompt_2 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa_fon.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa_fon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..384db44987a074c88615164e9da85b71fad2da37 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa_fon.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: fon +doc_to_text: 'Your task is to answer a question given a context. The question is in + Fon, while the context is in English or French.Make sure you respond with the shortest + span in the context that contains the answer. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_fon_prompt_2 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa_hau.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..40c942eced4451a620541f0cce6fe87d8f82e5cb --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa_hau.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: 'Your task is to answer a question given a context. 
The question is in + Hausa, while the context is in English or French.Make sure you respond with the + shortest span in the context that contains the answer. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_hau_prompt_2 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa_ibo.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8198795d2d1d79855d45d1c00800e56c8ad5742b --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa_ibo.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: 'Your task is to answer a question given a context. The question is in + Igbo, while the context is in English or French.Make sure you respond with the shortest + span in the context that contains the answer. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_ibo_prompt_2 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa_kin.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7a238ae5cc6f687aeb84d88808f994b66464d8bd --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa_kin.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: 'Your task is to answer a question given a context. The question is in + Kinyarwanda, while the context is in English or French.Make sure you respond with + the shortest span in the context that contains the answer. 
+ + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_kin_prompt_2 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa_swa.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4be94d07d9ed58745b6e87d38bfe927d20116137 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa_swa.yaml @@ -0,0 +1,16 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: 'Your task is to answer a question given a context. The question is in + Swahili, while the context is in English or French.Make sure you respond with the + shortest span in the context that contains the answer. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +fewshot_split: test +fewshot_config: + sampler: first_n +task: afriqa_swa_prompt_2 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa_twi.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f08487d0c539b519b2707cc671f7ceda3b50387d --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa_twi.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: 'Your task is to answer a question given a context. The question is in + Twi, while the context is in English or French.Make sure you respond with the shortest + span in the context that contains the answer. 
+ + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_twi_prompt_2 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa_yor.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..44aee11a143976b845c8cb3a9c6a1d8cf01ccf17 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa_yor.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: 'Your task is to answer a question given a context. The question is in + Yoruba, while the context is in English or French.Make sure you respond with the + shortest span in the context that contains the answer. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_yor_prompt_2 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa_zul.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..99c5b18fa243c47f843862950f07e1211745f8b5 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_2/afriqa_zul.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: 'Your task is to answer a question given a context. The question is in + Zulu, while the context is in English or French.Make sure you respond with the shortest + span in the context that contains the answer. 
+ + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_zul_prompt_2 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_2/utils.py b/lm_eval/tasks/afrobench/afriqa/prompt_2/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..eae1d885037da14892b39715c49e4d3aac61f06f --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_2/utils.py @@ -0,0 +1,53 @@ +import re +import string +from collections import Counter + + +def normalize_answer(s): + """ + Taken from the official evaluation script for v1.1 of the SQuAD dataset. + Lower text and remove punctuation, articles and extra whitespace. + """ + + def remove_articles(text): + return re.sub(r"\b(a|an|the)\b", " ", text) + + def white_space_fix(text): + return " ".join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return "".join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def f1(items): + """ + Taken from the official evaluation script for v1.1 of the SQuAD dataset. 
+ """ + + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + + f1_list = [] + + for i in range(len(golds)): + prediction_tokens = normalize_answer(preds[i]).split() + references_tokens = normalize_answer(golds[i]).split() + common = Counter(prediction_tokens) & Counter(references_tokens) + num_same = sum(common.values()) + if num_same == 0: + f1_score = 0 + else: + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(references_tokens) + f1_score = (2 * precision * recall) / (precision + recall) + + f1_list.append(f1_score) + + return sum(f1_list) / len(f1_list) diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa b/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa new file mode 100644 index 0000000000000000000000000000000000000000..79a923b1b30075d31407e804641b71339e9bedb0 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa @@ -0,0 +1,42 @@ +tag: + - afrobench_xqa_tasks + - afriqa_prompt_3 +dataset_kwargs: {trust_remote_code: True} +dataset_path: masakhane/afriqa-gold-passages +dataset_name: null +output_type: generate_until +test_split: test +fewshot_split: train +doc_to_target: answer_pivot +should_decontaminate: true +doc_to_decontamination_query: question_lang +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +filter_list: + - name: remove_whitespace + filter: + - function: remove_whitespace + - function: take_first +target_delimiter: " " +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + - "." + - "," + - "\\$" + - metric: f1 + aggregation: !function utils.f1 + higher_is_better: true + ignore_case: true + ignore_punctuation: true + - "." 
+ - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa_bem.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa_bem.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3af92f5a4abc656a862c701757d056b836506582 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa_bem.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: bem +doc_to_text: 'Given the context, provide the answer to the following question.Ensure + your response is concise and directly from the context. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_bem_prompt_3 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa_fon.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa_fon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..73c12439632863ee0cc49a84459abd0dbe4ea985 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa_fon.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: fon +doc_to_text: 'Given the context, provide the answer to the following question.Ensure + your response is concise and directly from the context. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_fon_prompt_3 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa_hau.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ff08d081971aa54fd645c9e626231e82341ecabb --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa_hau.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: 'Given the context, provide the answer to the following question.Ensure + your response is concise and directly from the context. 
+ + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_hau_prompt_3 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa_ibo.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..12f18a0bff19f69cbfaa0fce86cf9d61ecd08136 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa_ibo.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: 'Given the context, provide the answer to the following question.Ensure + your response is concise and directly from the context. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_ibo_prompt_3 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa_kin.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e92dec41c227b0d14e2a646893b7fe4dc55e5425 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa_kin.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: 'Given the context, provide the answer to the following question.Ensure + your response is concise and directly from the context. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_kin_prompt_3 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa_swa.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..30c574e5fa77ad65dc91d2670720945cdb67c032 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa_swa.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: 'Given the context, provide the answer to the following question.Ensure + your response is concise and directly from the context. 
+ + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +fewshot_split: test +fewshot_config: + sampler: first_n +task: afriqa_swa_prompt_3 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa_twi.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b08534d9bb98b2603a405aa7d8888a8d4230a52c --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa_twi.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: 'Given the context, provide the answer to the following question.Ensure + your response is concise and directly from the context. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_twi_prompt_3 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa_yor.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d3c74ce7c3a86d7df40572096bdddf75ee5321c5 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa_yor.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: 'Given the context, provide the answer to the following question.Ensure + your response is concise and directly from the context. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_yor_prompt_3 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa_zul.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c54b0bd7f054dc8111c4c72e8fca55d7e7f0a4e6 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_3/afriqa_zul.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: 'Given the context, provide the answer to the following question.Ensure + your response is concise and directly from the context. 
+ + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_zul_prompt_3 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_3/utils.py b/lm_eval/tasks/afrobench/afriqa/prompt_3/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..eae1d885037da14892b39715c49e4d3aac61f06f --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_3/utils.py @@ -0,0 +1,53 @@ +import re +import string +from collections import Counter + + +def normalize_answer(s): + """ + Taken from the official evaluation script for v1.1 of the SQuAD dataset. + Lower text and remove punctuation, articles and extra whitespace. + """ + + def remove_articles(text): + return re.sub(r"\b(a|an|the)\b", " ", text) + + def white_space_fix(text): + return " ".join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return "".join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def f1(items): + """ + Taken from the official evaluation script for v1.1 of the SQuAD dataset. 
+ """ + + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + + f1_list = [] + + for i in range(len(golds)): + prediction_tokens = normalize_answer(preds[i]).split() + references_tokens = normalize_answer(golds[i]).split() + common = Counter(prediction_tokens) & Counter(references_tokens) + num_same = sum(common.values()) + if num_same == 0: + f1_score = 0 + else: + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(references_tokens) + f1_score = (2 * precision * recall) / (precision + recall) + + f1_list.append(f1_score) + + return sum(f1_list) / len(f1_list) diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa b/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa new file mode 100644 index 0000000000000000000000000000000000000000..e251f1e27fab773d7fd54364ebfc870819df5d55 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa @@ -0,0 +1,42 @@ +tag: + - afrobench_xqa_tasks + - afriqa_prompt_4 +dataset_kwargs: {trust_remote_code: True} +dataset_path: masakhane/afriqa-gold-passages +dataset_name: null +output_type: generate_until +test_split: test +fewshot_split: train +doc_to_target: answer_pivot +should_decontaminate: true +doc_to_decontamination_query: question_lang +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +filter_list: + - name: remove_whitespace + filter: + - function: remove_whitespace + - function: take_first +target_delimiter: " " +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + - "." + - "," + - "\\$" + - metric: f1 + aggregation: !function utils.f1 + higher_is_better: true + ignore_case: true + ignore_punctuation: true + - "." 
+ - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa_bem.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa_bem.yaml new file mode 100644 index 0000000000000000000000000000000000000000..db3d1c2a142ae6b6b1bd86678027df58c8715785 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa_bem.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: bem +doc_to_text: 'You are an AI assistant and your task is to answer the question based + on the provided context.Your answer should be the shortest span that contains the + answer within the context. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_bem_prompt_4 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa_fon.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa_fon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0c65dd07265d54a8c2aad56c563073fab7b38b49 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa_fon.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: fon +doc_to_text: 'You are an AI assistant and your task is to answer the question based + on the provided context.Your answer should be the shortest span that contains the + answer within the context. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_fon_prompt_4 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa_hau.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..baeaf020b05b04e025024ac7dd4d3f6bf86caaa9 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa_hau.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: 'You are an AI assistant and your task is to answer the question based + on the provided context.Your answer should be the shortest span that contains the + answer within the context. 
+ + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_hau_prompt_4 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa_ibo.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6db1cc71614a9e316eeb3b12a3f0740d9cb6e671 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa_ibo.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: 'You are an AI assistant and your task is to answer the question based + on the provided context.Your answer should be the shortest span that contains the + answer within the context. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_ibo_prompt_4 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa_kin.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dc8f3678207cdbc1dab076573ed7aeea02b19b92 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa_kin.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: 'You are an AI assistant and your task is to answer the question based + on the provided context.Your answer should be the shortest span that contains the + answer within the context. 
+ + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_kin_prompt_4 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa_swa.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4fe8fbcdf2a298f9b8c58047da94c24cddc72fdc --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa_swa.yaml @@ -0,0 +1,16 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: 'You are an AI assistant and your task is to answer the question based + on the provided context.Your answer should be the shortest span that contains the + answer within the context. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +fewshot_split: test +fewshot_config: + sampler: first_n +task: afriqa_swa_prompt_4 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa_twi.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d679cd0bb6173696e7555158356b067f2eea895c --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa_twi.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: 'You are an AI assistant and your task is to answer the question based + on the provided context.Your answer should be the shortest span that contains the + answer within the context. 
+ + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_twi_prompt_4 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa_yor.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6011dc3313ff35fe55ed204529c5d0c1503d468a --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa_yor.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: 'You are an AI assistant and your task is to answer the question based + on the provided context.Your answer should be the shortest span that contains the + answer within the context. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_yor_prompt_4 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa_zul.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..26a6ccad3efe93ea094a64189b5efbe1dddb9734 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_4/afriqa_zul.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: 'You are an AI assistant and your task is to answer the question based + on the provided context.Your answer should be the shortest span that contains the + answer within the context. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_zul_prompt_4 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_4/utils.py b/lm_eval/tasks/afrobench/afriqa/prompt_4/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..eae1d885037da14892b39715c49e4d3aac61f06f --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_4/utils.py @@ -0,0 +1,53 @@ +import re +import string +from collections import Counter + + +def normalize_answer(s): + """ + Taken from the official evaluation script for v1.1 of the SQuAD dataset. 
+ Lower text and remove punctuation, articles and extra whitespace. + """ + + def remove_articles(text): + return re.sub(r"\b(a|an|the)\b", " ", text) + + def white_space_fix(text): + return " ".join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return "".join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def f1(items): + """ + Taken from the official evaluation script for v1.1 of the SQuAD dataset. + """ + + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + + f1_list = [] + + for i in range(len(golds)): + prediction_tokens = normalize_answer(preds[i]).split() + references_tokens = normalize_answer(golds[i]).split() + common = Counter(prediction_tokens) & Counter(references_tokens) + num_same = sum(common.values()) + if num_same == 0: + f1_score = 0 + else: + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(references_tokens) + f1_score = (2 * precision * recall) / (precision + recall) + + f1_list.append(f1_score) + + return sum(f1_list) / len(f1_list) diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa b/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa new file mode 100644 index 0000000000000000000000000000000000000000..fab00068beb951dbab88d4baa870fabfced4f820 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa @@ -0,0 +1,42 @@ +tag: + - afrobench_xqa_tasks + - afriqa_prompt_5 +dataset_kwargs: {trust_remote_code: True} +dataset_path: masakhane/afriqa-gold-passages +dataset_name: null +output_type: generate_until +test_split: test +fewshot_split: train +doc_to_target: answer_pivot +should_decontaminate: true +doc_to_decontamination_query: question_lang +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +filter_list: + - name: remove_whitespace + filter: + - function: remove_whitespace + - function: 
take_first +target_delimiter: " " +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + - "." + - "," + - "\\$" + - metric: f1 + aggregation: !function utils.f1 + higher_is_better: true + ignore_case: true + ignore_punctuation: true + - "." + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa_bem.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa_bem.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4288845d30d2e3ec1620dd1a226bf17d385322be --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa_bem.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: bem +doc_to_text: 'Using the context, find the answer to the question.Respond with the + briefest span that includes the answer from the context. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_bem_prompt_5 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa_fon.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa_fon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c234e944783b0c456b1d0a0599d43a1d569ad18f --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa_fon.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: fon +doc_to_text: 'Using the context, find the answer to the question.Respond with the + briefest span that includes the answer from the context. 
+ + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_fon_prompt_5 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa_hau.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..34823c9e47d2e8d614df530559c638973a02d956 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa_hau.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: 'Using the context, find the answer to the question.Respond with the + briefest span that includes the answer from the context. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_hau_prompt_5 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa_ibo.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6582d2d56632e96a3d14c646a47dd7d4b55b1652 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa_ibo.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: 'Using the context, find the answer to the question.Respond with the + briefest span that includes the answer from the context. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_ibo_prompt_5 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa_kin.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ed9d6517878d8ba4e29342c915648fdb48fdd45e --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa_kin.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: 'Using the context, find the answer to the question.Respond with the + briefest span that includes the answer from the context. 
+ + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_kin_prompt_5 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa_swa.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dfcfb147f8d7789d89e0521129ba1b01f1725384 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa_swa.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: 'Using the context, find the answer to the question.Respond with the + briefest span that includes the answer from the context. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +fewshot_split: test +fewshot_config: + sampler: first_n +task: afriqa_swa_prompt_5 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa_twi.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cde555cf760b159378dbc137a4759ab3c87a6b4a --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa_twi.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: 'Using the context, find the answer to the question.Respond with the + briefest span that includes the answer from the context. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_twi_prompt_5 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa_yor.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c9fa17e82c271118309bf6769efff0635cf94230 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa_yor.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: 'Using the context, find the answer to the question.Respond with the + briefest span that includes the answer from the context. 
+ + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_yor_prompt_5 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa_zul.yaml b/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..427e7217f4b113e28ad3efc568b35ab093eed463 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_5/afriqa_zul.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: 'Using the context, find the answer to the question.Respond with the + briefest span that includes the answer from the context. + + Question: {{question_lang}} + + Context: {{context}} + + Answer:' +include: afriqa +task: afriqa_zul_prompt_5 diff --git a/lm_eval/tasks/afrobench/afriqa/prompt_5/utils.py b/lm_eval/tasks/afrobench/afriqa/prompt_5/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..eae1d885037da14892b39715c49e4d3aac61f06f --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/prompt_5/utils.py @@ -0,0 +1,53 @@ +import re +import string +from collections import Counter + + +def normalize_answer(s): + """ + Taken from the official evaluation script for v1.1 of the SQuAD dataset. + Lower text and remove punctuation, articles and extra whitespace. + """ + + def remove_articles(text): + return re.sub(r"\b(a|an|the)\b", " ", text) + + def white_space_fix(text): + return " ".join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return "".join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def f1(items): + """ + Taken from the official evaluation script for v1.1 of the SQuAD dataset. 
+ """ + + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + + f1_list = [] + + for i in range(len(golds)): + prediction_tokens = normalize_answer(preds[i]).split() + references_tokens = normalize_answer(golds[i]).split() + common = Counter(prediction_tokens) & Counter(references_tokens) + num_same = sum(common.values()) + if num_same == 0: + f1_score = 0 + else: + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(references_tokens) + f1_score = (2 * precision * recall) / (precision + recall) + + f1_list.append(f1_score) + + return sum(f1_list) / len(f1_list) diff --git a/lm_eval/tasks/afrobench/afriqa/utils.py b/lm_eval/tasks/afrobench/afriqa/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5fef58f013ff9d31da0a952e1315cc09b53c2e74 --- /dev/null +++ b/lm_eval/tasks/afrobench/afriqa/utils.py @@ -0,0 +1,125 @@ +import argparse +import os + +import yaml + + +class FunctionTag: + def __init__(self, value): + self.value = value + + +def prompt_func(mode, lang): + prompt_map = { + "prompt_1": "Your task is to answer a question given a context." + "Make sure you respond with the shortest span containing the answer in the context.\n" + "Question: {{question_lang}}\n" + "Context: {{context}}\n" + "Answer:", + "prompt_2": f"Your task is to answer a question given a context. The question is in {lang}, while the context is in English or French." + "Make sure you respond with the shortest span in the context that contains the answer.\n" + "Question: {{question_lang}}\n" + "Context: {{context}}\n" + "Answer:", + "prompt_3": "Given the context, provide the answer to the following question." + "Ensure your response is concise and directly from the context.\n" + "Question: {{question_lang}}\n" + "Context: {{context}}\n" + "Answer:", + "prompt_4": "You are an AI assistant and your task is to answer the question based on the provided context." 
+ "Your answer should be the shortest span that contains the answer within the context.\n" + "Question: {{question_lang}}\n" + "Context: {{context}}\n" + "Answer:", + "prompt_5": "Using the context, find the answer to the question." + "Respond with the briefest span that includes the answer from the context.\n" + "Question: {{question_lang}}\n" + "Context: {{context}}\n" + "Answer:", + } + return prompt_map[mode] + + +def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: + """ + Generate a yaml file for each language. + + :param output_dir: The directory to output the files to. + :param overwrite: Whether to overwrite files if they already exist. + """ + err = [] + languages = { + "bem": "Bemba", + "fon": "Fon", + "hau": "Hausa", + "ibo": "Igbo", + "kin": "Kinyarwanda", + "swa": "Swahili", + "twi": "Twi", + "wol": "Wolof", + "yor": "Yoruba", + "zul": "Zulu", + } + + for lang in languages.keys(): + try: + file_name = f"afriqa_{lang}.yaml" + task_name = f"afriqa_{lang}_{mode}" + yaml_template = "afriqa" + yaml_details = { + "include": yaml_template, + "task": task_name, + "dataset_name": lang, + "doc_to_text": prompt_func(mode, languages[lang]), + } + file_path = os.path.join(output_dir, mode) + os.makedirs(file_path, exist_ok=True) + + with open( + f"{output_dir}/{mode}/{file_name}", + "w" if overwrite else "x", + encoding="utf8", + ) as f: + f.write("# Generated by utils.py\n") + yaml.dump( + yaml_details, + f, + allow_unicode=True, + ) + except FileExistsError: + err.append(file_name) + + if len(err) > 0: + raise FileExistsError( + "Files were not created because they already exist (use --overwrite flag):" + f" {', '.join(err)}" + ) + + +def main() -> None: + """Parse CLI args and generate language-specific yaml files.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--overwrite", + default=True, + action="store_true", + help="Overwrite files if they already exist", + ) + parser.add_argument( + "--output-dir", + default="./", + 
help="Directory to write yaml files to", + ) + parser.add_argument( + "--mode", + default="prompt_1", + choices=["prompt_1", "prompt_2", "prompt_3", "prompt_4", "prompt_5"], + help="Prompt number", + ) + args = parser.parse_args() + + gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite, mode=args.mode) + + +if __name__ == "__main__": + main() diff --git a/lm_eval/tasks/afrobench/afrisenti/README.md b/lm_eval/tasks/afrobench/afrisenti/README.md new file mode 100644 index 0000000000000000000000000000000000000000..99bd489e3eb2cd99eb888a0c2903a4c6259668df --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/README.md @@ -0,0 +1,58 @@ +# + +## Paper +Title: `AfriSenti: A Twitter Sentiment Analysis Benchmark for African Languages` + +Paper Link: https://aclanthology.org/2023.emnlp-main.862/ + +## Abstract +>Africa is home to over 2,000 languages from over six language families and has the highest linguistic diversity among all continents. This includes 75 languages with at least one million speakers each. Yet, there is little NLP research conducted on African languages. Crucial in enabling such research is the availability of high-quality annotated datasets. In this paper, we introduce AfriSenti, a sentiment analysis benchmark that contains a total of >110,000 tweets in 14 African languages (Amharic, Algerian Arabic, Hausa, Igbo, Kinyarwanda, Moroccan Arabic, Mozambican Portuguese, Nigerian Pidgin, Oromo, Swahili, Tigrinya, Twi, Xitsonga, and Yoruba) from four language families. The tweets were annotated by native speakers and used in the AfriSenti-SemEval shared task (with over 200 participants, see website: https://afrisenti-semeval.github.io). We describe the data collection methodology, annotation process, and the challenges we dealt with when curating each dataset. We further report baseline experiments conducted on the AfriSenti datasets and discuss their usefulness. 
+ +HomePage: https://github.com/afrisenti-semeval/afrisent-semeval-2023 + +### Citation + +``` +@inproceedings{muhammad-etal-2023-afrisenti, + title = "{A}fri{S}enti: A {T}witter Sentiment Analysis Benchmark for {A}frican Languages", + author = "Muhammad, Shamsuddeen Hassan and + Abdulmumin, Idris and + Ayele, Abinew Ali and + Ousidhoum, Nedjma and + Adelani, David Ifeoluwa and + Yimam, Seid Muhie and + Ahmad, Ibrahim Sa'id and + Beloucif, Meriem and + Mohammad, Saif M. and + Ruder, Sebastian and + Hourrane, Oumaima and + Brazdil, Pavel and + Jorge, Alipio and + Ali, Felermino D{\'a}rio M{\'a}rio Ant{\'o}nio and + David, Davis and + Osei, Salomey and + Shehu Bello, Bello and + Ibrahim, Falalu and + Gwadabe, Tajuddeen and + Rutunda, Samuel and + Belay, Tadesse and + Messelle, Wendimu Baye and + Balcha, Hailu Beshada and + Chala, Sisay Adugna and + Gebremichael, Hagos Tesfahun and + Opoku, Bernard and + Arthur, Stephen", + editor = "Bouamor, Houda and + Pino, Juan and + Bali, Kalika", + booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing", + month = dec, + year = "2023", + address = "Singapore", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2023.emnlp-main.862/", + doi = "10.18653/v1/2023.emnlp-main.862", + pages = "13968--13981", + abstract = "Africa is home to over 2,000 languages from over six language families and has the highest linguistic diversity among all continents. This includes 75 languages with at least one million speakers each. Yet, there is little NLP research conducted on African languages. Crucial in enabling such research is the availability of high-quality annotated datasets. 
In this paper, we introduce AfriSenti, a sentiment analysis benchmark that contains a total of {\ensuremath{>}}110,000 tweets in 14 African languages (Amharic, Algerian Arabic, Hausa, Igbo, Kinyarwanda, Moroccan Arabic, Mozambican Portuguese, Nigerian Pidgin, Oromo, Swahili, Tigrinya, Twi, Xitsonga, and Yoruba) from four language families. The tweets were annotated by native speakers and used in the AfriSenti-SemEval shared task (with over 200 participants, see website: https://afrisenti-semeval.github.io). We describe the data collection methodology, annotation process, and the challenges we dealt with when curating each dataset. We further report baseline experiments conducted on the AfriSenti datasets and discuss their usefulness." +} +``` diff --git a/lm_eval/tasks/afrobench/afrisenti/afrisenti.yaml b/lm_eval/tasks/afrobench/afrisenti/afrisenti.yaml new file mode 100644 index 0000000000000000000000000000000000000000..36a1efdb3033e70060251e346847b73fd9de2f60 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/afrisenti.yaml @@ -0,0 +1,13 @@ +group: afrisenti +task: + - afrisenti_prompt_1 + - afrisenti_prompt_2 + - afrisenti_prompt_3 + - afrisenti_prompt_4 + - afrisenti_prompt_5 +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 1 diff --git a/lm_eval/tasks/afrobench/afrisenti/fewshot.sh b/lm_eval/tasks/afrobench/afrisenti/fewshot.sh new file mode 100644 index 0000000000000000000000000000000000000000..428d455b65ac917efee1810a68626f36e777e2d9 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/fewshot.sh @@ -0,0 +1,109 @@ +lm_eval --model hf \ + --model_args pretrained=masakhane/African-ultrachat-alpaca \ + --tasks 
afrimmlu_direct_amh,afrimmlu_direct_eng,afrimmlu_direct_ewe,afrimmlu_direct_fra,afrimmlu_direct_hau,afrimmlu_direct_ibo,afrimmlu_direct_kin,afrimmlu_direct_lin,afrimmlu_direct_lug,afrimmlu_direct_orm,afrimmlu_direct_sna,afrimmlu_direct_sot,afrimmlu_direct_twi,afrimmlu_direct_wol,afrimmlu_direct_xho,afrimmlu_direct_yor,afrimmlu_direct_zul \ + --device cuda:0 \ + --batch_size 1 \ + --num_fewshot 0 \ + --verbosity DEBUG \ + --wandb_args project=afrimmlu + + +lm_eval --model hf \ + --model_args pretrained=bigscience/mt0-small,parallelize=true \ + --tasks afrisenti_amh_prompt_1,afrisenti_arq_prompt_1,afrisenti_ary_prompt_1,afrisenti_hau_prompt_1,afrisenti_ibo_prompt_1,afrisenti_kin_prompt_1,afrisenti_orm_prompt_1,afrisenti_pcm_prompt_1,afrisenti_por_prompt_1,afrisenti_swa_prompt_1,afrisenti_tir_prompt_1,afrisenti_tso_prompt_1,afrisenti_twi_prompt_1,afrisenti_yor_prompt_1\ + --device cuda:0 \ + --batch_size 1 \ + --num_fewshot 0 \ + --verbosity DEBUG \ + --limit 5 + + +lm_eval --model hf \ + --model_args pretrained=bigscience/mt0-xxl,parallelize=true \ + --tasks afrisenti_amh_prompt_1,afrisenti_arq_prompt_1,afrisenti_ary_prompt_1,afrisenti_hau_prompt_1,afrisenti_ibo_prompt_1,afrisenti_kin_prompt_1,afrisenti_orm_prompt_1,afrisenti_pcm_prompt_1,afrisenti_por_prompt_1,afrisenti_swa_prompt_1,afrisenti_tir_prompt_1,afrisenti_tso_prompt_1,afrisenti_twi_prompt_1,afrisenti_yor_prompt_1\ + --batch_size 128 \ + --num_fewshot 0 \ + --verbosity DEBUG + +lm_eval --model hf \ + --model_args pretrained=google/gemma-2-27b-it,parallelize=true,trust_remote_code=True \ + --tasks afriqa_wol_prompt_2\ + --batch_size 1 \ + --device 'cuda' \ + --num_fewshot 5 \ + --verbosity DEBUG \ + --output_path './afriqa_results/' \ + --log_samples + +lm_eval --model vllm \ + --model_args pretrained=meta-llama/Llama-2-7b-chat-hf,tensor_parallel_size=2,dtype='auto',gpu_memory_utilization=0.8,data_parallel_size=1 \ + --tasks 
masakhapos_pcm_prompt_1,masakhapos_pcm_prompt_2,masakhapos_pcm_prompt_3,masakhapos_pcm_prompt_4,masakhapos_pcm_prompt_5 \ + --batch_size 'auto' \ + --device 'cuda' \ + --num_fewshot 0 \ + --verbosity DEBUG \ + --limit 2 + + +lm_eval --model vllm \ + --model_args pretrained=meta-llama/Llama-2-7b-chat-hf,tensor_parallel_size=2,dtype='auto',gpu_memory_utilization=0.8,data_parallel_size=1 \ + --tasks masakhapos_pcm_prompt_1,masakhapos_pcm_prompt_2,masakhapos_pcm_prompt_3,masakhapos_bam_prompt_2,masakhapos_bbj_prompt_3 \ + --batch_size 'auto' \ + --device 'cuda' \ + --num_fewshot 0 \ + --verbosity DEBUG + +lm_eval --model vllm \ + --model_args pretrained=google/gemma-1.1-7b-it,tensor_parallel_size=2,dtype='auto',gpu_memory_utilization=0.8,data_parallel_size=1 \ + --tasks masakhaner_pcm_prompt_1\ + --batch_size 'auto' \ + --device 'cuda' \ + --num_fewshot 0 \ + --verbosity DEBUG \ + --limit 5 + +lm_eval --model vllm \ + --model_args pretrained=google/gemma-2-9b-it,tensor_parallel_size=2,dtype='auto',gpu_memory_utilization=0.8,data_parallel_size=1 \ + --tasks masakhaner_pcm_prompt_1,masakhaner_pcm_prompt_2,masakhaner_pcm_prompt_3,masakhaner_pcm_prompt_4,masakhaner_pcm_prompt_5\ + --batch_size 'auto' \ + --device 'cuda' \ + --num_fewshot 0 \ + --verbosity DEBUG \ + --limit 5 + +lm_eval --model vllm \ + --model_args pretrained=google/gemma-1.1-7b-it,tensor_parallel_size=2,dtype='auto',gpu_memory_utilization=0.8,data_parallel_size=1 \ + --tasks flores_eng_Latn-fuv_Latn_prompt_1,flores_eng_Latn-fuv_Latn_prompt_2,flores_eng_Latn-fuv_Latn_prompt_3,flores_fuv_Latn-eng_Latn_prompt_1,flores_fuv_Latn-eng_Latn_prompt_2,flores_fuv_Latn-eng_Latn_prompt_3 \ + --batch_size 'auto' \ + --device 'cuda' \ + --num_fewshot 0 \ + --verbosity DEBUG \ + --limit 2 + +lm_eval --model vllm \ + --model_args pretrained=google/gemma-2-27b-it,tensor_parallel_size=2,dtype='auto',gpu_memory_utilization=0.9,data_parallel_size=1 \ + --tasks 
masakhapos_twi_prompt_3,masakhapos_wol_prompt_3,masakhapos_xho_prompt_3,masakhapos_yor_prompt_3,masakhapos_zul_prompt_3\ + --batch_size 'auto' \ + --num_fewshot 5 \ + --verbosity DEBUG \ + --output_path './masakhapos_results/' \ + --log_samples + +lm_eval --model hf \ + --model_args pretrained=bigscience/mt0-small,parallelize=true \ + --tasks injongointent_amh_prompt_1,injongointent_eng_prompt_1,injongointent_yor_prompt_1,injongointent_ibo_prompt_1,injongointent_wol_prompt_1\ + --device 'mps' \ + --batch_size 1 \ + --num_fewshot 0 \ + --verbosity DEBUG \ + --limit 5 + +lm_eval --model hf \ + --model_args pretrained=google/gemma-3-27b-it,parallelize=true \ + --tasks afrobench_sentiment_tasks\ + --device 'cuda' \ + --batch_size 1 \ + --num_fewshot 0 \ + --verbosity DEBUG \ + --output_path './senti_results/' \ + --log_samples diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti new file mode 100644 index 0000000000000000000000000000000000000000..69ef6b2bc08bbc198e2c6610c7c40041db4d20a4 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti @@ -0,0 +1,41 @@ +tag: + - afrobench_sentiment_tasks + - afrisenti_prompt_1 +task: null +dataset_path: masakhane/afrisenti +dataset_name: null +dataset_kwargs: {trust_remote_code: True} +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: train +doc_to_text: 'Does this statement; "{{tweet}}" have a Neutral, Positive or Negative sentiment? 
Labels only' +doc_to_target: label +doc_to_choice: + - "negative" + - "positive" + - "neutral" +should_decontaminate: true +doc_to_decontamination_query: tweet +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_amh.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7eefbe867360070e0701a558c124ad4ad7da786a --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: amh +include: afrisenti +task: afrisenti_amh_prompt_1 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_arq.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_arq.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8b2e2522d94e2d0da8cb9efec72d225c7161f8e7 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_arq.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: arq +include: afrisenti +task: afrisenti_arq_prompt_1 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_ary.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_ary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8f9ef3f20d654a0e97e40a9dd4e3d6bd2e7d949b --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_ary.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ary +include: afrisenti +task: afrisenti_ary_prompt_1 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_hau.yaml 
b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f0ab9071abbc0211b8048743db43347bb5df1583 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: hau +include: afrisenti +task: afrisenti_hau_prompt_1 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_ibo.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b0176d08764dae9a5fd8af57dc903b6b55ab0124 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: ibo +include: afrisenti +task: afrisenti_ibo_prompt_1 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_kin.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..75bb717a6e22d931404d2c7cfc919cfa8f99453d --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: kin +include: afrisenti +task: afrisenti_kin_prompt_1 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_orm.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..65c63b06fbb721d59033e5f748c6899270e92831 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_orm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: orm +include: afrisenti +task: afrisenti_orm_prompt_1 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_pcm.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_pcm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0f24fe9fc01cc162294aa1b387676591450b2d39 --- /dev/null +++ 
b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_pcm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: pcm +include: afrisenti +task: afrisenti_pcm_prompt_1 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_por.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_por.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1e1b4cd60a1533ab6804624c8e967b46c37a69be --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_por.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: por +include: afrisenti +task: afrisenti_por_prompt_1 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_swa.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3386948ccf5eef56d293cc39d0811ab06e1a5127 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: swa +include: afrisenti +task: afrisenti_swa_prompt_1 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_tir.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_tir.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c4942628e8115f58a047d7819052b27cc50883e9 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_tir.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: tir +include: afrisenti +task: afrisenti_tir_prompt_1 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_tso.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_tso.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d500693270946b6581020bc68a38612bdfd4f033 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_tso.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: tso +include: afrisenti +task: afrisenti_tso_prompt_1 diff --git 
a/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_twi.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a68bb23dcaeedcc6c97768c42a1e49c26b425e40 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: twi +include: afrisenti +task: afrisenti_twi_prompt_1 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_yor.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fda98c2c82c6e323eae8c96843e657e07a4d9665 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_1/afrisenti_yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: yor +include: afrisenti +task: afrisenti_yor_prompt_1 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_1/run.sh b/lm_eval/tasks/afrobench/afrisenti/prompt_1/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..50d1a1338f87330219dd4c6f79fa85ef918bb21c --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_1/run.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +models=( + + "google/gemma-1.1-7b-it" + "CohereForAI/aya-101" + "meta-llama/Llama-2-7b-chat-hf" + "meta-llama/Meta-Llama-3-8B-Instruct" + "google/gemma-2-9b-it" + "bigscience/mt0-xxl" + "google/gemma-2-27b-it" + "meta-llama/Meta-Llama-3-70B-Instruct" +) +task=afrisenti_amh_prompt_1,afrisenti_arq_prompt_1,afrisenti_ary_prompt_1,afrisenti_hau_prompt_1,afrisenti_ibo_prompt_1,afrisenti_kin_prompt_1,afrisenti_pcm_prompt_1,afrisenti_por_prompt_1,afrisenti_swa_prompt_1,afrisenti_tir_prompt_1,afrisenti_tso_prompt_1,afrisenti_twi_prompt_1,afrisenti_yor_prompt_1 + +for model in "${models[@]}" +do + echo "Evaluating model: $model" + for fewshot in 0 5 + do + export OUTPUT_DIR=results/$fewshot + + mkdir -p "$OUTPUT_DIR" + + lm_eval --model hf \ + --model_args "pretrained=${model}" \ + --tasks $task\ + 
--device cuda:0 \ + --batch_size 16 \ + --output_path "$OUTPUT_DIR" \ + --num_fewshot $fewshot \ + --verbosity DEBUG + done +done diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_1/utils.py b/lm_eval/tasks/afrobench/afrisenti/prompt_1/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_1/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_1/xx.py b/lm_eval/tasks/afrobench/afrisenti/prompt_1/xx.py new file mode 100644 index 0000000000000000000000000000000000000000..ca0e325e526e33dfd19ba03a93652244977fc119 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_1/xx.py @@ -0,0 +1,13 @@ +from datasets import load_dataset + + +# ['amh', 'hau', 'ibo', 'arq', 'ary', 'yor', 'por', 'twi', 'tso', 'tir', 'orm', 'pcm', 'kin', 'swa'] + +data = load_dataset("masakhane/afrisenti", "pcm", trust_remote_code=True) +print(data) +print(data["test"][:5]) +# +# ['Naija', 'Pipo', 'wey', 'dey', 'for', 'inside', 'social', 'Media', 'sef', 'don', 'put', 'hand', 'for', 'ear', 'give', +# 'federal', 'goment', 'and', 'polical', 'leader', 'dem', 'ova', 'di', 'kilin', '.'] +# +# [6, 0, 14, 17, 2, 2, 6, 0, 7, 17, 16, 0, 2, 0, 16, 0, 0, 9, 0, 0, 11, 2, 8, 0, 1] diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti new file mode 100644 index 0000000000000000000000000000000000000000..879f2826c3f26025fcb5e41342f86ef3f9c6c677 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti @@ -0,0 +1,39 @@ +tag: + - afrobench_sentiment_tasks + - afrisent_prompt_2 +dataset_path: masakhane/afrisenti +dataset_name: null +dataset_kwargs: {trust_remote_code: True} +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: train +doc_to_target: label +doc_to_choice: + - "negative" + - "positive" 
+ - "neutral" +should_decontaminate: true +doc_to_decontamination_query: 'text: {{tweet}} \nlabel: ' +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_amh.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d97b2c25787d9338546dba3707afcda31ad31269 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_amh.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: Does this Amharic statement; '{{tweet}}' have a Neutral, Positive or + Negative sentiment? Labels only +include: afrisenti +task: afrisenti_amh_prompt_2 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_arq.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_arq.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c61e310dee3a9f97557aaaa8d465ce329ba29610 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_arq.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: arq +doc_to_text: Does this Algerian Arabic statement; '{{tweet}}' have a Neutral, Positive + or Negative sentiment? 
Labels only +include: afrisenti +task: afrisenti_arq_prompt_2 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_ary.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_ary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4e76d385b3dc5ee5bbbe817336ec9d78feef7eb7 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_ary.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ary +doc_to_text: Does this Moroccan Arabic statement; '{{tweet}}' have a Neutral, Positive + or Negative sentiment? Labels only +include: afrisenti +task: afrisenti_ary_prompt_2 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_hau.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f7b0ccb2811b30acc98fe594af33b0a38fb2a88b --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_hau.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: Does this Hausa statement; '{{tweet}}' have a Neutral, Positive or Negative + sentiment? Labels only +include: afrisenti +task: afrisenti_hau_prompt_2 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_ibo.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d4d6c6c8094bb5b41e5488c972f2702705c9f3d5 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_ibo.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: Does this Igbo statement; '{{tweet}}' have a Neutral, Positive or Negative + sentiment? 
Labels only +include: afrisenti +task: afrisenti_ibo_prompt_2 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_kin.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5067b9fb75cd1579a8bc915a3a010696ca60b177 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_kin.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: Does this Kinyarwanda statement; '{{tweet}}' have a Neutral, Positive + or Negative sentiment? Labels only +include: afrisenti +task: afrisenti_kin_prompt_2 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_orm.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e8abbbfbd73687a4249238a3f2ad988b85634531 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_orm.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: Does this Oromo statement; '{{tweet}}' have a Neutral, Positive or Negative + sentiment? Labels only +include: afrisenti +task: afrisenti_orm_prompt_2 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_pcm.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_pcm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4dd98925299e9b872d49bdb9bea0ebd2b48d1ec7 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_pcm.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: pcm +doc_to_text: Does this Nigerian Pidgin statement; '{{tweet}}' have a Neutral, Positive + or Negative sentiment? 
Labels only +include: afrisenti +task: afrisenti_pcm_prompt_2 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_por.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_por.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4b8beecff946f4764becafb103f890ea924926a9 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_por.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: por +doc_to_text: Does this Mozambique Portuguese statement; '{{tweet}}' have a Neutral, + Positive or Negative sentiment? Labels only +include: afrisenti +task: afrisenti_por_prompt_2 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_swa.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..496da1a1d1e2b4fcfa004e918af85e7321a1ed29 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_swa.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: Does this Swahili statement; '{{tweet}}' have a Neutral, Positive or + Negative sentiment? Labels only +include: afrisenti +task: afrisenti_swa_prompt_2 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_tir.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_tir.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3899c992ed3180d76f2ff677c148026da6d5e9a1 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_tir.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: tir +doc_to_text: Does this Tigrinya statement; '{{tweet}}' have a Neutral, Positive or + Negative sentiment? 
Labels only +include: afrisenti +task: afrisenti_tir_prompt_2 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_tso.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_tso.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b371b7479489bd5387d2d18cd7f2edab8496dc00 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_tso.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: tso +doc_to_text: Does this Xithonga statement; '{{tweet}}' have a Neutral, Positive or + Negative sentiment? Labels only +include: afrisenti +task: afrisenti_tso_prompt_2 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_twi.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c985efc4d32d30ae7f18ed5aac13c97d4dbe112b --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_twi.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: Does this Twi statement; '{{tweet}}' have a Neutral, Positive or Negative + sentiment? Labels only +include: afrisenti +task: afrisenti_twi_prompt_2 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_yor.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..78932ed4cfe5f88bc91a6a0d26eb8c33a71c1ecb --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_2/afrisenti_yor.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: Does this Yoruba statement; '{{tweet}}' have a Neutral, Positive or Negative + sentiment? 
Labels only +include: afrisenti +task: afrisenti_yor_prompt_2 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_2/run.sh b/lm_eval/tasks/afrobench/afrisenti/prompt_2/run.sh new file mode 100755 index 0000000000000000000000000000000000000000..48797912512124c9c5287dcdd654e5fa04a029b0 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_2/run.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +models=( + + "google/gemma-1.1-7b-it" + "CohereForAI/aya-101" + "meta-llama/Llama-2-7b-chat-hf" + "meta-llama/Meta-Llama-3-8B-Instruct" + "google/gemma-2-9b-it" + "bigscience/mt0-xxl" + "google/gemma-2-27b-it" + "meta-llama/Meta-Llama-3-70B-Instruct" +) + +for model in "${models[@]}" +do + echo "Evaluating model: $model" + for fewshot in 0 5 + do + export OUTPUT_DIR=./results/$fewshot + + mkdir -p "$OUTPUT_DIR" + + lm_eval --model hf \ + --model_args "pretrained=${model},parallelize: true" \ + --tasks afribench\ + --batch_size 256 \ + --output_path "$OUTPUT_DIR" \ + --num_fewshot $fewshot \ + --verbosity DEBUG \ + --limit 2 + done +done diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_2/utils.py b/lm_eval/tasks/afrobench/afrisenti/prompt_2/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_2/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_2/xx.py b/lm_eval/tasks/afrobench/afrisenti/prompt_2/xx.py new file mode 100644 index 0000000000000000000000000000000000000000..4aa0db7af761fd7ea8858383b4564130a374f223 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_2/xx.py @@ -0,0 +1,5 @@ +from datasets import load_dataset + + +data = load_dataset("HausaNLP/AfriSenti-Twitter", "yor", trust_remote_code=True) +print(data) diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti new file mode 100644 index 
0000000000000000000000000000000000000000..53cb77771f2cc6622fa4c67ea5ea20485df761d6 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti @@ -0,0 +1,39 @@ +tag: + - afrobench_sentiment_tasks + - afrisenti_prompt_3 +dataset_path: masakhane/afrisenti +dataset_name: null +dataset_kwargs: {trust_remote_code: True} +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: train +doc_to_target: label +doc_to_choice: + - "negative" + - "positive" + - "neutral" +should_decontaminate: true +doc_to_decontamination_query: 'text: {{tweet}} \nlabel: ' +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_amh.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2645b72befa6b0048986f68aa14b4d5ed60027dc --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_amh.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: "You are an assistant able to detect sentiments in tweets. \n\nGiven\ + \ the sentiment labels Neutral, Positive or Negative; what is the sentiment of the\ + \ Amharic statement below? Return only the labels. 
\n\ntext: {{tweet}} \nlabel:" +include: afrisenti +task: afrisenti_amh_prompt_3 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_arq.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_arq.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0b90f690e93f249e8c4668bb74a667ff19a39247 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_arq.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: arq +doc_to_text: "You are an assistant able to detect sentiments in tweets. \n\nGiven\ + \ the sentiment labels Neutral, Positive or Negative; what is the sentiment of the\ + \ Algerian Arabic statement below? Return only the labels. \n\ntext: {{tweet}} \n\ + label:" +include: afrisenti +task: afrisenti_arq_prompt_3 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_ary.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_ary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ba11ee3e5146db8bfef7680becfb95a6f0a9b6aa --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_ary.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: ary +doc_to_text: "You are an assistant able to detect sentiments in tweets. \n\nGiven\ + \ the sentiment labels Neutral, Positive or Negative; what is the sentiment of the\ + \ Moroccan Arabic statement below? Return only the labels. \n\ntext: {{tweet}} \n\ + label:" +include: afrisenti +task: afrisenti_ary_prompt_3 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_hau.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4f4e6b3fb3252929fcd2d0f30240ca8d4553a009 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_hau.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: "You are an assistant able to detect sentiments in tweets. 
\n\nGiven\ + \ the sentiment labels Neutral, Positive or Negative; what is the sentiment of the\ + \ Hausa statement below? Return only the labels. \n\ntext: {{tweet}} \nlabel:" +include: afrisenti +task: afrisenti_hau_prompt_3 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_ibo.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bbcc88d70447a6257d1acefa94b65336d6b330c1 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_ibo.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: "You are an assistant able to detect sentiments in tweets. \n\nGiven\ + \ the sentiment labels Neutral, Positive or Negative; what is the sentiment of the\ + \ Igbo statement below? Return only the labels. \n\ntext: {{tweet}} \nlabel:" +include: afrisenti +task: afrisenti_ibo_prompt_3 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_kin.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..52d84b2684f9a071c0dc6b5889ad790e3116b0fc --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_kin.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: "You are an assistant able to detect sentiments in tweets. \n\nGiven\ + \ the sentiment labels Neutral, Positive or Negative; what is the sentiment of the\ + \ Kinyarwanda statement below? Return only the labels. 
\n\ntext: {{tweet}} \nlabel:" +include: afrisenti +task: afrisenti_kin_prompt_3 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_orm.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e2d524bfd781b37cb156ce09fd7fc5aae493392b --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_orm.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: "You are an assistant able to detect sentiments in tweets. \n\nGiven\ + \ the sentiment labels Neutral, Positive or Negative; what is the sentiment of the\ + \ Oromo statement below? Return only the labels. \n\ntext: {{tweet}} \nlabel:" +include: afrisenti +task: afrisenti_orm_prompt_3 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_pcm.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_pcm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eb0ac8ff3bc1b3e0c8efa9fe9afdc40ea0c0690a --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_pcm.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: pcm +doc_to_text: "You are an assistant able to detect sentiments in tweets. \n\nGiven\ + \ the sentiment labels Neutral, Positive or Negative; what is the sentiment of the\ + \ Nigerian Pidgin statement below? Return only the labels. \n\ntext: {{tweet}} \n\ + label:" +include: afrisenti +task: afrisenti_pcm_prompt_3 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_por.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_por.yaml new file mode 100644 index 0000000000000000000000000000000000000000..821a4355b044844d3608c01950b949ce5f292ba2 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_por.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: por +doc_to_text: "You are an assistant able to detect sentiments in tweets. 
\n\nGiven\ + \ the sentiment labels Neutral, Positive or Negative; what is the sentiment of the\ + \ Mozambique Portuguese statement below? Return only the labels. \n\ntext: {{tweet}}\ + \ \nlabel:" +include: afrisenti +task: afrisenti_por_prompt_3 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_swa.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d8e92842e01b61cc815fb48f7a390c6f13587e18 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_swa.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: "You are an assistant able to detect sentiments in tweets. \n\nGiven\ + \ the sentiment labels Neutral, Positive or Negative; what is the sentiment of the\ + \ Swahili statement below? Return only the labels. \n\ntext: {{tweet}} \nlabel:" +include: afrisenti +task: afrisenti_swa_prompt_3 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_tir.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_tir.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c0f96c24a290d33a3c16754f3c0412ac11cb285a --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_tir.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tir +doc_to_text: "You are an assistant able to detect sentiments in tweets. \n\nGiven\ + \ the sentiment labels Neutral, Positive or Negative; what is the sentiment of the\ + \ Tigrinya statement below? Return only the labels. 
\n\ntext: {{tweet}} \nlabel:" +include: afrisenti +task: afrisenti_tir_prompt_3 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_tso.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_tso.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8355035e963a13e2da541c65d4f778c5f0d46a58 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_tso.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tso +doc_to_text: "You are an assistant able to detect sentiments in tweets. \n\nGiven\ + \ the sentiment labels Neutral, Positive or Negative; what is the sentiment of the\ + \ Xithonga statement below? Return only the labels. \n\ntext: {{tweet}} \nlabel:" +include: afrisenti +task: afrisenti_tso_prompt_3 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_twi.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..98809176e9693cd65ec711fb81739fdbe0030e70 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_twi.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: "You are an assistant able to detect sentiments in tweets. \n\nGiven\ + \ the sentiment labels Neutral, Positive or Negative; what is the sentiment of the\ + \ Twi statement below? Return only the labels. \n\ntext: {{tweet}} \nlabel:" +include: afrisenti +task: afrisenti_twi_prompt_3 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_yor.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9d1b7ac324b28b7a64220b6b318aabf9537594fc --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_3/afrisenti_yor.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: "You are an assistant able to detect sentiments in tweets. 
\n\nGiven\ + \ the sentiment labels Neutral, Positive or Negative; what is the sentiment of the\ + \ Yoruba statement below? Return only the labels. \n\ntext: {{tweet}} \nlabel:" +include: afrisenti +task: afrisenti_yor_prompt_3 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_3/utils.py b/lm_eval/tasks/afrobench/afrisenti/prompt_3/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_3/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_3/xx.py b/lm_eval/tasks/afrobench/afrisenti/prompt_3/xx.py new file mode 100644 index 0000000000000000000000000000000000000000..2133cfa0139de116c3d54e6c3866c5e4c26bbc53 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_3/xx.py @@ -0,0 +1,5 @@ +from datasets import load_dataset + + +data = load_dataset("masakhane/afrisenti", "por", trust_remote_code=True) +print(data) diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti new file mode 100644 index 0000000000000000000000000000000000000000..6464d7b21693a1565f8479757a89a650cf84ff0c --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti @@ -0,0 +1,39 @@ +tag: + - afrobench_sentiment_tasks + - afrisenti_prompt_4 +dataset_path: masakhane/afrisenti +dataset_name: null +dataset_kwargs: {trust_remote_code: True} +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: train +doc_to_target: label +doc_to_choice: + - "negative" + - "positive" + - "neutral" +should_decontaminate: true +doc_to_decontamination_query: 'text: {{tweet}} \nlabel: ' +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + 
regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_amh.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8a30a72a45d38f6f4221a1e6ceaef93dd5472bbd --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_amh.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: "Label the following text as Neutral, Positive, or Negative. Provide\ + \ only the label as your response. \n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_amh_prompt_4 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_arq.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_arq.yaml new file mode 100644 index 0000000000000000000000000000000000000000..125771f5d6877585bb2b9a50121da7e5a56d7805 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_arq.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: arq +doc_to_text: "Label the following text as Neutral, Positive, or Negative. Provide\ + \ only the label as your response. \n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_arq_prompt_4 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_ary.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_ary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7868fbf3e6739cd69a4732a09b80cc08359ccbe8 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_ary.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ary +doc_to_text: "Label the following text as Neutral, Positive, or Negative. Provide\ + \ only the label as your response. 
\n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_ary_prompt_4 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_hau.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5e7e9a443e5926dfe4de5074e1416417d38b4447 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_hau.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: "Label the following text as Neutral, Positive, or Negative. Provide\ + \ only the label as your response. \n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_hau_prompt_4 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_ibo.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..686e16c29e4aad0ff13e160befeb31e5b25a7f54 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_ibo.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: "Label the following text as Neutral, Positive, or Negative. Provide\ + \ only the label as your response. \n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_ibo_prompt_4 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_kin.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7061ab63523b3b5cb34ec1bb0d35db11fdefd5d0 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_kin.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: "Label the following text as Neutral, Positive, or Negative. Provide\ + \ only the label as your response. 
\n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_kin_prompt_4 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_orm.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8f745ebb28e843d400ccba5973de312d841a2592 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_orm.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: "Label the following text as Neutral, Positive, or Negative. Provide\ + \ only the label as your response. \n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_orm_prompt_4 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_pcm.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_pcm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5071134ac6d1645650edef34ee6dde2d3bdabce --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_pcm.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: pcm +doc_to_text: "Label the following text as Neutral, Positive, or Negative. Provide\ + \ only the label as your response. \n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_pcm_prompt_4 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_por.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_por.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f5196bcf58303558d5a214caffde460b8675d08f --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_por.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: por +doc_to_text: "Label the following text as Neutral, Positive, or Negative. Provide\ + \ only the label as your response. 
\n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_por_prompt_4 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_swa.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..97b9e4f1a2fc87433bf511e6e511bd9f54284d5e --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_swa.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: "Label the following text as Neutral, Positive, or Negative. Provide\ + \ only the label as your response. \n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_swa_prompt_4 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_tir.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_tir.yaml new file mode 100644 index 0000000000000000000000000000000000000000..02dfca854e166ca3d96a570e8db126e646fbd5b5 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_tir.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: tir +doc_to_text: "Label the following text as Neutral, Positive, or Negative. Provide\ + \ only the label as your response. \n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_tir_prompt_4 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_tso.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_tso.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa83c1378d15b8c3cedd8582025aea8ea795bb07 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_tso.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: tso +doc_to_text: "Label the following text as Neutral, Positive, or Negative. Provide\ + \ only the label as your response. 
\n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_tso_prompt_4 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_twi.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b4366d1f27902651502a96adfc9c09fb8d82dfde --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_twi.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: "Label the following text as Neutral, Positive, or Negative. Provide\ + \ only the label as your response. \n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_twi_prompt_4 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_yor.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8394706cdfbfb1124d4c25c987901d5f338cd5f7 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_4/afrisenti_yor.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: "Label the following text as Neutral, Positive, or Negative. Provide\ + \ only the label as your response. 
\n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_yor_prompt_4 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_4/utils.py b/lm_eval/tasks/afrobench/afrisenti/prompt_4/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_4/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_4/xx.py b/lm_eval/tasks/afrobench/afrisenti/prompt_4/xx.py new file mode 100644 index 0000000000000000000000000000000000000000..4515053c4265ba0b6bc9afa9f876d20ef5fc5c2a --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_4/xx.py @@ -0,0 +1,5 @@ +from datasets import load_dataset + + +data = load_dataset("masakhane/afrisenti", "orm", trust_remote_code=True) +print(data) diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti new file mode 100644 index 0000000000000000000000000000000000000000..5107bb80d5333a462afda9a8efb62a6fd039a733 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti @@ -0,0 +1,39 @@ +tag: + - afrobench_sentiment_tasks + - afrisenti_prompt_5 +dataset_path: masakhane/afrisenti +dataset_name: null +dataset_kwargs: {trust_remote_code: True} +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: train +doc_to_target: label +doc_to_choice: + - "negative" + - "positive" + - "neutral" +should_decontaminate: true +doc_to_decontamination_query: 'Text: {{tweet}} \nlabel:' +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + 
regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_amh.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..866ffbe9fcab9e67a1ba4a9781dddd1ef60e8043 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_amh.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: "You are tasked with performing sentiment classification on the following\ + \ Amharic text. For each input, classify the sentiment as positive, negative, or\ + \ neutral. Use the following guidelines: \n\n Positive: The text expresses happiness,\ + \ satisfaction, or optimism. \nNegative: The text conveys disappointment, dissatisfaction,\ + \ or pessimism. \nNeutral: The text is factual, objective, or without strong emotional\ + \ undertones. \n\nIf the text contains both positive and negative sentiments, choose\ + \ the dominant sentiment. For ambiguous or unclear sentiments, select the label\ + \ that best reflects the overall tone. Please provide a single classification for\ + \ each input.\n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_amh_prompt_5 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_arq.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_arq.yaml new file mode 100644 index 0000000000000000000000000000000000000000..783785c031aab91f8840df8e70316ab1d4e24606 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_arq.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: arq +doc_to_text: "You are tasked with performing sentiment classification on the following\ + \ Algerian Arabic text. For each input, classify the sentiment as positive, negative,\ + \ or neutral. Use the following guidelines: \n\n Positive: The text expresses happiness,\ + \ satisfaction, or optimism. 
\nNegative: The text conveys disappointment, dissatisfaction,\ + \ or pessimism. \nNeutral: The text is factual, objective, or without strong emotional\ + \ undertones. \n\nIf the text contains both positive and negative sentiments, choose\ + \ the dominant sentiment. For ambiguous or unclear sentiments, select the label\ + \ that best reflects the overall tone. Please provide a single classification for\ + \ each input.\n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_arq_prompt_5 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_ary.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_ary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e601dc19ddd1bf960ee56e5345bbbcc8c6b84caf --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_ary.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: ary +doc_to_text: "You are tasked with performing sentiment classification on the following\ + \ Moroccan Arabic text. For each input, classify the sentiment as positive, negative,\ + \ or neutral. Use the following guidelines: \n\n Positive: The text expresses happiness,\ + \ satisfaction, or optimism. \nNegative: The text conveys disappointment, dissatisfaction,\ + \ or pessimism. \nNeutral: The text is factual, objective, or without strong emotional\ + \ undertones. \n\nIf the text contains both positive and negative sentiments, choose\ + \ the dominant sentiment. For ambiguous or unclear sentiments, select the label\ + \ that best reflects the overall tone. 
Please provide a single classification for\ + \ each input.\n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_ary_prompt_5 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_hau.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2ab2adc1aaaf15ce9dced631d20ec2efb428ace5 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_hau.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: "You are tasked with performing sentiment classification on the following\ + \ Hausa text. For each input, classify the sentiment as positive, negative, or neutral.\ + \ Use the following guidelines: \n\n Positive: The text expresses happiness, satisfaction,\ + \ or optimism. \nNegative: The text conveys disappointment, dissatisfaction, or\ + \ pessimism. \nNeutral: The text is factual, objective, or without strong emotional\ + \ undertones. \n\nIf the text contains both positive and negative sentiments, choose\ + \ the dominant sentiment. For ambiguous or unclear sentiments, select the label\ + \ that best reflects the overall tone. Please provide a single classification for\ + \ each input.\n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_hau_prompt_5 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_ibo.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dba7b17964cdc320243578f9e9249a00caea4a44 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_ibo.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: "You are tasked with performing sentiment classification on the following\ + \ Igbo text. 
For each input, classify the sentiment as positive, negative, or neutral.\ + \ Use the following guidelines: \n\n Positive: The text expresses happiness, satisfaction,\ + \ or optimism. \nNegative: The text conveys disappointment, dissatisfaction, or\ + \ pessimism. \nNeutral: The text is factual, objective, or without strong emotional\ + \ undertones. \n\nIf the text contains both positive and negative sentiments, choose\ + \ the dominant sentiment. For ambiguous or unclear sentiments, select the label\ + \ that best reflects the overall tone. Please provide a single classification for\ + \ each input.\n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_ibo_prompt_5 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_kin.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..16ea6f0c5734b4795acb06b5d854d5cb33b97c05 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_kin.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: "You are tasked with performing sentiment classification on the following\ + \ Kinyarwanda text. For each input, classify the sentiment as positive, negative,\ + \ or neutral. Use the following guidelines: \n\n Positive: The text expresses happiness,\ + \ satisfaction, or optimism. \nNegative: The text conveys disappointment, dissatisfaction,\ + \ or pessimism. \nNeutral: The text is factual, objective, or without strong emotional\ + \ undertones. \n\nIf the text contains both positive and negative sentiments, choose\ + \ the dominant sentiment. For ambiguous or unclear sentiments, select the label\ + \ that best reflects the overall tone. 
Please provide a single classification for\ + \ each input.\n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_kin_prompt_5 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_orm.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c61ea75ee6b448a7dcf30ecdfcef7328201379ae --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_orm.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: "You are tasked with performing sentiment classification on the following\ + \ Oromo text. For each input, classify the sentiment as positive, negative, or neutral.\ + \ Use the following guidelines: \n\n Positive: The text expresses happiness, satisfaction,\ + \ or optimism. \nNegative: The text conveys disappointment, dissatisfaction, or\ + \ pessimism. \nNeutral: The text is factual, objective, or without strong emotional\ + \ undertones. \n\nIf the text contains both positive and negative sentiments, choose\ + \ the dominant sentiment. For ambiguous or unclear sentiments, select the label\ + \ that best reflects the overall tone. Please provide a single classification for\ + \ each input.\n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_orm_prompt_5 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_pcm.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_pcm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6de78061a142a0bd80d69619a5eba4ce3756d616 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_pcm.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: pcm +doc_to_text: "You are tasked with performing sentiment classification on the following\ + \ Nigerian Pidgin text. For each input, classify the sentiment as positive, negative,\ + \ or neutral. 
Use the following guidelines: \n\n Positive: The text expresses happiness,\ + \ satisfaction, or optimism. \nNegative: The text conveys disappointment, dissatisfaction,\ + \ or pessimism. \nNeutral: The text is factual, objective, or without strong emotional\ + \ undertones. \n\nIf the text contains both positive and negative sentiments, choose\ + \ the dominant sentiment. For ambiguous or unclear sentiments, select the label\ + \ that best reflects the overall tone. Please provide a single classification for\ + \ each input.\n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_pcm_prompt_5 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_por.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_por.yaml new file mode 100644 index 0000000000000000000000000000000000000000..48b728d5dd38bb55632fd3fb22f37fbcbeb66eea --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_por.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: por +doc_to_text: "You are tasked with performing sentiment classification on the following\ + \ Mozambique Portuguese text. For each input, classify the sentiment as positive,\ + \ negative, or neutral. Use the following guidelines: \n\n Positive: The text expresses\ + \ happiness, satisfaction, or optimism. \nNegative: The text conveys disappointment,\ + \ dissatisfaction, or pessimism. \nNeutral: The text is factual, objective, or without\ + \ strong emotional undertones. \n\nIf the text contains both positive and negative\ + \ sentiments, choose the dominant sentiment. For ambiguous or unclear sentiments,\ + \ select the label that best reflects the overall tone. 
Please provide a single\ + \ classification for each input.\n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_por_prompt_5 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_swa.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fee357ab47703dea57fe9b78a2d46da0f0212874 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_swa.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: "You are tasked with performing sentiment classification on the following\ + \ Swahili text. For each input, classify the sentiment as positive, negative, or\ + \ neutral. Use the following guidelines: \n\n Positive: The text expresses happiness,\ + \ satisfaction, or optimism. \nNegative: The text conveys disappointment, dissatisfaction,\ + \ or pessimism. \nNeutral: The text is factual, objective, or without strong emotional\ + \ undertones. \n\nIf the text contains both positive and negative sentiments, choose\ + \ the dominant sentiment. For ambiguous or unclear sentiments, select the label\ + \ that best reflects the overall tone. Please provide a single classification for\ + \ each input.\n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_swa_prompt_5 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_tir.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_tir.yaml new file mode 100644 index 0000000000000000000000000000000000000000..47a67e1c9358682be41792e77ef9e85fb1735baf --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_tir.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: tir +doc_to_text: "You are tasked with performing sentiment classification on the following\ + \ Tigrinya text. For each input, classify the sentiment as positive, negative, or\ + \ neutral. 
Use the following guidelines: \n\n Positive: The text expresses happiness,\ + \ satisfaction, or optimism. \nNegative: The text conveys disappointment, dissatisfaction,\ + \ or pessimism. \nNeutral: The text is factual, objective, or without strong emotional\ + \ undertones. \n\nIf the text contains both positive and negative sentiments, choose\ + \ the dominant sentiment. For ambiguous or unclear sentiments, select the label\ + \ that best reflects the overall tone. Please provide a single classification for\ + \ each input.\n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_tir_prompt_5 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_tso.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_tso.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5f5705285e81a7d998dd77b44c9221697aaea435 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_tso.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: tso +doc_to_text: "You are tasked with performing sentiment classification on the following\ + \ Xithonga text. For each input, classify the sentiment as positive, negative, or\ + \ neutral. Use the following guidelines: \n\n Positive: The text expresses happiness,\ + \ satisfaction, or optimism. \nNegative: The text conveys disappointment, dissatisfaction,\ + \ or pessimism. \nNeutral: The text is factual, objective, or without strong emotional\ + \ undertones. \n\nIf the text contains both positive and negative sentiments, choose\ + \ the dominant sentiment. For ambiguous or unclear sentiments, select the label\ + \ that best reflects the overall tone. 
Please provide a single classification for\ + \ each input.\n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_tso_prompt_5 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_twi.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c0b4fe03da64d5f1940dc9aeebe29de2cea09227 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_twi.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: "You are tasked with performing sentiment classification on the following\ + \ Twi text. For each input, classify the sentiment as positive, negative, or neutral.\ + \ Use the following guidelines: \n\n Positive: The text expresses happiness, satisfaction,\ + \ or optimism. \nNegative: The text conveys disappointment, dissatisfaction, or\ + \ pessimism. \nNeutral: The text is factual, objective, or without strong emotional\ + \ undertones. \n\nIf the text contains both positive and negative sentiments, choose\ + \ the dominant sentiment. For ambiguous or unclear sentiments, select the label\ + \ that best reflects the overall tone. Please provide a single classification for\ + \ each input.\n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_twi_prompt_5 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_yor.yaml b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b96edb4104e85d01642b2ceca14fc59f2c296ecd --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_5/afrisenti_yor.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: "You are tasked with performing sentiment classification on the following\ + \ Yoruba text. For each input, classify the sentiment as positive, negative, or\ + \ neutral. 
Use the following guidelines: \n\n Positive: The text expresses happiness,\ + \ satisfaction, or optimism. \nNegative: The text conveys disappointment, dissatisfaction,\ + \ or pessimism. \nNeutral: The text is factual, objective, or without strong emotional\ + \ undertones. \n\nIf the text contains both positive and negative sentiments, choose\ + \ the dominant sentiment. For ambiguous or unclear sentiments, select the label\ + \ that best reflects the overall tone. Please provide a single classification for\ + \ each input.\n\ntext: {{tweet}} \nlabel: " +include: afrisenti +task: afrisenti_yor_prompt_5 diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_5/utils.py b/lm_eval/tasks/afrobench/afrisenti/prompt_5/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_5/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/afrisenti/prompt_5/xx.py b/lm_eval/tasks/afrobench/afrisenti/prompt_5/xx.py new file mode 100644 index 0000000000000000000000000000000000000000..375facffa5030cdde562a9ab4474193a9d45f597 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/prompt_5/xx.py @@ -0,0 +1,8 @@ +# data = load_dataset('HausaNLP/AfriSenti-Twitter', 'yor', trust_remote_code=True) +# print(data) + +import torch + + +print(torch.cuda.is_available()) # Should return True +print(torch.cuda.device_count()) diff --git a/lm_eval/tasks/afrobench/afrisenti/utils.py b/lm_eval/tasks/afrobench/afrisenti/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b5f9b74e2eb12db6a985c8428830933f8adcc936 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrisenti/utils.py @@ -0,0 +1,124 @@ +import argparse + +import yaml + + +class FunctionTag: + def __init__(self, value): + self.value = value + + +def prompt_func(mode, lang): + prompt_map = { + "prompt_1": "Does this statement; {{tweet}} have a Neutral, 
Positive or Negative sentiment? Labels only", + "prompt_2": f"Does this {lang} statement; " + "'{{tweet}}' have a Neutral, Positive or Negative sentiment? Labels only", + "prompt_3": f"You are an assistant able to detect sentiments in tweets. \n\n" + f"Given the sentiment labels Neutral, Positive or Negative; what is " + f"the sentiment of the {lang} statement below? Return only the labels. " + "\n\ntext: {{tweet}} \nlabel:", + "prompt_4": "Label the following text as Neutral, Positive, or Negative. Provide only the label as your " + "response. \n\ntext: {{tweet}} \nlabel: ", + "prompt_5": f"You are tasked with performing sentiment classification on the following {lang} text. " + f"For each input, classify the sentiment as positive, negative, or neutral. " + f"Use the following guidelines: \n\n " + f"Positive: The text expresses happiness, satisfaction, or optimism. \n" + f"Negative: The text conveys disappointment, dissatisfaction, or pessimism. \n" + f"Neutral: The text is factual, objective, or without strong emotional undertones. \n\n" + f"If the text contains both positive and negative sentiments, choose the dominant sentiment. " + f"For ambiguous or unclear sentiments, select the label that best reflects the overall tone. " + "Please provide a single classification for each input.\n\ntext: {{tweet}} \nlabel: ", + } + return prompt_map[mode] + + +def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: + """ + Generate a yaml file for each language. + + :param output_dir: The directory to output the files to. + :param overwrite: Whether to overwrite files if they already exist. 
+ """ + err = [] + languages = { + "amh": "Amharic", + "arq": "Algerian Arabic", + "ary": "Moroccan Arabic", + "hau": "Hausa", + "ibo": "Igbo", + "kin": "Kinyarwanda", + "orm": "Oromo", + "pcm": "Nigerian Pidgin", + "por": "Mozambique Portuguese", + "swa": "Swahili", + "tir": "Tigrinya", + "tso": "Xithonga", + "twi": "Twi", + "yor": "Yoruba", + } + for lang in languages.keys(): + try: + file_name = f"afrisenti_{lang}.yaml" + task_name = f"afrisenti_{lang}_{mode}" + yaml_template = "afrisenti" + if int(mode.split("_")[-1]) > 1: + yaml_details = { + "include": yaml_template, + "task": task_name, + "dataset_name": lang, + "doc_to_text": prompt_func(mode, languages[lang]), + } + else: + yaml_details = { + "include": yaml_template, + "task": task_name, + "dataset_name": lang, + } + with open( + f"{output_dir}/{mode}/{file_name}", + "w" if overwrite else "x", + encoding="utf8", + ) as f: + f.write("# Generated by utils.py\n") + yaml.dump( + yaml_details, + f, + allow_unicode=True, + ) + except FileExistsError: + err.append(file_name) + + if len(err) > 0: + raise FileExistsError( + "Files were not created because they already exist (use --overwrite flag):" + f" {', '.join(err)}" + ) + + +def main() -> None: + """Parse CLI args and generate language-specific yaml files.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--overwrite", + default=True, + action="store_true", + help="Overwrite files if they already exist", + ) + parser.add_argument( + "--output-dir", + default="./", + help="Directory to write yaml files to", + ) + parser.add_argument( + "--mode", + default="prompt_1", + choices=["prompt_1", "prompt_2", "prompt_3", "prompt_4", "prompt_5"], + help="Prompt number", + ) + args = parser.parse_args() + + gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite, mode=args.mode) + + +if __name__ == "__main__": + main() diff --git a/lm_eval/tasks/afrobench/afrobench-lite.yaml b/lm_eval/tasks/afrobench/afrobench-lite.yaml new file mode 100644 
index 0000000000000000000000000000000000000000..a23c050a2d09646492778e80dbc3a30dc281f580 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrobench-lite.yaml @@ -0,0 +1,15 @@ +group: afrobench_lite +task: + - afrimgsm_cot_tasks + - afrimmlu_tasks + - afrixnli_tasks + - belebele_tasks + - sib_tasks + - african_flores_tasks + - injongointent_tasks +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 1 diff --git a/lm_eval/tasks/afrobench/afrobench.yaml b/lm_eval/tasks/afrobench/afrobench.yaml new file mode 100644 index 0000000000000000000000000000000000000000..52234bef5cde6b5695fa6510019bcf37502ddd40 --- /dev/null +++ b/lm_eval/tasks/afrobench/afrobench.yaml @@ -0,0 +1,23 @@ +group: afrobench +task: +# - adr_tasks +## - afrihate_tasks #dataset not publicly available yet +# - afrimgsm_cot_tasks +# - afrixnli_tasks +# - afrobench_xqa_tasks +# - afrobench_sentiment_tasks + - afrobench_MT_tasks +# - afrobench_TC_tasks +# - afrobench_mmlu_tasks +# - injongointent_tasks +# - masakhaner_tasks +# - masakhapos_tasks +# - RC_tasks +# - uhura_arc_easy_tasks +# - xlsum_tasks +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 1 diff --git a/lm_eval/tasks/afrobench/belebele/README.md b/lm_eval/tasks/afrobench/belebele/README.md new file mode 100644 index 0000000000000000000000000000000000000000..10d46a44e2098e7e8aaadf57dbdfe5eb52156d7a --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/README.md @@ -0,0 +1,41 @@ +# + +## Paper +Title: `The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants` + +Paper Link: https://aclanthology.org/2023.emnlp-main.862/ + +## Abstract +>Belebele is a multiple-choice machine reading comprehension (MRC) dataset spanning 122 language variants. This dataset enables the evaluation of mono- and multi-lingual models in high-, medium-, and low-resource languages. 
Each question has four multiple-choice answers and is linked to a short passage from the FLORES-200 dataset. The human annotation procedure was carefully curated to create questions that discriminate between different levels of generalizable language comprehension and is reinforced by extensive quality checks. While all questions directly relate to the passage, the English dataset on its own proves difficult enough to challenge state-of-the-art language models. Being fully parallel, this dataset enables direct comparison of model performance across all languages. Belebele opens up new avenues for evaluating and analyzing the multilingual abilities of language models and NLP systems. + +HomePage: https://github.com/facebookresearch/belebele + +### Citation + +``` +@inproceedings{bandarkar-etal-2024-belebele, + title = "The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants", + author = "Bandarkar, Lucas and + Liang, Davis and + Muller, Benjamin and + Artetxe, Mikel and + Shukla, Satya Narayan and + Husa, Donald and + Goyal, Naman and + Krishnan, Abhinandan and + Zettlemoyer, Luke and + Khabsa, Madian", + editor = "Ku, Lun-Wei and + Martins, Andre and + Srikumar, Vivek", + booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)", + month = aug, + year = "2024", + address = "Bangkok, Thailand", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2024.acl-long.44/", + doi = "10.18653/v1/2024.acl-long.44", + pages = "749--775", + abstract = "We present Belebele, a multiple-choice machine reading comprehension (MRC) dataset spanning 122 language variants. Significantly expanding the language coverage of natural language understanding (NLU) benchmarks, this dataset enables the evaluation of text models in high-, medium-, and low-resource languages. 
Each question is based on a short passage from the FLORES-200 dataset and has four multiple-choice answers. The questions were carefully curated to discriminate between models with different levels of general language comprehension. The English dataset on its own proves difficult enough to challenge state-of-the-art language models. Being fully parallel, this dataset enables direct comparison of model performance across all languages. We use this dataset to evaluate the capabilities of multilingual masked language models (MLMs) and large language models (LLMs). We present extensive results and findings, notably that despite significant cross-lingual transfer in English-centric LLMs, much smaller MLMs pretrained on balanced multilingual data still understand far more languages. Overall, Belebele opens up new avenues for evaluating and analyzing the multilingual capabilities of NLP systems." +} +``` diff --git a/lm_eval/tasks/afrobench/belebele/belebele.yaml b/lm_eval/tasks/afrobench/belebele/belebele.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5c7d3a9dc4450ffbb152abd6eac2655c2cf2199c --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/belebele.yaml @@ -0,0 +1,13 @@ +group: belebele +task: + - belebele_prompt_1 + - belebele_prompt_2 + - belebele_prompt_3 + - belebele_prompt_4 + - belebele_prompt_5 +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele new file mode 100644 index 0000000000000000000000000000000000000000..51553e0e077d968e1fca29e27783b225ccaf7323 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele @@ -0,0 +1,23 @@ +tag: + - belebele_tasks + - belebele_prompt_1 + - RC_tasks +dataset_path: facebook/belebele +dataset_name: null +output_type: multiple_choice +test_split: test +fewshot_config: + sampler: first_n +doc_to_target: "{{['1', '2', 
'3', '4'].index(correct_answer_num)}}" +should_decontaminate: true +doc_to_decontamination_query: "{{question}}" +doc_to_choice: ["A", "B", "C", "D"] +metric_list: + - metric: acc + aggregation: mean + weight_by_size: true + - metric: acc_norm + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_afr.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_afr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e3a7c2b97208d5ac2ced4bbacc3192a290fa6ea2 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_afr.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: afr_Latn +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_afr_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_amh.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7ee55e8eb291cd47abc402f7c4323d2f95caee1e --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_amh.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: amh_Ethi +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_amh_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_ary.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_ary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..82f0d5230d4d24a77d381be67a22dd270255fea1 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_ary.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py 
+dataset_name: ary_Arab +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_ary_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_arz.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_arz.yaml new file mode 100644 index 0000000000000000000000000000000000000000..38f8c3edc09bd33823c0a221bcfe8a6d7d758d91 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_arz.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: arz_Arab +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_arz_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_bam.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_bam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f2bc2d49f78a094ad11711baee3df33923077c89 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_bam.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: bam_Latn +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_bam_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_eng.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ef1f0463d4c1b7c6661c570f00f9beb855cfd534 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_eng.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: eng_Latn +doc_to_text: 
'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_eng_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_fra.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5f2513826f2808d242ced3d0b7278878bb846649 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_fra.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: fra_Latn +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_fra_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_fuv.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_fuv.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b24422c03aa1f81b18bab14d8810c916afb8e367 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_fuv.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: fuv_Latn +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_fuv_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_gaz.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_gaz.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b999f4a85fab866f0cbc6ccab5bc78ddd7a65bc5 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_gaz.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: gaz_Latn +doc_to_text: 'P: {{flores_passage}} + + Q: 
{{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_gaz_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_hau.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..933e90b50653a19529277c30e0c940130d656528 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_hau.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: hau_Latn +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_hau_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_ibo.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa17935c3b331e347ddc13512b46c80e484e24d9 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_ibo.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: ibo_Latn +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_ibo_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_kea.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_kea.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ad535d498447fde8ee74423a80858d42a85f558c --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_kea.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: kea_Latn +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: 
{{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_kea_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_kin.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..de957a59b3cb665760f8c36c5968ce75c5b4271c --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_kin.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: kin_Latn +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_kin_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_lin.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b3247f065c8f5137f52af8965f18a2a23e942b64 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_lin.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: lin_Latn +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_lin_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_lug.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8b2ef7a14b8b1f566b1a55b6e8e02e12b327becb --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_lug.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: lug_Latn +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} 
+ + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_lug_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_luo.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_luo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b667c1d66cff764f6d57cfa6ff698796b608e446 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_luo.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: luo_Latn +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_luo_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_nya.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_nya.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c220c7738f369cc8255efc2b96c4d9654a5c33f7 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_nya.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: nya_Latn +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_nya_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_plt.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_plt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d5c286e8517e6348798dc9c7807b1aec687c9ad1 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_plt.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: plt_Latn +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: 
{{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_plt_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_por.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_por.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ceba2310c5e483727eb5f4386058b666cd49620c --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_por.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: por_Latn +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_por_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_sna.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eec0b1e19fe1bdd53a5add1d16b6b9a89e12a213 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_sna.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: sna_Latn +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_sna_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_som.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_som.yaml new file mode 100644 index 0000000000000000000000000000000000000000..24af555865acd5d2f8c48c5aa2814e401891ccd4 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_som.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: som_Latn +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the 
correct answer from the options above:' +include: belebele +task: belebele_som_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_sot.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..10cde5be846773e764b762157bc528a5acf3fc1f --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_sot.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: sot_Latn +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_sot_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_ssw.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_ssw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..032b4629b569fc340c12fc701dbc694fd8631d95 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_ssw.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: ssw_Latn +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_ssw_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_swa.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c4ae7b786261ef4e7b48bc35c077207a232bb31 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_swa.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: swh_Latn +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options 
above:' +include: belebele +task: belebele_swa_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_tir.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_tir.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9b62e848daa5776384c780acce0b270cf88607c5 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_tir.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: tir_Ethi +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_tir_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_tsn.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_tsn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..147a1c9857be7f9557759765a7a6d5d490564bf7 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_tsn.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: tsn_Latn +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_tsn_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_tso.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_tso.yaml new file mode 100644 index 0000000000000000000000000000000000000000..869c50153d289b01ee408656701d0fb14dcb679d --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_tso.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: tso_Latn +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: 
belebele_tso_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_wol.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1aed1e5ac7954377f0e594e5e2f3b8260d140c62 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_wol.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: wol_Latn +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_wol_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_xho.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..549560ac1d4cd2181750d6e587f355f743d52cef --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_xho.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: xho_Latn +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_xho_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_yor.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..70c55eba1c99673595f00b297237391dd767c74c --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_yor.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: yor_Latn +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_yor_prompt_1 diff --git 
a/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_zul.yaml b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..257396921dbe6be6b5954170511093b382e9ba9b --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_1/belebele_zul.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: zul_Latn +doc_to_text: 'P: {{flores_passage}} + + Q: {{question.strip()}} + + A: {{mc_answer1}} + + B: {{mc_answer2}} + + C: {{mc_answer3}} + + D: {{mc_answer4}} + + Please choose the correct answer from the options above:' +include: belebele +task: belebele_zul_prompt_1 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele new file mode 100644 index 0000000000000000000000000000000000000000..75f673a425116056c7516d0b7e8f54844f0c9716 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele @@ -0,0 +1,23 @@ +tag: + - belebele_tasks + - belebele_prompt_2 + - RC_tasks +dataset_path: facebook/belebele +dataset_name: null +output_type: multiple_choice +test_split: test +fewshot_config: + sampler: first_n +doc_to_target: "{{['1', '2', '3', '4'].index(correct_answer_num)}}" +should_decontaminate: true +doc_to_decontamination_query: "{{question}}" +doc_to_choice: ["A", "B", "C", "D"] +metric_list: + - metric: acc + aggregation: mean + weight_by_size: true + - metric: acc_norm + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_afr.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_afr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d7c7180959511b34bc8582976aa14f1b4327ad1a --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_afr.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: afr_Latn +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} + + 1: {{mc_answer1}} + + 2: 
{{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_afr_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_amh.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f8f95cac2a0f27d8a1699215855167bffe3b38a0 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_amh.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: amh_Ethi +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} + + 1: {{mc_answer1}} + + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_amh_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_ary.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_ary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..12a784902ad61589942794fe4661375154909999 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_ary.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: ary_Arab +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} + + 1: {{mc_answer1}} + + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_ary_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_arz.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_arz.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a975b485cd567a5f574add79bc5651e58d533219 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_arz.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: arz_Arab +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} + + 1: {{mc_answer1}} + + 
2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_arz_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_bam.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_bam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..814d32b5483290ef00380b30ce0797492f696497 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_bam.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: bam_Latn +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} + + 1: {{mc_answer1}} + + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_bam_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_eng.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..510f1fbb0d2070f3c547fbe76f3c5a69e4ba31f0 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_eng.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: eng_Latn +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} + + 1: {{mc_answer1}} + + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_eng_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_fra.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..157433727099d0bc545ddb1e821aa80fbb37f3a1 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_fra.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: fra_Latn +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} + + 1: {{mc_answer1}} + 
+ 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_fra_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_fuv.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_fuv.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3bf02ff0fa62983d5e4c9c8e63129b1e40da333a --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_fuv.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: fuv_Latn +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} + + 1: {{mc_answer1}} + + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_fuv_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_gaz.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_gaz.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bc2b2704e382ea4eaa32276e369ad43b8fa298b1 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_gaz.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: gaz_Latn +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} + + 1: {{mc_answer1}} + + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_gaz_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_hau.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7af70e03fa6ab6f868205f2ad2b5abc4727a0dfb --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_hau.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: hau_Latn +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} + + 1: {{mc_answer1}} 
+ + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_hau_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_ibo.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..92d895803b9927d2a8288a7b9ff4493f23476c77 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_ibo.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: ibo_Latn +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} + + 1: {{mc_answer1}} + + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_ibo_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_kea.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_kea.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7f1dcf9117db0c0ef3065975c7bce8abab626ae1 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_kea.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: kea_Latn +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} + + 1: {{mc_answer1}} + + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_kea_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_kin.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e686e477dca84fe5fa1c666c553a0ac1c0efe1b5 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_kin.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: kin_Latn +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} + + 1: 
{{mc_answer1}} + + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_kin_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_lin.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..544eb9ddec49e0fbca3c29d56a3108db039d8979 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_lin.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: lin_Latn +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} + + 1: {{mc_answer1}} + + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_lin_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_lug.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fe97881b310795720928c4c43241c076297903c1 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_lug.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: lug_Latn +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} + + 1: {{mc_answer1}} + + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_lug_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_luo.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_luo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d7bdde48a01bd9a08b75961f43bff0438889228c --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_luo.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: luo_Latn +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} + + 
1: {{mc_answer1}} + + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_luo_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_nya.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_nya.yaml new file mode 100644 index 0000000000000000000000000000000000000000..212c0635342943d90922467829854dd051c72298 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_nya.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: nya_Latn +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} + + 1: {{mc_answer1}} + + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_nya_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_plt.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_plt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..57e71ac9c186e22ac235f85781ffb6400e41eb11 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_plt.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: plt_Latn +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} + + 1: {{mc_answer1}} + + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_plt_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_por.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_por.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f9be02a8e500875dfc2f5b20c1d0f6d3767dce93 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_por.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: por_Latn +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} + 
+ 1: {{mc_answer1}} + + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_por_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_sna.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8a5ad43a21f7a2bd32933b47e9d5aa10a623cff3 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_sna.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: sna_Latn +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} + + 1: {{mc_answer1}} + + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_sna_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_som.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_som.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d551d1d5c501ddc309b37b8688898359e44ecc3e --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_som.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: som_Latn +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} + + 1: {{mc_answer1}} + + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_som_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_sot.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..18780c9017658355fc160320784f848867bf03cc --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_sot.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: sot_Latn +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} 
+ + 1: {{mc_answer1}} + + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_sot_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_ssw.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_ssw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e6bd0a69ac685b18d7d03eaa2deeb120913904ca --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_ssw.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: ssw_Latn +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} + + 1: {{mc_answer1}} + + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_ssw_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_swa.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9bdfd132cd8ac856e157d2dd502f7f030e95c0c4 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_swa.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: swh_Latn +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} + + 1: {{mc_answer1}} + + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_swa_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_tir.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_tir.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1fba28cba714edf37a5169da7ba427543b1dd8e2 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_tir.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: tir_Ethi +doc_to_text: 'Passage: {{flores_passage}} + + Question: 
{{question.strip()}} + + 1: {{mc_answer1}} + + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_tir_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_tsn.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_tsn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..337e08ceab543cad55a4ca0206bbf70590422642 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_tsn.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: tsn_Latn +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} + + 1: {{mc_answer1}} + + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_tsn_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_tso.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_tso.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3a0e24e4a4118e26150288d0f6842ec2baeec12a --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_tso.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: tso_Latn +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} + + 1: {{mc_answer1}} + + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_tso_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_wol.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..439148106cf268746b7e41695a752a9ba8422a6c --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_wol.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: wol_Latn +doc_to_text: 'Passage: {{flores_passage}} + + 
Question: {{question.strip()}} + + 1: {{mc_answer1}} + + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_wol_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_xho.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2077614892f77249687bf7b8889a211212325e83 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_xho.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: xho_Latn +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} + + 1: {{mc_answer1}} + + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_xho_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_yor.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f9684f05db11c39ea1e000f1b31ce04b3693cc01 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_yor.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: yor_Latn +doc_to_text: 'Passage: {{flores_passage}} + + Question: {{question.strip()}} + + 1: {{mc_answer1}} + + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_yor_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_zul.yaml b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c81180e17c7f71799e36536a4abcdb707ef7652d --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_2/belebele_zul.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: zul_Latn +doc_to_text: 'Passage: {{flores_passage}} + 
+ Question: {{question.strip()}} + + 1: {{mc_answer1}} + + 2: {{mc_answer2}} + + 3: {{mc_answer3}} + + 4: {{mc_answer4}} + + Please select the correct answer from the given choices:' +include: belebele +task: belebele_zul_prompt_2 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele new file mode 100644 index 0000000000000000000000000000000000000000..a27ea5fb3a06cf7d949c6bc464c721a697e8d981 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele @@ -0,0 +1,23 @@ +tag: + - belebele_tasks + - belebele_prompt_3 + - RC_tasks +dataset_path: facebook/belebele +dataset_name: null +output_type: multiple_choice +test_split: test +fewshot_config: + sampler: first_n +doc_to_target: "{{['1', '2', '3', '4'].index(correct_answer_num)}}" +should_decontaminate: true +doc_to_decontamination_query: "{{question}}" +doc_to_choice: ["A", "B", "C", "D"] +metric_list: + - metric: acc + aggregation: mean + weight_by_size: true + - metric: acc_norm + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_afr.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_afr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8c296cc93f26c6bb91bb7a844b22c8827ebc57fe --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_afr.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: afr_Latn +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_afr_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_amh.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_amh.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..0e6f2fd77b913715ca3d24c8cf209a46dca1398b --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_amh.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: amh_Ethi +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_amh_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_ary.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_ary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..203bf1c9239b45803f91c601047567aaa01c8ed9 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_ary.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: ary_Arab +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_ary_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_arz.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_arz.yaml new file mode 100644 index 0000000000000000000000000000000000000000..97f13672f57f52613728a862805273451e78b5c7 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_arz.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: arz_Arab +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_arz_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_bam.yaml 
b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_bam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9b5d3415a3008831a38f2f6e3abb09eb79ba072e --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_bam.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: bam_Latn +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_bam_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_eng.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ceb5270ec60454b3322a5058307bfce92caa3f8d --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_eng.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: eng_Latn +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_eng_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_fra.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..affc5d12fd30c95380804fdeb538a50fd6bcf582 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_fra.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: fra_Latn +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_fra_prompt_3 
diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_fuv.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_fuv.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8ff7bfdad8e19ae4d4f1bc3a94e6158721243b88 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_fuv.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: fuv_Latn +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_fuv_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_gaz.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_gaz.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c067e7c1972edd8109312d4a313fa81d62f3effc --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_gaz.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: gaz_Latn +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_gaz_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_hau.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..689724b4f8593af87018cef13c7c596868df7c2b --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_hau.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: hau_Latn +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct 
option from the list above:' +include: belebele +task: belebele_hau_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_ibo.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c5eaacad2b56cae67a575640a79df06e87624156 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_ibo.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: ibo_Latn +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_ibo_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_kea.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_kea.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c24b2ae7a28c976c7d3980ad86b61148aa0d1635 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_kea.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: kea_Latn +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_kea_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_kin.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ae0a821fc67dd16965c9e46dc0f8724c508dc7bb --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_kin.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: kin_Latn +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: 
{{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_kin_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_lin.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..93e1a5b5438d97012207f12d9bc88a639252644f --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_lin.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: lin_Latn +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_lin_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_lug.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..724947d41b746ee2507c1340003dbda425c89ddb --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_lug.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: lug_Latn +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_lug_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_luo.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_luo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..21b4b8f730f3f24abaf93b0a71badee8f6c16819 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_luo.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: luo_Latn +doc_to_text: 'Context: {{flores_passage}} + + Query: 
{{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_luo_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_nya.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_nya.yaml new file mode 100644 index 0000000000000000000000000000000000000000..db045f723e91ed1e04317ad29ff7c7ee5bafa274 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_nya.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: nya_Latn +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_nya_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_plt.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_plt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..946e417946c1c066f2c552d42e266fac048bf795 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_plt.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: plt_Latn +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_plt_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_por.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_por.yaml new file mode 100644 index 0000000000000000000000000000000000000000..72ca651b8c251e1dfd304c9e6312b8230c2e2b1a --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_por.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py 
+dataset_name: por_Latn +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_por_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_sna.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2f5d810ac1ec1bca42f68bf73da7a9ba10a2315a --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_sna.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: sna_Latn +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_sna_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_som.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_som.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3d3a7c4e46094d0eaa70553ecf1c6ff95b557379 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_som.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: som_Latn +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_som_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_sot.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3db32d81ad3f709a2889da57042411ce00c8f294 --- /dev/null +++ 
b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_sot.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: sot_Latn +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_sot_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_ssw.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_ssw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..888ecf8423b80a3ac57980a91658c0f0c6a83254 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_ssw.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: ssw_Latn +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_ssw_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_swa.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ec8127aae7cb28ddcfae818b80c6d58bb71c70fb --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_swa.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: swh_Latn +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_swa_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_tir.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_tir.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..ab3545445999b415cc5a81864672db07f4cf548d --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_tir.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: tir_Ethi +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_tir_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_tsn.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_tsn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..019a95fe9a49e4c3398d01d84af7fcfc72cf1214 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_tsn.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: tsn_Latn +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_tsn_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_tso.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_tso.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fcc97c4f09bc6abc5129d37ef2dabbbbcd6b71b8 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_tso.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: tso_Latn +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_tso_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_wol.yaml 
b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..20af7b3c57d384f4f63461f47240d9e4cadb91e5 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_wol.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: wol_Latn +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_wol_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_xho.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a205da905597abd079e2a8131b37e63c2d1a13f6 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_xho.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: xho_Latn +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_xho_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_yor.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cdcbb8c244275b55670b542247d8db79c6b3bf54 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_yor.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: yor_Latn +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_yor_prompt_3 
diff --git a/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_zul.yaml b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..da1ef4239b68c83c5ceaa1988a85329275db97fb --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_3/belebele_zul.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: zul_Latn +doc_to_text: 'Context: {{flores_passage}} + + Query: {{question.strip()}} + + Option A: {{mc_answer1}} + + Option B: {{mc_answer2}} + + Option C: {{mc_answer3}} + + Option D: {{mc_answer4}} + + Please indicate the correct option from the list above:' +include: belebele +task: belebele_zul_prompt_3 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele new file mode 100644 index 0000000000000000000000000000000000000000..cc28101b1072e9ef48202c143592df0ff2f8286b --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele @@ -0,0 +1,23 @@ +tag: + - belebele_tasks + - belebele_prompt_4 + - RC_tasks +dataset_path: facebook/belebele +dataset_name: null +output_type: multiple_choice +test_split: test +fewshot_config: + sampler: first_n +doc_to_target: "{{['1', '2', '3', '4'].index(correct_answer_num)}}" +should_decontaminate: true +doc_to_decontamination_query: "{{question}}" +doc_to_choice: ["A", "B", "C", "D"] +metric_list: + - metric: acc + aggregation: mean + weight_by_size: true + - metric: acc_norm + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_afr.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_afr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..325cb85f391fb709081167cb70ea97d5664d2c86 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_afr.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: afr_Latn +doc_to_text: '{{flores_passage}} + + Based on the above 
passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_afr_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_amh.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..02eb0683fb6fdd1a603f3c7d864f58a3ada9c458 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_amh.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: amh_Ethi +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_amh_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_ary.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_ary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c7899d23cf3bb34c56bb5c1b866397deb6d96be --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_ary.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: ary_Arab +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_ary_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_arz.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_arz.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5acc3222b7bae84c471027e21afb2b6930ac7848 --- /dev/null +++ 
b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_arz.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: arz_Arab +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_arz_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_bam.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_bam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..466dddff4989fe2ba2e26959fd3f47d40ebba425 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_bam.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: bam_Latn +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_bam_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_eng.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..21dfa3ea83d46444712c98657b68cd2d4416ba1c --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_eng.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: eng_Latn +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_eng_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_fra.yaml 
b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c7fea6f1ba3e5792d2ebe755111c5d924ad07999 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_fra.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: fra_Latn +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_fra_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_fuv.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_fuv.yaml new file mode 100644 index 0000000000000000000000000000000000000000..77fa7798b1b079a65146976e0ecbf68ce94504cd --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_fuv.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: fuv_Latn +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_fuv_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_gaz.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_gaz.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a9e54eb9b9a74a929ecc7fff7313bb2783f7a8d6 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_gaz.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: gaz_Latn +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + 
Please provide the correct answer from the choices given:' +include: belebele +task: belebele_gaz_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_hau.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45dbfc5730a25b7945cc06efb258da867f86c690 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_hau.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: hau_Latn +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_hau_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_ibo.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eb58d8a079a7c0fd1fbeb30976361190d91f067f --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_ibo.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: ibo_Latn +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_ibo_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_kea.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_kea.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c8ce83009df730072f0d40dd8090d103c2825a9d --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_kea.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: kea_Latn +doc_to_text: '{{flores_passage}} + + Based on the 
above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_kea_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_kin.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..028de73a190394f6c7c7de22f24060294cf1da3d --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_kin.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: kin_Latn +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_kin_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_lin.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..95cad4e2f44fc85e0dd9276c0024de0f1d2be617 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_lin.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: lin_Latn +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_lin_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_lug.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4e7b6a20c34b08afb2d6f569a07a275203e48ae1 --- /dev/null +++ 
b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_lug.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: lug_Latn +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_lug_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_luo.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_luo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ce5ec04ac9e580585168a7a26b7f180ccba450f9 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_luo.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: luo_Latn +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_luo_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_nya.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_nya.yaml new file mode 100644 index 0000000000000000000000000000000000000000..26d2f699da8daad91726627cd37a6a8d92965faf --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_nya.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: nya_Latn +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_nya_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_plt.yaml 
b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_plt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ffdf1460a464cdc0a14459b8d48178b4246e6ed4 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_plt.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: plt_Latn +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_plt_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_por.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_por.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e8c06382b61eaec15b027533a5553c77d2974180 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_por.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: por_Latn +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_por_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_sna.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3869a67959fbff48a93174161750246ee0ccc510 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_sna.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: sna_Latn +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + 
Please provide the correct answer from the choices given:' +include: belebele +task: belebele_sna_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_som.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_som.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6d7be50ca6bd926924a79dc261156b7d00775966 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_som.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: som_Latn +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_som_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_sot.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ec30bccc32adc5a5d7a7d6789a4dfb3941d3cc4a --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_sot.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: sot_Latn +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_sot_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_ssw.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_ssw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..510e7b8f2a6d7528fc8162f00bdbaa7b6a89ac4a --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_ssw.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: ssw_Latn +doc_to_text: '{{flores_passage}} + + Based on the 
above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_ssw_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_swa.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..afbdcad238a03bf388dd4ddb070f159cf41782ab --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_swa.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: swh_Latn +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_swa_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_tir.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_tir.yaml new file mode 100644 index 0000000000000000000000000000000000000000..827f1f3614be4b1945bc8495ef52729b6c0778c0 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_tir.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: tir_Ethi +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_tir_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_tsn.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_tsn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a8f0a28d86b665bc913b1760dc378e8e4bf4146d --- /dev/null +++ 
b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_tsn.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: tsn_Latn +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_tsn_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_tso.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_tso.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4f1a87faa18a1af4729eca19d50b6a86bda83771 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_tso.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: tso_Latn +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_tso_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_wol.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0e0f6a629eea7632d7879943813fbd8964de5b8f --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_wol.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: wol_Latn +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_wol_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_xho.yaml 
b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c3510a4d82e5975dde9272cd83e856780d7a3766 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_xho.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: xho_Latn +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_xho_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_yor.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..526e24ef92ecfbada52046a0d72e878ef939e86c --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_yor.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: yor_Latn +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + Please provide the correct answer from the choices given:' +include: belebele +task: belebele_yor_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_zul.yaml b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7472e5213b9a5a016818e89eaf901611423131b9 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_4/belebele_zul.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: zul_Latn +doc_to_text: '{{flores_passage}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{mc_answer1}} + + B) {{mc_answer2}} + + C) {{mc_answer3}} + + D) {{mc_answer4}} + + 
Please provide the correct answer from the choices given:' +include: belebele +task: belebele_zul_prompt_4 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele new file mode 100644 index 0000000000000000000000000000000000000000..0d85bf5172c2e8b9408448196191f3b7d40367a0 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele @@ -0,0 +1,23 @@ +tag: + - belebele_tasks + - belebele_prompt_5 + - RC_tasks +dataset_path: facebook/belebele +dataset_name: null +output_type: multiple_choice +test_split: test +fewshot_config: + sampler: first_n +doc_to_target: "{{['1', '2', '3', '4'].index(correct_answer_num)}}" +should_decontaminate: true +doc_to_decontamination_query: "{{question}}" +doc_to_choice: ["A", "B", "C", "D"] +metric_list: + - metric: acc + aggregation: mean + weight_by_size: true + - metric: acc_norm + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_afr.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_afr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..01a724719757a2655800615e982a1ff1272dc438 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_afr.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: afr_Latn +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. 
{{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_afr_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_amh.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f707d7c38153cd8304f7f02e331dc00858cb59c3 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_amh.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: amh_Ethi +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. {{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_amh_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_ary.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_ary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2cf68405132141da73c1c4c0085bffa47c6aab41 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_ary.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: ary_Arab +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. 
{{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_ary_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_arz.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_arz.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c0314a96d451eb0cbbe04ef2036fba63d7927f5 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_arz.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: arz_Arab +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. {{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_arz_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_bam.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_bam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..704c41a5ec8a68081ac33ea39b72f962e75e130f --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_bam.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: bam_Latn +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. 
{{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_bam_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_eng.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..62617bf1e3266c46897623097836dbc3337add03 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_eng.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: eng_Latn +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. {{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_eng_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_fra.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..05131046414169281a6bdb6046cbfef3f461939f --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_fra.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: fra_Latn +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. 
{{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_fra_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_fuv.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_fuv.yaml new file mode 100644 index 0000000000000000000000000000000000000000..35103b5c19223a0a11e6a595b3f28fa3635455e9 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_fuv.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: fuv_Latn +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. {{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_fuv_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_gaz.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_gaz.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3822a5886fc0a179e95140208ba7751306c09619 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_gaz.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: gaz_Latn +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. 
{{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_gaz_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_hau.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9a0a53114456c6223050d8a444052e2a15b3e2aa --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_hau.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: hau_Latn +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. {{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_hau_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_ibo.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f5a8e29bcd708791b528111da1b3301586825561 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_ibo.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: ibo_Latn +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. 
{{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_ibo_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_kea.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_kea.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45fb47ad9854a75127428774a10dbd6eb12d83a9 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_kea.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: kea_Latn +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. {{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_kea_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_kin.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8bd9a07b8853165e8b5022a2d110cd450f2708ce --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_kin.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: kin_Latn +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. 
{{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_kin_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_lin.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ff6493b711b0126ed2b32ad0d7d7f668c5c71482 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_lin.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: lin_Latn +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. {{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_lin_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_lug.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1b64c68ba1b3a8aa1e026cb191dcf96873285d40 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_lug.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: lug_Latn +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. 
{{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_lug_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_luo.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_luo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f81859aae5913c79df2bb245fe48016dba0920a6 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_luo.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: luo_Latn +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. {{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_luo_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_nya.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_nya.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c957760af620a7bf10726fb9611baa5758d1a03d --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_nya.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: nya_Latn +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. 
{{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_nya_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_plt.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_plt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..baad68ab37cf0a5c5d34f9c16a0fdfaa473e5968 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_plt.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: plt_Latn +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. {{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_plt_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_por.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_por.yaml new file mode 100644 index 0000000000000000000000000000000000000000..13b4e63948d0bc6ba9c886490a8585d2339bd357 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_por.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: por_Latn +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. 
{{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_por_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_sna.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fd4fc080074beb21801196d5df19ea59e6928b4a --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_sna.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: sna_Latn +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. {{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_sna_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_som.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_som.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3dfa40665cc2874333872503f880c4c373c03b52 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_som.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: som_Latn +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. 
{{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_som_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_sot.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c78c862a08e3e6e6968d9e3e039a6cd67c99e978 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_sot.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: sot_Latn +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. {{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_sot_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_ssw.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_ssw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d2e8b96f93a43c64e36d0f2b8199524344511b70 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_ssw.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: ssw_Latn +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. 
{{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_ssw_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_swa.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5a44af344142293043db80d9f55140569b7fdebf --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_swa.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: swh_Latn +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. {{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_swa_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_tir.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_tir.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4ef9af2ade11b49af64195a00811be7bf69b34d7 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_tir.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: tir_Ethi +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. 
{{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_tir_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_tsn.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_tsn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0de5669b2a6031c7a5960bc174951d99cdc02502 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_tsn.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: tsn_Latn +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. {{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_tsn_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_tso.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_tso.yaml new file mode 100644 index 0000000000000000000000000000000000000000..92def0f429f0c7517c8904a8a2ae86cc2f534653 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_tso.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: tso_Latn +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. 
{{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_tso_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_wol.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..10192b8a69a44faf3b155a16287d3da15b0c4c5f --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_wol.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: wol_Latn +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. {{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_wol_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_xho.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9ea12584e13ae4b4e19ac5b9c1fcc3b54ae0f950 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_xho.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: xho_Latn +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. 
{{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_xho_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_yor.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c69e05cee5d26e91892e9303ad09f06856c254d1 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_yor.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: yor_Latn +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. {{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_yor_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_zul.yaml b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c3c6905f9803bba171b95b48c540f730d16158bb --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/prompt_5/belebele_zul.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: zul_Latn +doc_to_text: 'Read the passage: {{flores_passage}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{mc_answer1}} + + B. {{mc_answer2}} + + C. {{mc_answer3}} + + D. 
{{mc_answer4}} + + Please choose the correct option from the above list:' +include: belebele +task: belebele_zul_prompt_5 diff --git a/lm_eval/tasks/afrobench/belebele/utils.py b/lm_eval/tasks/afrobench/belebele/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..7654a6cfe7974b352446e8c71c5740fb9e45f9f2 --- /dev/null +++ b/lm_eval/tasks/afrobench/belebele/utils.py @@ -0,0 +1,155 @@ +import argparse +import os + +import yaml + + +def prompt_func(mode, lang): + prompt_map = { + "prompt_1": "P: {{flores_passage}}\nQ: {{question.strip()}}\nA: {{mc_answer1}}\nB: {{mc_answer2}}\nC: {{mc_answer3}}\nD: {{mc_answer4}}\nPlease choose the correct answer from the options above:", + "prompt_2": "Passage: {{flores_passage}}\nQuestion: {{question.strip()}}\n1: {{mc_answer1}}\n2: {{mc_answer2}}\n3: {{mc_answer3}}\n4: {{mc_answer4}}\nPlease select the correct answer from the given choices:", + "prompt_3": "Context: {{flores_passage}}\nQuery: {{question.strip()}}\nOption A: {{mc_answer1}}\nOption B: {{mc_answer2}}\nOption C: {{mc_answer3}}\nOption D: {{mc_answer4}}\nPlease indicate the correct option from the list above:", + "prompt_4": "{{flores_passage}}\nBased on the above passage, answer the following question:\n{{question.strip()}}\nChoices:\nA) {{mc_answer1}}\nB) {{mc_answer2}}\nC) {{mc_answer3}}\nD) {{mc_answer4}}\nPlease provide the correct answer from the choices given:", + "prompt_5": "Read the passage: {{flores_passage}}\nThen answer the question: {{question.strip()}}\nOptions:\nA. {{mc_answer1}}\nB. {{mc_answer2}}\nC. {{mc_answer3}}\nD. {{mc_answer4}}\nPlease choose the correct option from the above list:", + } + return prompt_map[mode] + + +def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: + """ + Generate a yaml file for each language. + + :param output_dir: The directory to output the files to. + :param overwrite: Whether to overwrite files if they already exist. 
+ """ + err = [] + languages = { + "afr": "Afrikaans", + "amh": "Amharic", + "ary": "Moroccan Arabic", + "arz": "Egyptian Arabic", + "bam": "Bambara", + "eng": "English", + "fra": "French", + "hau": "Hausa", + "ibo": "Igbo", + "lin": "Lingala", + "por": "Portuguese", + "sna": "Shona", + "swa": "Swahili", + "tir": "Tigrinya", + "tso": "Tsonga", + "tsn": "Tswana", + "wol": "Wolof", + "xho": "Xhosa", + "yor": "Yoruba", + "zul": "Zulu", + "ssw": "Swati", + "sot": "Southern Sotho", + "som": "Somali", + "plt": "Plateau Malagasy", + "nya": "Nyanja", + "luo": "Luo", + "lug": "Luganda", + "kin": "Kinyarwanda", + "kea": "Kabuverdianu", + "gaz": "Oromo", + "fuv": "Nigerian Fulfulde", + } + + lang_2_dataset_lang_code = { + "afr": "afr_Latn", + "amh": "amh_Ethi", + "ary": "ary_Arab", + "arz": "arz_Arab", + "bam": "bam_Latn", + "eng": "eng_Latn", + "fra": "fra_Latn", + "hau": "hau_Latn", + "ibo": "ibo_Latn", + "lin": "lin_Latn", + "por": "por_Latn", + "sna": "sna_Latn", + "swa": "swh_Latn", + "tir": "tir_Ethi", + "tso": "tso_Latn", + "tsn": "tsn_Latn", + "wol": "wol_Latn", + "xho": "xho_Latn", + "yor": "yor_Latn", + "zul": "zul_Latn", + "ssw": "ssw_Latn", + "sot": "sot_Latn", + "som": "som_Latn", + "plt": "plt_Latn", + "nya": "nya_Latn", + "luo": "luo_Latn", + "lug": "lug_Latn", + "kin": "kin_Latn", + "kea": "kea_Latn", + "gaz": "gaz_Latn", + "fuv": "fuv_Latn", + } + + for lang in languages.keys(): + try: + file_name = f"belebele_{lang}.yaml" + task_name = f"belebele_{lang}_{mode}" + yaml_template = "belebele" + yaml_details = { + "include": yaml_template, + "task": task_name, + "dataset_name": lang_2_dataset_lang_code[lang], + "doc_to_text": prompt_func(mode, languages[lang]), + } + file_path = os.path.join(output_dir, mode) + os.makedirs(file_path, exist_ok=True) + + with open( + f"{output_dir}/{mode}/{file_name}", + "w" if overwrite else "x", + encoding="utf8", + ) as f: + f.write("# Generated by utils.py\n") + yaml.dump( + yaml_details, + f, + allow_unicode=True, + ) + 
except FileExistsError: + err.append(file_name) + + if len(err) > 0: + raise FileExistsError( + "Files were not created because they already exist (use --overwrite flag):" + f" {', '.join(err)}" + ) + + +def main() -> None: + """Parse CLI args and generate language-specific yaml files.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--overwrite", + default=False, + action="store_true", + help="Overwrite files if they already exist", + ) + parser.add_argument( + "--output-dir", + default="./", + help="Directory to write yaml files to", + ) + parser.add_argument( + "--mode", + default="prompt_5", + choices=["prompt_1", "prompt_2", "prompt_3", "prompt_4", "prompt_5"], + help="Prompt number", + ) + args = parser.parse_args() + + gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite, mode=args.mode) + + +if __name__ == "__main__": + main() diff --git a/lm_eval/tasks/afrobench/flores/README.md b/lm_eval/tasks/afrobench/flores/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ccf433a9f884576ef412148ea67e1a07c86bea30 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/README.md @@ -0,0 +1,31 @@ +# FLORES + +## Paper +Title: `The FLORES-200 Evaluation Benchmark for Low-Resource and Multilingual Machine Translation` + +Paper Link: https://arxiv.org/abs/2207.04672 + +HomePage: https://huggingface.co/datasets/facebook/flores + +### Citation + +``` +@article{nllb2022, + author = {NLLB Team, Marta R. 
Costa-jussà, James Cross, Onur Çelebi, Maha Elbayad, Kenneth Heafield, Kevin Heffernan, Elahe Kalbassi, Janice Lam, Daniel Licht, Jean Maillard, Anna Sun, Skyler Wang, Guillaume Wenzek, Al Youngblood, Bapi Akula, Loic Barrault, Gabriel Mejia Gonzalez, Prangthip Hansanti, John Hoffman, Semarley Jarrett, Kaushik Ram Sadagopan, Dirk Rowe, Shannon Spruit, Chau Tran, Pierre Andrews, Necip Fazil Ayan, Shruti Bhosale, Sergey Edunov, Angela Fan, Cynthia Gao, Vedanuj Goswami, Francisco Guzmán, Philipp Koehn, Alexandre Mourachko, Christophe Ropers, Safiyyah Saleem, Holger Schwenk, Jeff Wang}, + title = {No Language Left Behind: Scaling Human-Centered Machine Translation}, + year = {2022} +} + +@inproceedings{goyal2021flores101, + title={The FLORES-101 Evaluation Benchmark for Low-Resource and Multilingual Machine Translation}, + author={Goyal, Naman and Gao, Cynthia and Chaudhary, Vishrav and Chen, Peng-Jen and Wenzek, Guillaume and Ju, Da and Krishnan, Sanjana and Ranzato, Marc'Aurelio and Guzm\'{a}n, Francisco and Fan, Angela}, + year={2021} +} + +@article{guzman2019nepali, + title={Two New Evaluation Datasets for Low-Resource Machine Translation: Nepali-English and Sinhala-English}, + author={Guzm\'{a}n, Francisco and Chen, Peng-Jen and Ott, Myle and Pino, Juan and Lample, Guillaume and Koehn, Philipp and Chaudhary, Vishrav and Ranzato, Marc'Aurelio}, + journal={arXiv preprint arXiv:1902.01382}, + year={2019} +} +``` diff --git a/lm_eval/tasks/afrobench/flores/flores.yaml b/lm_eval/tasks/afrobench/flores/flores.yaml new file mode 100644 index 0000000000000000000000000000000000000000..09b6e39274a4a686d20f742927b9a2740c2ef59f --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/flores.yaml @@ -0,0 +1,14 @@ +group: african_flores +task: + - flores_eng-afr_prompt_1 + - flores_eng-afr_prompt_2 + - flores_eng-afr_prompt_3 + - flores_afr-eng_prompt_1 + - flores_afr-eng_prompt_2 + - flores_afr-eng_prompt_3 +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 
1 diff --git a/lm_eval/tasks/afrobench/flores/gen_utils.py b/lm_eval/tasks/afrobench/flores/gen_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..37e22e13d6b024976e9198df78dfa7ae81845e8a --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/gen_utils.py @@ -0,0 +1,202 @@ +import argparse +import os + +import yaml + + +class FunctionTag: + def __init__(self, value): + self.value = value + + +def prompt_func(mode, lang, lang_dict): + language_column_name = f"sentence_{lang}" + prompt_map = { + "prompt_1": f"{lang_dict[lang]}: {{{{{language_column_name}}}}} \nEnglish: ", + "prompt_1_reverse": f"English: {{{{sentence_eng_Latn}}}} \n{lang_dict[lang]}: ", + "prompt_2": f"You are a translation expert. Translate the following {lang_dict[lang]} sentences to English \n" + f"{lang_dict[lang]}: {{{{{language_column_name}}}}}\nEnglish: ", + "prompt_2_reverse": f"You are a translation expert. Translate the following English sentences to " + f"{lang_dict[lang]} " + "\nEnglish: {{sentence_eng_Latn}} " + f"\n{lang_dict[lang]}: ", + "prompt_3": f"As a {lang_dict[lang]} and English linguist, translate the following {lang_dict[lang]} sentences " + f"to English \n{lang_dict[lang]}: {{{{{language_column_name}}}}}\nEnglish: ", + "prompt_3_reverse": f"As a {lang_dict[lang]} and English linguist, translate the following English sentences to " + f"{lang_dict[lang]} " + "\nEnglish: {{sentence_eng_Latn}} " + f"\n{lang_dict[lang]}: ", + } + return prompt_map[mode] + + +def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str, reverse: bool) -> None: + """ + Generate a yaml file for each language. + + :param output_dir: The directory to output the files to. + :param overwrite: Whether to overwrite files if they already exist. 
+ """ + err = [] + languages = { + "ace_Latn": "Acehnese (Latin script)", + "ace_Arab": "Acehnese (Arabic script)", + "acq_Arab": "Ta’izzi-Adeni Arabic", + "aeb_Arab": "Tunisian Arabic", + "afr_Latn": "Afrikaans", + "aka_Latn": "Akan", + "amh_Ethi": "Amharic", + "ary_Arab": "Moroccan Arabic", + "arz_Arab": "Egyptian Arabic", + "bam_Latn": "Bambara", + "ban_Latn": "Balinese", + "bem_Latn": "Bemba", + "cjk_Latn": "Chokwe", + "dik_Latn": "Southwestern Dinka", + "dyu_Latn": "Dyula", + "ewe_Latn": "Ewe", + "fon_Latn": "Fon", + "fra_Latn": "French", + "fuv_Latn": "Nigerian Fulfulde", + "hau_Latn": "Hausa", + "ibo_Latn": "Igbo", + "kab_Latn": "Kabyle", + "kam_Latn": "Kamba", + "knc_Arab": "Central Kanuri (Arabic script)", + "knc_Latn": "Central Kanuri (Latin script)", + "kbp_Latn": "Kabiyè", + "kea_Latn": "Kabuverdianu", + "kik_Latn": "Kikuyu", + "kin_Latn": "Kinyarwanda", + "kmb_Latn": "Kimbundu", + "kon_Latn": "Kikongo", + "lin_Latn": "Lingala", + "lua_Latn": "Luba-Kasai", + "lug_Latn": "Luganda", + "luo_Latn": "Luo", + "plt_Latn": "Plateau Malagasy", + "mos_Latn": "Mossi", + "nso_Latn": "Northern Sotho", + "nus_Latn": "Nuer", + "nya_Latn": "Nyanja", + "gaz_Latn": "Oromo", + "run_Latn": "Rundi", + "sag_Latn": "Sango", + "sna_Latn": "Shona", + "som_Latn": "Somali", + "sot_Latn": "Southern Sotho", + "ssw_Latn": "Swati", + "sun_Latn": "Sundanese", + "swh_Latn": "Swahili", + "tir_Ethi": "Tigrinya", + "taq_Latn": "Tamasheq", + "taq_Tfng": "Tamasheq (Tifinagh script)", + "tsn_Latn": "Setswana", + "tso_Latn": "Tsonga", + "tum_Latn": "Tumbuka", + "twi_Latn": "Twi", + "tzm_Tfng": "Central Atlas Tamazight", + "umb_Latn": "Umbundu", + "wol_Latn": "Wolof", + "xho_Latn": "Xhosa", + "yor_Latn": "Yoruba", + "zul_Latn": "Zulu", + } + + for lang in languages.keys(): + try: + if not reverse: + file_name = f"flores_{lang}-eng_Latn.yaml" + task_name = f"flores_{lang}-eng_Latn_{mode}" + yaml_template = "flores" + yaml_details = { + "include": yaml_template, + "task": task_name, + 
"dataset_name": f"{lang}-eng_Latn", + "doc_to_target": "sentence_eng_Latn", + "doc_to_text": prompt_func(mode, lang, languages), + } + os.makedirs(f"{output_dir}/{mode}/african-english", exist_ok=True) + with open( + f"{output_dir}/{mode}/african-english/{file_name}", + "w" if overwrite else "x", + encoding="utf8", + ) as f: + f.write("# Generated by utils.py\n") + yaml.dump( + yaml_details, + f, + allow_unicode=True, + ) + else: + file_name = f"flores_eng_Latn-{lang}.yaml" + task_name = f"flores_eng_Latn-{lang}_{mode}" + yaml_template = "flores" + # mode_reverse = f"{mode}_reverse" + yaml_details = { + "include": yaml_template, + "task": task_name, + "dataset_name": f"eng_Latn-{lang}", + "doc_to_target": f"sentence_{lang}", + "doc_to_text": prompt_func(f"{mode}_reverse", lang, languages), + } + os.makedirs(f"{output_dir}/{mode}/english-african", exist_ok=True) + with open( + f"{output_dir}/{mode}/english-african/{file_name}", + "w" if overwrite else "x", + encoding="utf8", + ) as f: + f.write("# Generated by utils.py\n") + yaml.dump( + yaml_details, + f, + allow_unicode=True, + ) + except FileExistsError: + err.append(file_name) + + if len(err) > 0: + raise FileExistsError( + "Files were not created because they already exist (use --overwrite flag):" + f" {', '.join(err)}" + ) + + +def main() -> None: + """Parse CLI args and generate language-specific yaml files.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--overwrite", + default=True, + action="store_true", + help="Overwrite files if they already exist", + ) + parser.add_argument( + "--output-dir", + default="./", + help="Directory to write yaml files to", + ) + parser.add_argument( + "--mode", + default="prompt_1", + choices=["prompt_1", "prompt_2", "prompt_3"], + help="Prompt number", + ) + parser.add_argument( + "--reverse", + default=True, + choices=[True, False], + help="Reverse the translation direction", + ) + args = parser.parse_args() + + gen_lang_yamls( + 
output_dir=args.output_dir, + overwrite=args.overwrite, + mode=args.mode, + reverse=args.reverse, + ) + + +if __name__ == "__main__": + main() diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores new file mode 100644 index 0000000000000000000000000000000000000000..c25cf195cd032014435335eadf13e102f47598f9 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores @@ -0,0 +1,27 @@ +tag: +- african_flores_tasks +- flores_afr-eng +- flores_afr-eng_prompt_1 +- afrobench_MT_tasks +dataset_path: facebook/flores +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +validation_split: dev +fewshot_split: dev +test_split: devtest +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + until: + - "**" + - + do_sample: false + temperature: 0.0 +repeats: 1 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_ace_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_ace_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c558249774e6182755078db81154e3d88db656c1 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_ace_Arab-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ace_Arab-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Acehnese (Arabic script): {{sentence_ace_Arab}} \nEnglish: " +include: flores +task: flores_ace_Arab-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_ace_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_ace_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1f0a6ee27cfc218e297ccaf05ab7d0bcef5da57b --- /dev/null +++ 
b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_ace_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ace_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Acehnese (Latin script): {{sentence_ace_Latn}} \nEnglish: " +include: flores +task: flores_ace_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_acq_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_acq_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3634e7a66c7b11be9a0450f6f5ab953707897eb1 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_acq_Arab-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: acq_Arab-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Ta’izzi-Adeni Arabic: {{sentence_acq_Arab}} \nEnglish: " +include: flores +task: flores_acq_Arab-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_aeb_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_aeb_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..53636d7c01d92e87b80da4bd6656b7740d0f11a6 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_aeb_Arab-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: aeb_Arab-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Tunisian Arabic: {{sentence_aeb_Arab}} \nEnglish: " +include: flores +task: flores_aeb_Arab-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_afr_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_afr_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2ac14a0c04b362f925e578a30a9eb615a6bc1fed --- /dev/null +++ 
b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_afr_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: afr_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Afrikaans: {{sentence_afr_Latn}} \nEnglish: " +include: flores +task: flores_afr_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_aka_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_aka_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c3caf192676374f70a707c77d84bb6eac1deacb5 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_aka_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: aka_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Akan: {{sentence_aka_Latn}} \nEnglish: " +include: flores +task: flores_aka_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_amh_Ethi-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_amh_Ethi-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6c0be0828a5110df25911b503c0db29b2fe6dbb3 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_amh_Ethi-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: amh_Ethi-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Amharic: {{sentence_amh_Ethi}} \nEnglish: " +include: flores +task: flores_amh_Ethi-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_ary_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_ary_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8bcd452d6e86cf669bcbc97b7216d72dbeb37ffd --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_ary_Arab-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated 
by utils.py +dataset_name: ary_Arab-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Moroccan Arabic: {{sentence_ary_Arab}} \nEnglish: " +include: flores +task: flores_ary_Arab-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_arz_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_arz_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..72552bab1ce9b04feb4619a28e4a424b1bdb99d3 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_arz_Arab-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: arz_Arab-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Egyptian Arabic: {{sentence_arz_Arab}} \nEnglish: " +include: flores +task: flores_arz_Arab-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_bam_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_bam_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..14e8a1c74fb7f9bcdd47703121bc127420d5cf3a --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_bam_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: bam_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Bambara: {{sentence_bam_Latn}} \nEnglish: " +include: flores +task: flores_bam_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_ban_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_ban_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..54a582446ec44263f7020676a7e5fa3eee88e780 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_ban_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ban_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: 
"Balinese: {{sentence_ban_Latn}} \nEnglish: " +include: flores +task: flores_ban_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_bem_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_bem_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..53bbe221d7ec3a98f4eedeb9fdcfd49a2d872198 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_bem_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: bem_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Bemba: {{sentence_bem_Latn}} \nEnglish: " +include: flores +task: flores_bem_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_cjk_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_cjk_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..63994d04d0dfc9ca7d8418835dcd47abf79d5031 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_cjk_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: cjk_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Chokwe: {{sentence_cjk_Latn}} \nEnglish: " +include: flores +task: flores_cjk_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_dik_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_dik_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fd9022b53f0325804b07bf5fb8c222a37c5eccde --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_dik_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: dik_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Southwestern Dinka: {{sentence_dik_Latn}} \nEnglish: " +include: flores +task: flores_dik_Latn-eng_Latn_prompt_1 
diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_dyu_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_dyu_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e25e23d89d7090ec09c68af8c83705dd6a43d7d7 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_dyu_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: dyu_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Dyula: {{sentence_dyu_Latn}} \nEnglish: " +include: flores +task: flores_dyu_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_ewe_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_ewe_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fffa31fcd93a02581b8e70e20d2b2fd84803365c --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_ewe_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ewe_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Ewe: {{sentence_ewe_Latn}} \nEnglish: " +include: flores +task: flores_ewe_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_fon_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_fon_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..70c9bfbe0f59666124b03c55432d4981472da9d8 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_fon_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: fon_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Fon: {{sentence_fon_Latn}} \nEnglish: " +include: flores +task: flores_fon_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_fra_Latn-eng_Latn.yaml 
b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_fra_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2c515a8f6adff914e6c237a1d637d8a589bc976f --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_fra_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: fra_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "French: {{sentence_fra_Latn}} \nEnglish: " +include: flores +task: flores_fra_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_fuv_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_fuv_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4a162567753f9b6ee34d52f5ac06233b54973eac --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_fuv_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: fuv_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Nigerian Fulfulde: {{sentence_fuv_Latn}} \nEnglish: " +include: flores +task: flores_fuv_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_gaz_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_gaz_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ec443459d6b9ac698106c6dc2c500d2b897557f1 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_gaz_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: gaz_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Oromo: {{sentence_gaz_Latn}} \nEnglish: " +include: flores +task: flores_gaz_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_hau_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_hau_Latn-eng_Latn.yaml new file mode 100644 
index 0000000000000000000000000000000000000000..8d518b5122fa8ebf09188fd1eca8f6f5e2e23983 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_hau_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: hau_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Hausa: {{sentence_hau_Latn}} \nEnglish: " +include: flores +task: flores_hau_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_ibo_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_ibo_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9c121ae73d95a5acffa72c27af04c9c1b16a7b43 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_ibo_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ibo_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Igbo: {{sentence_ibo_Latn}} \nEnglish: " +include: flores +task: flores_ibo_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_kab_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_kab_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..42c625488a60ed854f9769636ddc208d8d7d2e0c --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_kab_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: kab_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Kabyle: {{sentence_kab_Latn}} \nEnglish: " +include: flores +task: flores_kab_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_kam_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_kam_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d7d10cc570d7c338e7928a36be4a3026cffaf159 --- /dev/null +++ 
b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_kam_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: kam_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Kamba: {{sentence_kam_Latn}} \nEnglish: " +include: flores +task: flores_kam_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_kbp_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_kbp_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..43cc5e32a272d14bc8549d28f6f8784ab1e968dd --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_kbp_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: kbp_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Kabiyè: {{sentence_kbp_Latn}} \nEnglish: " +include: flores +task: flores_kbp_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_kea_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_kea_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c894681ef571f72d6edfa953a9d0aeaafb5dcc9 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_kea_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: kea_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Kabuverdianu: {{sentence_kea_Latn}} \nEnglish: " +include: flores +task: flores_kea_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_kik_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_kik_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dbdff8e215e247fbc4b0061154f3c66903aad80c --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_kik_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# 
Generated by utils.py +dataset_name: kik_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Kikuyu: {{sentence_kik_Latn}} \nEnglish: " +include: flores +task: flores_kik_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_kin_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_kin_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b11194a98eacd84551995aa1506993a9c8a52bf6 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_kin_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: kin_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Kinyarwanda: {{sentence_kin_Latn}} \nEnglish: " +include: flores +task: flores_kin_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_kmb_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_kmb_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..258b847d28294196b7c4d7455320e24d0ad2a59a --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_kmb_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: kmb_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Kimbundu: {{sentence_kmb_Latn}} \nEnglish: " +include: flores +task: flores_kmb_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_knc_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_knc_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..642dfc6f891572f65037299b8f3a8381f51f0421 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_knc_Arab-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: knc_Arab-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Central 
Kanuri (Arabic script): {{sentence_knc_Arab}} \nEnglish: " +include: flores +task: flores_knc_Arab-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_knc_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_knc_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7f904da712bda335eab0ccc0e7036b8190caf93e --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_knc_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: knc_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Central Kanuri (Latin script): {{sentence_knc_Latn}} \nEnglish: " +include: flores +task: flores_knc_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_kon_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_kon_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..54fce1f8da44e9b2c889e71e8a8d9f5eea4b3ef7 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_kon_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: kon_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Kikongo: {{sentence_kon_Latn}} \nEnglish: " +include: flores +task: flores_kon_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_lin_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_lin_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..41494a7263855a44fd217ac6d7cee38e714a8597 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_lin_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: lin_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Lingala: {{sentence_lin_Latn}} \nEnglish: " +include: flores +task: 
flores_lin_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_lua_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_lua_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9d54350a45f7838492555c4268e552dc609f0e12 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_lua_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: lua_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Luba-Kasai: {{sentence_lua_Latn}} \nEnglish: " +include: flores +task: flores_lua_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_lug_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_lug_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..35d8e31b1331a8e478bc6960c262a8d5eb5630df --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_lug_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: lug_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Luganda: {{sentence_lug_Latn}} \nEnglish: " +include: flores +task: flores_lug_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_luo_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_luo_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7a22ec7db9d0cf3d3497ab0367c32a2aef602513 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_luo_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: luo_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Luo: {{sentence_luo_Latn}} \nEnglish: " +include: flores +task: flores_luo_Latn-eng_Latn_prompt_1 diff --git 
a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_mos_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_mos_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4a4c1009c46290faafcb15930cb98217612d5c14 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_mos_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: mos_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Mossi: {{sentence_mos_Latn}} \nEnglish: " +include: flores +task: flores_mos_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_nso_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_nso_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2409753c8a86d873fc33b68cbaba493b154fd947 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_nso_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: nso_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Northern Sotho: {{sentence_nso_Latn}} \nEnglish: " +include: flores +task: flores_nso_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_nus_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_nus_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f77380957e4c1b61cd9a277e8e2d831fa7d9a0da --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_nus_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: nus_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Nuer: {{sentence_nus_Latn}} \nEnglish: " +include: flores +task: flores_nus_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_nya_Latn-eng_Latn.yaml 
b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_nya_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..def5625dd7a4b6dd4288a796681fe6fbfc40c6ac --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_nya_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: nya_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Nyanja: {{sentence_nya_Latn}} \nEnglish: " +include: flores +task: flores_nya_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_plt_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_plt_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f877a307254dfe00e645ea544bc1a7fb64411162 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_plt_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: plt_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Plateau Malagasy: {{sentence_plt_Latn}} \nEnglish: " +include: flores +task: flores_plt_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_run_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_run_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e00eb85718ffff0eb4fd84a1ce50fc4ff92c9988 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_run_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: run_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Rundi: {{sentence_run_Latn}} \nEnglish: " +include: flores +task: flores_run_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_sag_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_sag_Latn-eng_Latn.yaml new file mode 100644 
index 0000000000000000000000000000000000000000..e7f43c6b6cf91d52311d4f0981086afcd833d8b3 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_sag_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: sag_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Sango: {{sentence_sag_Latn}} \nEnglish: " +include: flores +task: flores_sag_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_sna_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_sna_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d63b4c6baf598d665509cadcaf1c1613f7f2c77d --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_sna_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: sna_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Shona: {{sentence_sna_Latn}} \nEnglish: " +include: flores +task: flores_sna_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_som_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_som_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f625c559f3c1dd1b0490d42c29515dfeaef28d68 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_som_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: som_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Somali: {{sentence_som_Latn}} \nEnglish: " +include: flores +task: flores_som_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_sot_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_sot_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..11653e6059e51d71b48e722abd1c519ddd956d00 --- /dev/null +++ 
b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_sot_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: sot_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Southern Sotho: {{sentence_sot_Latn}} \nEnglish: " +include: flores +task: flores_sot_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_ssw_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_ssw_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fe3ceb9a874c035a06f3c9fdc46e254a544bc563 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_ssw_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ssw_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Swati: {{sentence_ssw_Latn}} \nEnglish: " +include: flores +task: flores_ssw_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_sun_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_sun_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c3f605f9400dacb782a0066b1f6559aaba6ed270 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_sun_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: sun_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Sundanese: {{sentence_sun_Latn}} \nEnglish: " +include: flores +task: flores_sun_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_swh_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_swh_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7651ac3159f59d94886dc97a8e854fb19e184115 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_swh_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# 
Generated by utils.py +dataset_name: swh_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Swahili: {{sentence_swh_Latn}} \nEnglish: " +include: flores +task: flores_swh_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_taq_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_taq_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d3fca39004e66ae6c74858cea59e930527a41eff --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_taq_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: taq_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Tamasheq: {{sentence_taq_Latn}} \nEnglish: " +include: flores +task: flores_taq_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_taq_Tfng-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_taq_Tfng-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7152867ee6e32951f630f2d24d615ec248090fde --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_taq_Tfng-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: taq_Tfng-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Tamasheq (Tifinagh script): {{sentence_taq_Tfng}} \nEnglish: " +include: flores +task: flores_taq_Tfng-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_tir_Ethi-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_tir_Ethi-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cc13ae0413578e722f0f7c7e1c724e565b7a10c6 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_tir_Ethi-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: tir_Ethi-eng_Latn +doc_to_target: sentence_eng_Latn 
+doc_to_text: "Tigrinya: {{sentence_tir_Ethi}} \nEnglish: " +include: flores +task: flores_tir_Ethi-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_tsn_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_tsn_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a6c4e1c820c15ff4e1b3f2d8db43c102a4b206e --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_tsn_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: tsn_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Setswana: {{sentence_tsn_Latn}} \nEnglish: " +include: flores +task: flores_tsn_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_tso_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_tso_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0d473ab03b198ff0691265f278a6aec6e688e967 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_tso_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: tso_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Tsonga: {{sentence_tso_Latn}} \nEnglish: " +include: flores +task: flores_tso_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_tum_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_tum_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c491f25b514977bd8554a0d53212525b028a6e41 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_tum_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: tum_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Tumbuka: {{sentence_tum_Latn}} \nEnglish: " +include: flores +task: 
flores_tum_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_twi_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_twi_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2d8ad29e375918e52cbd3337ac6965a461c7f7fc --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_twi_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: twi_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Twi: {{sentence_twi_Latn}} \nEnglish: " +include: flores +task: flores_twi_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_tzm_Tfng-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_tzm_Tfng-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ba4624651a68c219eefd7c9157f3bce58d751c48 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_tzm_Tfng-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: tzm_Tfng-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Central Atlas Tamazight: {{sentence_tzm_Tfng}} \nEnglish: " +include: flores +task: flores_tzm_Tfng-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_umb_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_umb_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0758003ad3bb766fad0fa545d39ba976d4443f26 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_umb_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: umb_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Umbundu: {{sentence_umb_Latn}} \nEnglish: " +include: flores +task: flores_umb_Latn-eng_Latn_prompt_1 diff --git 
a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_wol_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_wol_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..914e6c128220583cb7f2e064e0eed56948bbdfd9 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_wol_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: wol_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Wolof: {{sentence_wol_Latn}} \nEnglish: " +include: flores +task: flores_wol_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_xho_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_xho_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bc130fb0447aa7e617debe3cf1086f55ac96aec6 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_xho_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: xho_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Xhosa: {{sentence_xho_Latn}} \nEnglish: " +include: flores +task: flores_xho_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_yor_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_yor_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0ea0fbc4e65b9320f1d4bd701a98819c17b69ab0 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_yor_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: yor_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Yoruba: {{sentence_yor_Latn}} \nEnglish: " +include: flores +task: flores_yor_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_zul_Latn-eng_Latn.yaml 
b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_zul_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ea070b30e7822d9e55135f3842766e68f94c1f2c --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/african-english/flores_zul_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: zul_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Zulu: {{sentence_zul_Latn}} \nEnglish: " +include: flores +task: flores_zul_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores new file mode 100644 index 0000000000000000000000000000000000000000..e6f4d051431159f4360115226ea58dec2487c0c2 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores @@ -0,0 +1,27 @@ +tag: +- african_flores_tasks +- flores_eng-afr +- flores_eng-afr_prompt_1 +- afrobench_MT_tasks +dataset_path: facebook/flores +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +validation_split: dev +fewshot_split: dev +test_split: devtest +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + until: + - "**" + - + do_sample: false + temperature: 0.0 +repeats: 1 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-ace_Arab.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-ace_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a9da06483bc1e3f19e10636cdf1509ad899832ca --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-ace_Arab.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-ace_Arab +doc_to_target: sentence_ace_Arab +doc_to_text: "English: {{sentence_eng_Latn}} \nAcehnese (Arabic script): " +include: 
flores +task: flores_eng_Latn-ace_Arab_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-ace_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-ace_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d2ed60660bd3f565484d16440ce9fb2d82f6a555 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-ace_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-ace_Latn +doc_to_target: sentence_ace_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nAcehnese (Latin script): " +include: flores +task: flores_eng_Latn-ace_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-acq_Arab.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-acq_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e61bb2472b427de012ce3d47122906df99f14089 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-acq_Arab.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-acq_Arab +doc_to_target: sentence_acq_Arab +doc_to_text: "English: {{sentence_eng_Latn}} \nTa’izzi-Adeni Arabic: " +include: flores +task: flores_eng_Latn-acq_Arab_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-aeb_Arab.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-aeb_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d59000626aa9ef7f6cfcd6bc6a315cbb25a90142 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-aeb_Arab.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-aeb_Arab +doc_to_target: sentence_aeb_Arab +doc_to_text: "English: {{sentence_eng_Latn}} \nTunisian Arabic: " +include: flores +task: flores_eng_Latn-aeb_Arab_prompt_1 diff --git 
a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-afr_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-afr_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3b4c4d46b432d78f6e9947dbcd26868885560c53 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-afr_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-afr_Latn +doc_to_target: sentence_afr_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nAfrikaans: " +include: flores +task: flores_eng_Latn-afr_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-aka_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-aka_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d66a637f75d19c72d3846819d249f9a67989e04c --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-aka_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-aka_Latn +doc_to_target: sentence_aka_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nAkan: " +include: flores +task: flores_eng_Latn-aka_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-amh_Ethi.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-amh_Ethi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e648d33270ae5944609c8ce50c2dd3e92bbfeb97 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-amh_Ethi.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-amh_Ethi +doc_to_target: sentence_amh_Ethi +doc_to_text: "English: {{sentence_eng_Latn}} \nAmharic: " +include: flores +task: flores_eng_Latn-amh_Ethi_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-ary_Arab.yaml 
b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-ary_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..54f9a2ad67ac390ba4cc4a7a6db6a1d2e5061a54 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-ary_Arab.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-ary_Arab +doc_to_target: sentence_ary_Arab +doc_to_text: "English: {{sentence_eng_Latn}} \nMoroccan Arabic: " +include: flores +task: flores_eng_Latn-ary_Arab_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-arz_Arab.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-arz_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a42fa079b4501f6402eebd3241c26f14b1e5af6e --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-arz_Arab.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-arz_Arab +doc_to_target: sentence_arz_Arab +doc_to_text: "English: {{sentence_eng_Latn}} \nEgyptian Arabic: " +include: flores +task: flores_eng_Latn-arz_Arab_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-bam_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-bam_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c85b7db9394d3c309b3e5c5b196a0e5451c4d0b --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-bam_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-bam_Latn +doc_to_target: sentence_bam_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nBambara: " +include: flores +task: flores_eng_Latn-bam_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-ban_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-ban_Latn.yaml new file 
mode 100644 index 0000000000000000000000000000000000000000..f43a4b71131da9cf555964b79a6258ce7f36c2ef --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-ban_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-ban_Latn +doc_to_target: sentence_ban_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nBalinese: " +include: flores +task: flores_eng_Latn-ban_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-bem_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-bem_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..252117ef888300c0dfaa64f2cacc55bf66292136 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-bem_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-bem_Latn +doc_to_target: sentence_bem_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nBemba: " +include: flores +task: flores_eng_Latn-bem_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-cjk_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-cjk_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eb4e3566b8383d7bce70a88bd3b663fa984c5154 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-cjk_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-cjk_Latn +doc_to_target: sentence_cjk_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nChokwe: " +include: flores +task: flores_eng_Latn-cjk_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-dik_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-dik_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..36dea9d3a9dc3371a576315540f439af9e38b4e5 --- /dev/null 
+++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-dik_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-dik_Latn +doc_to_target: sentence_dik_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nSouthwestern Dinka: " +include: flores +task: flores_eng_Latn-dik_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-dyu_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-dyu_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c32be8ac93c7cecfbc171eb898232c7296cf6886 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-dyu_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-dyu_Latn +doc_to_target: sentence_dyu_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nDyula: " +include: flores +task: flores_eng_Latn-dyu_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-ewe_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-ewe_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0a71b4556a077b260bfb340a3c0c289ae79ac88b --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-ewe_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-ewe_Latn +doc_to_target: sentence_ewe_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nEwe: " +include: flores +task: flores_eng_Latn-ewe_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-fon_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-fon_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1000e13ad1f6864f002c741b8074d06073cb3dc8 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-fon_Latn.yaml @@ -0,0 +1,6 @@ +# 
Generated by utils.py +dataset_name: eng_Latn-fon_Latn +doc_to_target: sentence_fon_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nFon: " +include: flores +task: flores_eng_Latn-fon_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-fra_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-fra_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..47b99a088c485bed46c51d1da0308ac569aaebd3 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-fra_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-fra_Latn +doc_to_target: sentence_fra_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nFrench: " +include: flores +task: flores_eng_Latn-fra_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-fuv_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-fuv_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8855378737fa985bf35a840c78f81d34f8542305 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-fuv_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-fuv_Latn +doc_to_target: sentence_fuv_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nNigerian Fulfulde: " +include: flores +task: flores_eng_Latn-fuv_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-gaz_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-gaz_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8e124ae153091ed617f87388afb7d6c4c980d754 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-gaz_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-gaz_Latn +doc_to_target: sentence_gaz_Latn +doc_to_text: 
"English: {{sentence_eng_Latn}} \nOromo: " +include: flores +task: flores_eng_Latn-gaz_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-hau_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-hau_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a9aaf537f1d1c491ae3de996d2180c9b32002647 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-hau_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-hau_Latn +doc_to_target: sentence_hau_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nHausa: " +include: flores +task: flores_eng_Latn-hau_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-ibo_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-ibo_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ebf8f517c3e96d64716db741688161d520bd04a5 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-ibo_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-ibo_Latn +doc_to_target: sentence_ibo_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nIgbo: " +include: flores +task: flores_eng_Latn-ibo_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-kab_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-kab_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fd22cb7de77e624bea297d7011aa18aab3408b10 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-kab_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-kab_Latn +doc_to_target: sentence_kab_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nKabyle: " +include: flores +task: flores_eng_Latn-kab_Latn_prompt_1 diff --git 
a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-kam_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-kam_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..802ae7dca4e9b4165f2028cfce4be8c08c1b0edd --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-kam_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-kam_Latn +doc_to_target: sentence_kam_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nKamba: " +include: flores +task: flores_eng_Latn-kam_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-kbp_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-kbp_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9cc1afd5e9af72a8a7f73a6789cb2dc0af1e9c39 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-kbp_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-kbp_Latn +doc_to_target: sentence_kbp_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nKabiyè: " +include: flores +task: flores_eng_Latn-kbp_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-kea_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-kea_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..55d3e7767c8533eb9f0f94c37a33fdb628c2b27a --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-kea_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-kea_Latn +doc_to_target: sentence_kea_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nKabuverdianu: " +include: flores +task: flores_eng_Latn-kea_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-kik_Latn.yaml 
b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-kik_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bee435fe5495364b08420772d1dfade8f9ac671d --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-kik_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-kik_Latn +doc_to_target: sentence_kik_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nKikuyu: " +include: flores +task: flores_eng_Latn-kik_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-kin_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-kin_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..28530541f153ad52724b5d0aa13eca176fa73c29 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-kin_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-kin_Latn +doc_to_target: sentence_kin_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nKinyarwanda: " +include: flores +task: flores_eng_Latn-kin_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-kmb_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-kmb_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5619209f346582b64e51b904288181fc18bc34d6 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-kmb_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-kmb_Latn +doc_to_target: sentence_kmb_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nKimbundu: " +include: flores +task: flores_eng_Latn-kmb_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-knc_Arab.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-knc_Arab.yaml new file mode 100644 
index 0000000000000000000000000000000000000000..fba5e257d7c5154ef3d89ace61d5f7fe2397ccc8 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-knc_Arab.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-knc_Arab +doc_to_target: sentence_knc_Arab +doc_to_text: "English: {{sentence_eng_Latn}} \nCentral Kanuri (Arabic script): " +include: flores +task: flores_eng_Latn-knc_Arab_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-knc_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-knc_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c1f84d5753ff5e03d9d4d6463beeb9a390d5c2eb --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-knc_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-knc_Latn +doc_to_target: sentence_knc_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nCentral Kanuri (Latin script): " +include: flores +task: flores_eng_Latn-knc_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-kon_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-kon_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d6d8ef32d897edfa3086e7b93c39efa41a907e75 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-kon_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-kon_Latn +doc_to_target: sentence_kon_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nKikongo: " +include: flores +task: flores_eng_Latn-kon_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-lin_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-lin_Latn.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..f998f3590b5ebdc35388a9a875a6358366684260 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-lin_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-lin_Latn +doc_to_target: sentence_lin_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nLingala: " +include: flores +task: flores_eng_Latn-lin_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-lua_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-lua_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..246fc1354a71eadbe1ea6e058387859fd5c018c1 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-lua_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-lua_Latn +doc_to_target: sentence_lua_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nLuba-Kasai: " +include: flores +task: flores_eng_Latn-lua_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-lug_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-lug_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3416989fdba2777eb13664a7db4409f091bb0b75 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-lug_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-lug_Latn +doc_to_target: sentence_lug_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nLuganda: " +include: flores +task: flores_eng_Latn-lug_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-luo_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-luo_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8a56e1482037f8728945929f991a9217a9d91f05 --- /dev/null +++ 
b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-luo_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-luo_Latn +doc_to_target: sentence_luo_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nLuo: " +include: flores +task: flores_eng_Latn-luo_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-mos_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-mos_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..393862689847cd6d3c6701696f57bea6f190c564 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-mos_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-mos_Latn +doc_to_target: sentence_mos_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nMossi: " +include: flores +task: flores_eng_Latn-mos_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-nso_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-nso_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..86bd9c6bcdc72f750dd3ae245c992e754ec6b55b --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-nso_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-nso_Latn +doc_to_target: sentence_nso_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nNorthern Sotho: " +include: flores +task: flores_eng_Latn-nso_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-nus_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-nus_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5ac9148958f11e460266f4ff55aab4b44263074c --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-nus_Latn.yaml @@ -0,0 +1,6 @@ +# 
Generated by utils.py +dataset_name: eng_Latn-nus_Latn +doc_to_target: sentence_nus_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nNuer: " +include: flores +task: flores_eng_Latn-nus_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-nya_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-nya_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f4e35d78e708d25ad57abf36bb3ef4230f1acd66 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-nya_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-nya_Latn +doc_to_target: sentence_nya_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nNyanja: " +include: flores +task: flores_eng_Latn-nya_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-plt_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-plt_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e07ffcd257e91028807b37bec7c259f04fd3adb9 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-plt_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-plt_Latn +doc_to_target: sentence_plt_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nPlateau Malagasy: " +include: flores +task: flores_eng_Latn-plt_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-run_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-run_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cad3666bdb35072af569f2405ea106c487121d57 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-run_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-run_Latn +doc_to_target: sentence_run_Latn +doc_to_text: 
"English: {{sentence_eng_Latn}} \nRundi: " +include: flores +task: flores_eng_Latn-run_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-sag_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-sag_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9eaa3c8995add71ccf7052a621d92e76cd06861c --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-sag_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-sag_Latn +doc_to_target: sentence_sag_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nSango: " +include: flores +task: flores_eng_Latn-sag_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-sna_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-sna_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..16f70ba79f218b67b8e3efb730cde9d903ea38b3 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-sna_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-sna_Latn +doc_to_target: sentence_sna_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nShona: " +include: flores +task: flores_eng_Latn-sna_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-som_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-som_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b628b7a4eddf4a073c2b6a77c6ed295c0f9cca17 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-som_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-som_Latn +doc_to_target: sentence_som_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nSomali: " +include: flores +task: flores_eng_Latn-som_Latn_prompt_1 diff --git 
a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-sot_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-sot_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..62655dff56701879705027502b23986c3bd96f78 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-sot_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-sot_Latn +doc_to_target: sentence_sot_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nSouthern Sotho: " +include: flores +task: flores_eng_Latn-sot_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-ssw_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-ssw_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c247e565f839de423ac4aeecc79198189471d126 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-ssw_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-ssw_Latn +doc_to_target: sentence_ssw_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nSwati: " +include: flores +task: flores_eng_Latn-ssw_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-sun_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-sun_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ee3c4a5712ea83c5ab676c949c77192ba4f84735 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-sun_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-sun_Latn +doc_to_target: sentence_sun_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nSundanese: " +include: flores +task: flores_eng_Latn-sun_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-swh_Latn.yaml 
b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-swh_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2b464c16601d0e7e385a6df32b83fcde41d24c91 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-swh_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-swh_Latn +doc_to_target: sentence_swh_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nSwahili: " +include: flores +task: flores_eng_Latn-swh_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-taq_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-taq_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cc50d54faa83f621f08241f59baf6a14e4b6c674 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-taq_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-taq_Latn +doc_to_target: sentence_taq_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nTamasheq: " +include: flores +task: flores_eng_Latn-taq_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-taq_Tfng.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-taq_Tfng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c8c0045338ad054dd48750ae88767f958f0b9e4f --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-taq_Tfng.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-taq_Tfng +doc_to_target: sentence_taq_Tfng +doc_to_text: "English: {{sentence_eng_Latn}} \nTamasheq (Tifinagh script): " +include: flores +task: flores_eng_Latn-taq_Tfng_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-tir_Ethi.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-tir_Ethi.yaml new 
file mode 100644 index 0000000000000000000000000000000000000000..2d3110696c8c1088d2ad2683d8d9d45b3415038d --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-tir_Ethi.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-tir_Ethi +doc_to_target: sentence_tir_Ethi +doc_to_text: "English: {{sentence_eng_Latn}} \nTigrinya: " +include: flores +task: flores_eng_Latn-tir_Ethi_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-tsn_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-tsn_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d782a2af5cde5ae3a006c205a0796ea1a15750d8 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-tsn_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-tsn_Latn +doc_to_target: sentence_tsn_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nSetswana: " +include: flores +task: flores_eng_Latn-tsn_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-tso_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-tso_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..85bca5e9ffdf083cca344501fb818d7e0e60b732 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-tso_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-tso_Latn +doc_to_target: sentence_tso_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nTsonga: " +include: flores +task: flores_eng_Latn-tso_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-tum_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-tum_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a9036f3b7a1d2c4fa91a1f4278c1019cdf2bc68a --- 
/dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-tum_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-tum_Latn +doc_to_target: sentence_tum_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nTumbuka: " +include: flores +task: flores_eng_Latn-tum_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-twi_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-twi_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9658615983d1c44bfd74d88def6db73a465ce96d --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-twi_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-twi_Latn +doc_to_target: sentence_twi_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nTwi: " +include: flores +task: flores_eng_Latn-twi_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-tzm_Tfng.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-tzm_Tfng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..28728f412fdbe74d85d062a316eeb385b04b94a1 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-tzm_Tfng.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-tzm_Tfng +doc_to_target: sentence_tzm_Tfng +doc_to_text: "English: {{sentence_eng_Latn}} \nCentral Atlas Tamazight: " +include: flores +task: flores_eng_Latn-tzm_Tfng_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-umb_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-umb_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bd95ac316ab006df8c4a52867ae3fdafafa36da2 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-umb_Latn.yaml 
@@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-umb_Latn +doc_to_target: sentence_umb_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nUmbundu: " +include: flores +task: flores_eng_Latn-umb_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-wol_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-wol_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eb6965245032f6821b4ca413d6ead9e892bdb407 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-wol_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-wol_Latn +doc_to_target: sentence_wol_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nWolof: " +include: flores +task: flores_eng_Latn-wol_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-xho_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-xho_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..08480361c18c787ead563d02783982bd1ad8b8e8 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-xho_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-xho_Latn +doc_to_target: sentence_xho_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nXhosa: " +include: flores +task: flores_eng_Latn-xho_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-yor_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-yor_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d29e9a9c859134f25abdc46ca44a256d473415a0 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-yor_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-yor_Latn +doc_to_target: sentence_yor_Latn 
+doc_to_text: "English: {{sentence_eng_Latn}} \nYoruba: " +include: flores +task: flores_eng_Latn-yor_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-zul_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-zul_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..62de546051295ff6b413870f9eee5e806151f4cd --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/english-african/flores_eng_Latn-zul_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: eng_Latn-zul_Latn +doc_to_target: sentence_zul_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nZulu: " +include: flores +task: flores_eng_Latn-zul_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/flores/prompt_1/flores b/lm_eval/tasks/afrobench/flores/prompt_1/flores new file mode 100644 index 0000000000000000000000000000000000000000..74f9f33eb22662bec79709bd64d8d31f3fb8eae0 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_1/flores @@ -0,0 +1,24 @@ +tag: +- flores_tasks +- flores_afr-eng +dataset_path: facebook/flores +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +validation_split: dev +fewshot_split: dev +test_split: devtest +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +repeats: 1 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores new file mode 100644 index 0000000000000000000000000000000000000000..e0fa69a2a441116ef15a4158cc366792d841f304 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores @@ -0,0 +1,27 @@ +tag: +- african_flores_tasks +- flores_afr-eng +- flores_afr-eng_prompt_2 +- afrobench_MT_tasks +dataset_path: facebook/flores +dataset_kwargs: 
{trust_remote_code: True} +output_type: generate_until +validation_split: dev +fewshot_split: dev +test_split: devtest +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + until: + - "**" + - "\n" + do_sample: false + temperature: 0.0 +repeats: 1 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_ace_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_ace_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dd54b6c84dc428721616791f698ce55f5064aef4 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_ace_Arab-eng_Latn.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: ace_Arab-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Acehnese (Arabic\ + \ script) sentences to English \nAcehnese (Arabic script): {{sentence_ace_Arab}}\n\ + English: " +include: flores +task: flores_ace_Arab-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_ace_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_ace_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b0814b27f80b3ac85e70179a124708ed5f9c3ac4 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_ace_Latn-eng_Latn.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: ace_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Acehnese (Latin\ + \ script) sentences to English \nAcehnese (Latin script): {{sentence_ace_Latn}}\n\ + English: " +include: flores +task: flores_ace_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_acq_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_acq_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d1464b4965d566d628dcfa12583003a972af29aa --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_acq_Arab-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: acq_Arab-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Ta’izzi-Adeni\ + \ Arabic sentences to English \nTa’izzi-Adeni Arabic: {{sentence_acq_Arab}}\nEnglish: " +include: flores +task: flores_acq_Arab-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_aeb_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_aeb_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2bbded5ff0d48bf2bbd582dbfa97cfa4d222b5c9 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_aeb_Arab-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: aeb_Arab-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Tunisian Arabic\ + \ sentences to English \nTunisian Arabic: {{sentence_aeb_Arab}}\nEnglish: " +include: flores +task: flores_aeb_Arab-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_afr_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_afr_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7b5847d8a997310bd191a3e0009d24abdedb6e4f --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_afr_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: afr_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Afrikaans sentences\ + \ to English \nAfrikaans: {{sentence_afr_Latn}}\nEnglish: " +include: flores +task: flores_afr_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_aka_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_aka_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5f9493c5ea18493d4b0a2f2070135b5fb01c0692 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_aka_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: aka_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Akan sentences\ + \ to English \nAkan: {{sentence_aka_Latn}}\nEnglish: " +include: flores +task: flores_aka_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_amh_Ethi-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_amh_Ethi-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d615bfc3d6f9ed19dce78c024fdaa45a53fdac8b --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_amh_Ethi-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: amh_Ethi-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Amharic sentences\ + \ to English \nAmharic: {{sentence_amh_Ethi}}\nEnglish: " +include: flores +task: flores_amh_Ethi-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_ary_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_ary_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..feecf4510ac45bf5d364cf1782942e388c2119eb --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_ary_Arab-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ary_Arab-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Moroccan Arabic\ + \ sentences to English \nMoroccan Arabic: {{sentence_ary_Arab}}\nEnglish: " +include: flores +task: flores_ary_Arab-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_arz_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_arz_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..13f3e18b5d6ecd8d911741e4fe1d3ee7720f81df --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_arz_Arab-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: arz_Arab-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Egyptian Arabic\ + \ sentences to English \nEgyptian Arabic: {{sentence_arz_Arab}}\nEnglish: " +include: flores +task: flores_arz_Arab-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_bam_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_bam_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a258d264b26a9c02be0ca900f058500b19f0c256 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_bam_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: bam_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Bambara sentences\ + \ to English \nBambara: {{sentence_bam_Latn}}\nEnglish: " +include: flores +task: flores_bam_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_ban_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_ban_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c19cf00874080b980b18e811f61d31a332ca2a5b --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_ban_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ban_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Balinese sentences\ + \ to English \nBalinese: {{sentence_ban_Latn}}\nEnglish: " +include: flores +task: flores_ban_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_bem_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_bem_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9500a3b37c033a026795f47cc74c6a7df94325c1 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_bem_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: bem_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Bemba sentences\ + \ to English \nBemba: {{sentence_bem_Latn}}\nEnglish: " +include: flores +task: flores_bem_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_cjk_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_cjk_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..58185199f7dcfcff5a9c5c4ba501ea7edaa4b26f --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_cjk_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: cjk_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Chokwe sentences\ + \ to English \nChokwe: {{sentence_cjk_Latn}}\nEnglish: " +include: flores +task: flores_cjk_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_dik_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_dik_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c9090a56c686036f07c63b3b92bafad068d811e --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_dik_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: dik_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Southwestern Dinka\ + \ sentences to English \nSouthwestern Dinka: {{sentence_dik_Latn}}\nEnglish: " +include: flores +task: flores_dik_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_dyu_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_dyu_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..47187fb0817ddf7c04041f6b2b9ef358138dfdf8 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_dyu_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: dyu_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Dyula sentences\ + \ to English \nDyula: {{sentence_dyu_Latn}}\nEnglish: " +include: flores +task: flores_dyu_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_ewe_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_ewe_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8838bc3a03742e9e4da5a387f995dcc94018b5d9 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_ewe_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ewe_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Ewe sentences\ + \ to English \nEwe: {{sentence_ewe_Latn}}\nEnglish: " +include: flores +task: flores_ewe_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_fon_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_fon_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7874a87cecb89c402a0e0c1ffea473f4283cc58f --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_fon_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: fon_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Fon sentences\ + \ to English \nFon: {{sentence_fon_Latn}}\nEnglish: " +include: flores +task: flores_fon_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_fra_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_fra_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bb84246ef4b7942f1ecac940e1f80d1664faef3e --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_fra_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: fra_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following French sentences\ + \ to English \nFrench: {{sentence_fra_Latn}}\nEnglish: " +include: flores +task: flores_fra_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_fuv_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_fuv_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0686706d533616d7c1809313c1f5bd302b3c1a45 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_fuv_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: fuv_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Nigerian Fulfulde\ + \ sentences to English \nNigerian Fulfulde: {{sentence_fuv_Latn}}\nEnglish: " +include: flores +task: flores_fuv_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_gaz_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_gaz_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f0ba07a6112f9dc2b740a868a5df7673dfba7650 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_gaz_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: gaz_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Oromo sentences\ + \ to English \nOromo: {{sentence_gaz_Latn}}\nEnglish: " +include: flores +task: flores_gaz_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_hau_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_hau_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..85647455d58aabfb824508bae282e15f63ccaa18 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_hau_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: hau_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Hausa sentences\ + \ to English \nHausa: {{sentence_hau_Latn}}\nEnglish: " +include: flores +task: flores_hau_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_ibo_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_ibo_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c401f1e75be4f83b89a714d40eaaa97ffe677e36 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_ibo_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ibo_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Igbo sentences\ + \ to English \nIgbo: {{sentence_ibo_Latn}}\nEnglish: " +include: flores +task: flores_ibo_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_kab_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_kab_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c82946b9143ce0706f9768bb078d6bdc6541ebbc --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_kab_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kab_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Kabyle sentences\ + \ to English \nKabyle: {{sentence_kab_Latn}}\nEnglish: " +include: flores +task: flores_kab_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_kam_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_kam_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8661bf6e5694498af0e29ca23e9e037ddd77adc9 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_kam_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kam_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Kamba sentences\ + \ to English \nKamba: {{sentence_kam_Latn}}\nEnglish: " +include: flores +task: flores_kam_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_kbp_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_kbp_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7e20af3149dc1baad3b0edca477444a98ba078c9 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_kbp_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kbp_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Kabiyè sentences\ + \ to English \nKabiyè: {{sentence_kbp_Latn}}\nEnglish: " +include: flores +task: flores_kbp_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_kea_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_kea_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d078c293ab75a286b9ac717f9322d8fd92d0b585 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_kea_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kea_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Kabuverdianu sentences\ + \ to English \nKabuverdianu: {{sentence_kea_Latn}}\nEnglish: " +include: flores +task: flores_kea_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_kik_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_kik_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..346dcb98be91749b59747194a47cefe40ca3eef4 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_kik_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kik_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Kikuyu sentences\ + \ to English \nKikuyu: {{sentence_kik_Latn}}\nEnglish: " +include: flores +task: flores_kik_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_kin_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_kin_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7210e7e6b21c0458983917f659518ba666d45d0a --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_kin_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kin_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Kinyarwanda sentences\ + \ to English \nKinyarwanda: {{sentence_kin_Latn}}\nEnglish: " +include: flores +task: flores_kin_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_kmb_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_kmb_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b3dc8d5ac6d52ccfb84f6b8fc416ab61eaa7c007 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_kmb_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kmb_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Kimbundu sentences\ + \ to English \nKimbundu: {{sentence_kmb_Latn}}\nEnglish: " +include: flores +task: flores_kmb_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_knc_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_knc_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..37d5d624ff0f55b15649c5468f215b069efd4bcb --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_knc_Arab-eng_Latn.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: knc_Arab-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Central Kanuri\ + \ (Arabic script) sentences to English \nCentral Kanuri (Arabic script): {{sentence_knc_Arab}}\n\ + English: " +include: flores +task: flores_knc_Arab-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_knc_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_knc_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1d60408cd39dc640f7db077186340de97aa4702f --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_knc_Latn-eng_Latn.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: knc_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Central Kanuri\ + \ (Latin script) sentences to English \nCentral Kanuri (Latin script): {{sentence_knc_Latn}}\n\ + English: " +include: flores +task: flores_knc_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_kon_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_kon_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..63b3539cf9f361ba4b0786596bf43d934c85478c --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_kon_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kon_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Kikongo sentences\ + \ to English \nKikongo: {{sentence_kon_Latn}}\nEnglish: " +include: flores +task: flores_kon_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_lin_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_lin_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..82543c70d5b26ed5b79288c0775a6d21216bfbe8 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_lin_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lin_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Lingala sentences\ + \ to English \nLingala: {{sentence_lin_Latn}}\nEnglish: " +include: flores +task: flores_lin_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_lua_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_lua_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..af0796cc47138b67137e58ba44835ffa9ebf8596 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_lua_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lua_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Luba-Kasai sentences\ + \ to English \nLuba-Kasai: {{sentence_lua_Latn}}\nEnglish: " +include: flores +task: flores_lua_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_lug_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_lug_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eb9f47bcaf95caaac8de7730dba8f662dac0230c --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_lug_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lug_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Luganda sentences\ + \ to English \nLuganda: {{sentence_lug_Latn}}\nEnglish: " +include: flores +task: flores_lug_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_luo_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_luo_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6000ab87662f6753b7dd98d97dc1c057c6c23b58 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_luo_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: luo_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Luo sentences\ + \ to English \nLuo: {{sentence_luo_Latn}}\nEnglish: " +include: flores +task: flores_luo_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_mos_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_mos_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b72acf36fdc39990bc8d6a91a13e1194ce3d42df --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_mos_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: mos_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Mossi sentences\ + \ to English \nMossi: {{sentence_mos_Latn}}\nEnglish: " +include: flores +task: flores_mos_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_nso_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_nso_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..028aa75cc17d326bf4d1d85b5c96ff050bb8d78e --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_nso_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: nso_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Northern Sotho\ + \ sentences to English \nNorthern Sotho: {{sentence_nso_Latn}}\nEnglish: " +include: flores +task: flores_nso_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_nus_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_nus_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e1f9ca54df695d83cc607b4409522ab459c28a99 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_nus_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: nus_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Nuer sentences\ + \ to English \nNuer: {{sentence_nus_Latn}}\nEnglish: " +include: flores +task: flores_nus_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_nya_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_nya_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a5ceb01789f8d71651931a85a2b3580381895d97 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_nya_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: nya_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Nyanja sentences\ + \ to English \nNyanja: {{sentence_nya_Latn}}\nEnglish: " +include: flores +task: flores_nya_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_plt_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_plt_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a2cdace5ed128379cd6093e6da5fee9345ef44c3 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_plt_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: plt_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Plateau Malagasy\ + \ sentences to English \nPlateau Malagasy: {{sentence_plt_Latn}}\nEnglish: " +include: flores +task: flores_plt_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_run_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_run_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aa4b5bc968c230b50942903d989e30e80cb51f8b --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_run_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: run_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Rundi sentences\ + \ to English \nRundi: {{sentence_run_Latn}}\nEnglish: " +include: flores +task: flores_run_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_sag_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_sag_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b20eef56654fac2f2f086dcf6e0deea8a59c345d --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_sag_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sag_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Sango sentences\ + \ to English \nSango: {{sentence_sag_Latn}}\nEnglish: " +include: flores +task: flores_sag_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_sna_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_sna_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f0c98f038617264c525edf3d0df5325e05da55ce --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_sna_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sna_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Shona sentences\ + \ to English \nShona: {{sentence_sna_Latn}}\nEnglish: " +include: flores +task: flores_sna_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_som_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_som_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b862c759b912e197cc16acda3cb68d1271d77e0f --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_som_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: som_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Somali sentences\ + \ to English \nSomali: {{sentence_som_Latn}}\nEnglish: " +include: flores +task: flores_som_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_sot_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_sot_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d5d4e24709a334418b7a23a5d0852f7e5ea665b8 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_sot_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sot_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Southern Sotho\ + \ sentences to English \nSouthern Sotho: {{sentence_sot_Latn}}\nEnglish: " +include: flores +task: flores_sot_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_ssw_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_ssw_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5ae236e5cbc21cba7724cb345d78ed20097b351b --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_ssw_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ssw_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Swati sentences\ + \ to English \nSwati: {{sentence_ssw_Latn}}\nEnglish: " +include: flores +task: flores_ssw_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_sun_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_sun_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5a697a2194eaf477a40ca3f56787caaf563d2179 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_sun_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sun_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Sundanese sentences\ + \ to English \nSundanese: {{sentence_sun_Latn}}\nEnglish: " +include: flores +task: flores_sun_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_swh_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_swh_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..06dd9fcc0d384c4926a681e64f1c185c1111fe94 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_swh_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: swh_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Swahili sentences\ + \ to English \nSwahili: {{sentence_swh_Latn}}\nEnglish: " +include: flores +task: flores_swh_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_taq_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_taq_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5380298e28c3be0c4c9ba536dfdcb685dd7356f2 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_taq_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: taq_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Tamasheq sentences\ + \ to English \nTamasheq: {{sentence_taq_Latn}}\nEnglish: " +include: flores +task: flores_taq_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_taq_Tfng-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_taq_Tfng-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7cfb54197cdaffd83c578da681c1b5d36c9f4265 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_taq_Tfng-eng_Latn.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: taq_Tfng-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Tamasheq (Tifinagh\ + \ script) sentences to English \nTamasheq (Tifinagh script): {{sentence_taq_Tfng}}\n\ + English: " +include: flores +task: flores_taq_Tfng-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_tir_Ethi-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_tir_Ethi-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..56607b6a6e76f921917eb4453b0851dd8a9fb415 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_tir_Ethi-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tir_Ethi-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Tigrinya sentences\ + \ to English \nTigrinya: {{sentence_tir_Ethi}}\nEnglish: " +include: flores +task: flores_tir_Ethi-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_tsn_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_tsn_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b8d04febf4a6ebf564df918c236ede2ccc016b34 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_tsn_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tsn_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Setswana sentences\ + \ to English \nSetswana: {{sentence_tsn_Latn}}\nEnglish: " +include: flores +task: flores_tsn_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_tso_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_tso_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3c357e9df91da2e9b05faf883128ad9b81028331 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_tso_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tso_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Tsonga sentences\ + \ to English \nTsonga: {{sentence_tso_Latn}}\nEnglish: " +include: flores +task: flores_tso_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_tum_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_tum_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d70a89b24f187643ac4e93dcad084e598385207d --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_tum_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tum_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Tumbuka sentences\ + \ to English \nTumbuka: {{sentence_tum_Latn}}\nEnglish: " +include: flores +task: flores_tum_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_twi_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_twi_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d9dc957751e0c5115f4f8cb9d3bd47cfd3a66d9d --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_twi_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: twi_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Twi sentences\ + \ to English \nTwi: {{sentence_twi_Latn}}\nEnglish: " +include: flores +task: flores_twi_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_tzm_Tfng-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_tzm_Tfng-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..81f9c721e731ce51ac8cc8a8adc31225edfc3d59 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_tzm_Tfng-eng_Latn.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: tzm_Tfng-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Central Atlas\ + \ Tamazight sentences to English \nCentral Atlas Tamazight: {{sentence_tzm_Tfng}}\n\ + English: " +include: flores +task: flores_tzm_Tfng-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_umb_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_umb_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..983675b039a2ba51232dede065edec9dd7536c75 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_umb_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: umb_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Umbundu sentences\ + \ to English \nUmbundu: {{sentence_umb_Latn}}\nEnglish: " +include: flores +task: flores_umb_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_wol_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_wol_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0f1210fec591a63d43bd3afebd770b844ffd28a8 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_wol_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: wol_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Wolof sentences\ + \ to English \nWolof: {{sentence_wol_Latn}}\nEnglish: " +include: flores +task: flores_wol_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_xho_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_xho_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f28e1bb3eed67659b3ac23ef9f97e6cd9c5ba7d8 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_xho_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: xho_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Xhosa sentences\ + \ to English \nXhosa: {{sentence_xho_Latn}}\nEnglish: " +include: flores +task: flores_xho_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_yor_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_yor_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e066592660b79c6b5e4d5c6046786a2b118e1eed --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_yor_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: yor_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Yoruba sentences\ + \ to English \nYoruba: {{sentence_yor_Latn}}\nEnglish: " +include: flores +task: flores_yor_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_zul_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_zul_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f3b2fef466a1599ed1c5920328031176db342169 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/african-english/flores_zul_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: zul_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Zulu sentences\ + \ to English \nZulu: {{sentence_zul_Latn}}\nEnglish: " +include: flores +task: flores_zul_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores new file mode 100644 index 0000000000000000000000000000000000000000..ab71d6563002c5deed46fb73f2b61bd585b7b9ae --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores @@ -0,0 +1,27 @@ +tag: +- african_flores_tasks +- flores_eng-afr +- flores_eng-afr_prompt_2 +- afrobench_MT_tasks +dataset_path: facebook/flores +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +validation_split: dev +fewshot_split: dev +test_split: devtest +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + until: + - "**" + - + do_sample: false + temperature: 0.0 +repeats: 1 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-ace_Arab.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-ace_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..30890d9150172e44d453679cea878790d1153f95 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-ace_Arab.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: eng_Latn-ace_Arab +doc_to_target: sentence_ace_Arab +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Acehnese (Arabic script) \nEnglish: {{sentence_eng_Latn}} \nAcehnese (Arabic\ + \ script): " +include: flores +task: flores_eng_Latn-ace_Arab_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-ace_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-ace_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4356785a7d7a7de55ea328e7957ba14764a26745 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-ace_Latn.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: eng_Latn-ace_Latn +doc_to_target: sentence_ace_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Acehnese (Latin script) \nEnglish: {{sentence_eng_Latn}} \nAcehnese (Latin\ + \ script): " +include: flores +task: flores_eng_Latn-ace_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-acq_Arab.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-acq_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..630c824e342e8032926204ca41cfdbc6472c35eb --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-acq_Arab.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-acq_Arab +doc_to_target: sentence_acq_Arab +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Ta’izzi-Adeni Arabic \nEnglish: {{sentence_eng_Latn}} \nTa’izzi-Adeni Arabic: " +include: flores +task: flores_eng_Latn-acq_Arab_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-aeb_Arab.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-aeb_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0df4f642f499163c22733c8d1c7397f9949054c6 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-aeb_Arab.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-aeb_Arab +doc_to_target: sentence_aeb_Arab +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Tunisian Arabic \nEnglish: {{sentence_eng_Latn}} \nTunisian Arabic: " +include: flores +task: flores_eng_Latn-aeb_Arab_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-afr_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-afr_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e2769adf1ec72d54aab8dd1911b91f14c6c56db7 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-afr_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-afr_Latn +doc_to_target: sentence_afr_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Afrikaans \nEnglish: {{sentence_eng_Latn}} \nAfrikaans: " +include: flores +task: flores_eng_Latn-afr_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-aka_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-aka_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..624149c7fb2c031e4483050382e7fe12a09ad32f --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-aka_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-aka_Latn +doc_to_target: sentence_aka_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Akan \nEnglish: {{sentence_eng_Latn}} \nAkan: " +include: flores +task: flores_eng_Latn-aka_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-amh_Ethi.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-amh_Ethi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0a53e8c2f24cba707d059a83dfa18d3f83791021 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-amh_Ethi.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-amh_Ethi +doc_to_target: sentence_amh_Ethi +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Amharic \nEnglish: {{sentence_eng_Latn}} \nAmharic: " +include: flores +task: flores_eng_Latn-amh_Ethi_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-ary_Arab.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-ary_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bb814d766327c71ebfd38d4ca046f2484aa3d3d1 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-ary_Arab.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-ary_Arab +doc_to_target: sentence_ary_Arab +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Moroccan Arabic \nEnglish: {{sentence_eng_Latn}} \nMoroccan Arabic: " +include: flores +task: flores_eng_Latn-ary_Arab_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-arz_Arab.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-arz_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0362666cb7d58d02ed5af9e9429f8b56e1a12d47 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-arz_Arab.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-arz_Arab +doc_to_target: sentence_arz_Arab +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Egyptian Arabic \nEnglish: {{sentence_eng_Latn}} \nEgyptian Arabic: " +include: flores +task: flores_eng_Latn-arz_Arab_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-bam_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-bam_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b38459211670064398a117bdb4b5a63c342471d8 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-bam_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-bam_Latn +doc_to_target: sentence_bam_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Bambara \nEnglish: {{sentence_eng_Latn}} \nBambara: " +include: flores +task: flores_eng_Latn-bam_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-ban_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-ban_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cff3c15bd4f226d62932f443416cc5e824dae612 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-ban_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-ban_Latn +doc_to_target: sentence_ban_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Balinese \nEnglish: {{sentence_eng_Latn}} \nBalinese: " +include: flores +task: flores_eng_Latn-ban_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-bem_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-bem_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ef6552a2b4fc29aa64cb6c3e4b3f1304260c9d76 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-bem_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-bem_Latn +doc_to_target: sentence_bem_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Bemba \nEnglish: {{sentence_eng_Latn}} \nBemba: " +include: flores +task: flores_eng_Latn-bem_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-cjk_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-cjk_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..38c4ea6ff6d0358ddf49b934b4f21549fb7b14d2 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-cjk_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-cjk_Latn +doc_to_target: sentence_cjk_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Chokwe \nEnglish: {{sentence_eng_Latn}} \nChokwe: " +include: flores +task: flores_eng_Latn-cjk_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-dik_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-dik_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bfcf7180903bace41e71d175a45c65ff68167344 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-dik_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-dik_Latn +doc_to_target: sentence_dik_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Southwestern Dinka \nEnglish: {{sentence_eng_Latn}} \nSouthwestern Dinka: " +include: flores +task: flores_eng_Latn-dik_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-dyu_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-dyu_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a9fab72b27ebc9d9c9a80dd7b41c0d270f1114e0 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-dyu_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-dyu_Latn +doc_to_target: sentence_dyu_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Dyula \nEnglish: {{sentence_eng_Latn}} \nDyula: " +include: flores +task: flores_eng_Latn-dyu_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-ewe_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-ewe_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5ecc34e50ab717b0fa3d8d6608cb952692446f89 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-ewe_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-ewe_Latn +doc_to_target: sentence_ewe_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Ewe \nEnglish: {{sentence_eng_Latn}} \nEwe: " +include: flores +task: flores_eng_Latn-ewe_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-fon_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-fon_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ed029237af79c6aaabe9942cb911a556718c014c --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-fon_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-fon_Latn +doc_to_target: sentence_fon_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Fon \nEnglish: {{sentence_eng_Latn}} \nFon: " +include: flores +task: flores_eng_Latn-fon_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-fra_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-fra_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9d54e66c20d87b05dc59ee76a468f89fa5aca761 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-fra_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-fra_Latn +doc_to_target: sentence_fra_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to French \nEnglish: {{sentence_eng_Latn}} \nFrench: " +include: flores +task: flores_eng_Latn-fra_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-fuv_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-fuv_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a821f58fa428af19d22b819428db35a52f4a6725 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-fuv_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-fuv_Latn +doc_to_target: sentence_fuv_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Nigerian Fulfulde \nEnglish: {{sentence_eng_Latn}} \nNigerian Fulfulde: " +include: flores +task: flores_eng_Latn-fuv_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-gaz_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-gaz_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..36fa1d6c4e1fad7f33e7182abfdca60a8df9d386 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-gaz_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-gaz_Latn +doc_to_target: sentence_gaz_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Oromo \nEnglish: {{sentence_eng_Latn}} \nOromo: " +include: flores +task: flores_eng_Latn-gaz_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-hau_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-hau_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aad0a48b3277c8150cb5679b4b8b77636d04b5c6 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-hau_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-hau_Latn +doc_to_target: sentence_hau_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Hausa \nEnglish: {{sentence_eng_Latn}} \nHausa: " +include: flores +task: flores_eng_Latn-hau_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-ibo_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-ibo_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b31e37cd4ce5a6891f7dff30ff75f05b99bdc48c --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-ibo_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-ibo_Latn +doc_to_target: sentence_ibo_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Igbo \nEnglish: {{sentence_eng_Latn}} \nIgbo: " +include: flores +task: flores_eng_Latn-ibo_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-kab_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-kab_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1d6cfd8cb97ca07352c0d7927bc2476a3e9e378a --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-kab_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-kab_Latn +doc_to_target: sentence_kab_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Kabyle \nEnglish: {{sentence_eng_Latn}} \nKabyle: " +include: flores +task: flores_eng_Latn-kab_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-kam_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-kam_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dd2da95c49b7828dfbc174d6a9d891546d433ecd --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-kam_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-kam_Latn +doc_to_target: sentence_kam_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Kamba \nEnglish: {{sentence_eng_Latn}} \nKamba: " +include: flores +task: flores_eng_Latn-kam_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-kbp_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-kbp_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b04cbdf144d5a9718f5a1f9ae38158952e6975e7 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-kbp_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-kbp_Latn +doc_to_target: sentence_kbp_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Kabiyè \nEnglish: {{sentence_eng_Latn}} \nKabiyè: " +include: flores +task: flores_eng_Latn-kbp_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-kea_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-kea_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4a67cb9fef15715713918aaafe29f1147f40acda --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-kea_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-kea_Latn +doc_to_target: sentence_kea_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Kabuverdianu \nEnglish: {{sentence_eng_Latn}} \nKabuverdianu: " +include: flores +task: flores_eng_Latn-kea_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-kik_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-kik_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1519f36e63c76ba57759547e91bab111c3796dcf --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-kik_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-kik_Latn +doc_to_target: sentence_kik_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Kikuyu \nEnglish: {{sentence_eng_Latn}} \nKikuyu: " +include: flores +task: flores_eng_Latn-kik_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-kin_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-kin_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2b33033ff00dce959d37f6b7fe8f0440a6cd1577 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-kin_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-kin_Latn +doc_to_target: sentence_kin_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Kinyarwanda \nEnglish: {{sentence_eng_Latn}} \nKinyarwanda: " +include: flores +task: flores_eng_Latn-kin_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-kmb_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-kmb_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..803989174a43ad7567cc321f7f7847039bc516d5 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-kmb_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-kmb_Latn +doc_to_target: sentence_kmb_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Kimbundu \nEnglish: {{sentence_eng_Latn}} \nKimbundu: " +include: flores +task: flores_eng_Latn-kmb_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-knc_Arab.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-knc_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c0d262413539f659922105528184c6b1f9c74f05 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-knc_Arab.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: eng_Latn-knc_Arab +doc_to_target: sentence_knc_Arab +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Central Kanuri (Arabic script) \nEnglish: {{sentence_eng_Latn}} \nCentral Kanuri\ + \ (Arabic script): " +include: flores +task: flores_eng_Latn-knc_Arab_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-knc_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-knc_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..61ea7a2cdf03e9cd2e6fcef2abcc6e072cb5f430 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-knc_Latn.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: eng_Latn-knc_Latn +doc_to_target: sentence_knc_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Central Kanuri (Latin script) \nEnglish: {{sentence_eng_Latn}} \nCentral Kanuri\ + \ (Latin script): " +include: flores +task: flores_eng_Latn-knc_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-kon_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-kon_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1967452e0032b023b48dfd3980e9d8241aed8e09 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-kon_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-kon_Latn +doc_to_target: sentence_kon_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Kikongo \nEnglish: {{sentence_eng_Latn}} \nKikongo: " +include: flores +task: flores_eng_Latn-kon_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-lin_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-lin_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..05e2593bdee5d218324f959277480f74db95a82b --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-lin_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-lin_Latn +doc_to_target: sentence_lin_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Lingala \nEnglish: {{sentence_eng_Latn}} \nLingala: " +include: flores +task: flores_eng_Latn-lin_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-lua_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-lua_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5f4fe01e16cf1715dbf8467e9bb6fb1558f4b923 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-lua_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-lua_Latn +doc_to_target: sentence_lua_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Luba-Kasai \nEnglish: {{sentence_eng_Latn}} \nLuba-Kasai: " +include: flores +task: flores_eng_Latn-lua_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-lug_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-lug_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0cfc35568598cd7733748a5f06fa5a1ad5c7c85e --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-lug_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-lug_Latn +doc_to_target: sentence_lug_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Luganda \nEnglish: {{sentence_eng_Latn}} \nLuganda: " +include: flores +task: flores_eng_Latn-lug_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-luo_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-luo_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..05c027bb0256d1a22e1c14ca2812b6f9abb65fb7 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-luo_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-luo_Latn +doc_to_target: sentence_luo_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Luo \nEnglish: {{sentence_eng_Latn}} \nLuo: " +include: flores +task: flores_eng_Latn-luo_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-mos_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-mos_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0a676522a51603f951c5dfe0d88a6d99823b46eb --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-mos_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-mos_Latn +doc_to_target: sentence_mos_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Mossi \nEnglish: {{sentence_eng_Latn}} \nMossi: " +include: flores +task: flores_eng_Latn-mos_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-nso_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-nso_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c681b492c17f5e95709c9f0bd06637b10c07c9c3 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-nso_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-nso_Latn +doc_to_target: sentence_nso_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Northern Sotho \nEnglish: {{sentence_eng_Latn}} \nNorthern Sotho: " +include: flores +task: flores_eng_Latn-nso_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-nus_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-nus_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7ae375058b9393357df2aca5f52c69d6b6fde744 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-nus_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-nus_Latn +doc_to_target: sentence_nus_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Nuer \nEnglish: {{sentence_eng_Latn}} \nNuer: " +include: flores +task: flores_eng_Latn-nus_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-nya_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-nya_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..135029028e124537ec4b2dab4222fcb582d38beb --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-nya_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-nya_Latn +doc_to_target: sentence_nya_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Nyanja \nEnglish: {{sentence_eng_Latn}} \nNyanja: " +include: flores +task: flores_eng_Latn-nya_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-plt_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-plt_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..faa85197438e9ff31744de7683957736b3ad34bc --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-plt_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-plt_Latn +doc_to_target: sentence_plt_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Plateau Malagasy \nEnglish: {{sentence_eng_Latn}} \nPlateau Malagasy: " +include: flores +task: flores_eng_Latn-plt_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-run_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-run_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8b670e3f7146cec72c15c9be17ad0df6b30a1a4b --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-run_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-run_Latn +doc_to_target: sentence_run_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Rundi \nEnglish: {{sentence_eng_Latn}} \nRundi: " +include: flores +task: flores_eng_Latn-run_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-sag_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-sag_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..32f399391b905b77d5bea93229fc6b4c5de9e533 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-sag_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-sag_Latn +doc_to_target: sentence_sag_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Sango \nEnglish: {{sentence_eng_Latn}} \nSango: " +include: flores +task: flores_eng_Latn-sag_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-sna_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-sna_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0e219c40275fb7938cc2a121822b534776aff57b --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-sna_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-sna_Latn +doc_to_target: sentence_sna_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Shona \nEnglish: {{sentence_eng_Latn}} \nShona: " +include: flores +task: flores_eng_Latn-sna_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-som_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-som_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f87466dc875c1b2402ccd195f164949c94aa3e5e --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-som_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-som_Latn +doc_to_target: sentence_som_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Somali \nEnglish: {{sentence_eng_Latn}} \nSomali: " +include: flores +task: flores_eng_Latn-som_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-sot_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-sot_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..674d162b64e72d5c3d58521643a8dae6042b9cf5 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-sot_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-sot_Latn +doc_to_target: sentence_sot_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Southern Sotho \nEnglish: {{sentence_eng_Latn}} \nSouthern Sotho: " +include: flores +task: flores_eng_Latn-sot_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-ssw_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-ssw_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..23b9216f912ba5cd340181dd5c07b19c4ff03c7d --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-ssw_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-ssw_Latn +doc_to_target: sentence_ssw_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Swati \nEnglish: {{sentence_eng_Latn}} \nSwati: " +include: flores +task: flores_eng_Latn-ssw_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-sun_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-sun_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9f51ced5e6ce5353da73945815adbaab9e9c0d94 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-sun_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-sun_Latn +doc_to_target: sentence_sun_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Sundanese \nEnglish: {{sentence_eng_Latn}} \nSundanese: " +include: flores +task: flores_eng_Latn-sun_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-swh_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-swh_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1558af98e0011ddf66f5ec63bcde425414a539a2 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-swh_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-swh_Latn +doc_to_target: sentence_swh_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Swahili \nEnglish: {{sentence_eng_Latn}} \nSwahili: " +include: flores +task: flores_eng_Latn-swh_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-taq_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-taq_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0b09b52f46c56e15fc30aff90cbac8c8b8f8e2b8 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-taq_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-taq_Latn +doc_to_target: sentence_taq_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Tamasheq \nEnglish: {{sentence_eng_Latn}} \nTamasheq: " +include: flores +task: flores_eng_Latn-taq_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-taq_Tfng.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-taq_Tfng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b69f1dbd4dd96f81e055b758bfc103a81f7c116c --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-taq_Tfng.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: eng_Latn-taq_Tfng +doc_to_target: sentence_taq_Tfng +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Tamasheq (Tifinagh script) \nEnglish: {{sentence_eng_Latn}} \nTamasheq (Tifinagh\ + \ script): " +include: flores +task: flores_eng_Latn-taq_Tfng_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-tir_Ethi.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-tir_Ethi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4340591d8e397caa4525832e1225b364574c4664 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-tir_Ethi.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-tir_Ethi +doc_to_target: sentence_tir_Ethi +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Tigrinya \nEnglish: {{sentence_eng_Latn}} \nTigrinya: " +include: flores +task: flores_eng_Latn-tir_Ethi_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-tsn_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-tsn_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1e592366ebb36a4c621f05e31326fbf36125025b --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-tsn_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-tsn_Latn +doc_to_target: sentence_tsn_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Setswana \nEnglish: {{sentence_eng_Latn}} \nSetswana: " +include: flores +task: flores_eng_Latn-tsn_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-tso_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-tso_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0d027a2aa2fc08aaa9fa792391eb69d46ddee802 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-tso_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-tso_Latn +doc_to_target: sentence_tso_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Tsonga \nEnglish: {{sentence_eng_Latn}} \nTsonga: " +include: flores +task: flores_eng_Latn-tso_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-tum_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-tum_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1accaeaf4bd9cbbfb5c46c6341a3bb81663767be --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-tum_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-tum_Latn +doc_to_target: sentence_tum_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Tumbuka \nEnglish: {{sentence_eng_Latn}} \nTumbuka: " +include: flores +task: flores_eng_Latn-tum_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-twi_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-twi_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4a45df82e6c60141396deafa19cd7882b2edb689 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-twi_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-twi_Latn +doc_to_target: sentence_twi_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Twi \nEnglish: {{sentence_eng_Latn}} \nTwi: " +include: flores +task: flores_eng_Latn-twi_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-tzm_Tfng.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-tzm_Tfng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6a3faa15d24df79d839a226cc374ace410133a12 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-tzm_Tfng.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-tzm_Tfng +doc_to_target: sentence_tzm_Tfng +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Central Atlas Tamazight \nEnglish: {{sentence_eng_Latn}} \nCentral Atlas Tamazight: " +include: flores +task: flores_eng_Latn-tzm_Tfng_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-umb_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-umb_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3f21c6fe1939f288d56b6bd1229ce6755babb807 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-umb_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-umb_Latn +doc_to_target: sentence_umb_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Umbundu \nEnglish: {{sentence_eng_Latn}} \nUmbundu: " +include: flores +task: flores_eng_Latn-umb_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-wol_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-wol_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..263ded277f0e1a596eac0bf1af5ab1858cf6cd42 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-wol_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-wol_Latn +doc_to_target: sentence_wol_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Wolof \nEnglish: {{sentence_eng_Latn}} \nWolof: " +include: flores +task: flores_eng_Latn-wol_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-xho_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-xho_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a92e46f996b3dfacb06bd5d8d589d43763800e1a --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-xho_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-xho_Latn +doc_to_target: sentence_xho_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Xhosa \nEnglish: {{sentence_eng_Latn}} \nXhosa: " +include: flores +task: flores_eng_Latn-xho_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-yor_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-yor_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..80ec895c70fc2f09261ec6df48f2e6bc9755f479 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-yor_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-yor_Latn +doc_to_target: sentence_yor_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Yoruba \nEnglish: {{sentence_eng_Latn}} \nYoruba: " +include: flores +task: flores_eng_Latn-yor_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-zul_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-zul_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..593cdfe3c7c6878bef06e9af476476fdcbbfdfd6 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/english-african/flores_eng_Latn-zul_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-zul_Latn +doc_to_target: sentence_zul_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Zulu \nEnglish: {{sentence_eng_Latn}} \nZulu: " +include: flores +task: flores_eng_Latn-zul_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/flores/prompt_2/flores b/lm_eval/tasks/afrobench/flores/prompt_2/flores new file mode 100644 index 0000000000000000000000000000000000000000..74f9f33eb22662bec79709bd64d8d31f3fb8eae0 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_2/flores @@ -0,0 +1,24 @@ +tag: +- flores_tasks +- flores_afr-eng +dataset_path: facebook/flores +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +validation_split: dev +fewshot_split: dev +test_split: devtest +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +repeats: 1 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores new file mode 100644 index 0000000000000000000000000000000000000000..60bf41116e43ccdd17efcdcbe0e72c8aad0cf684 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores @@ -0,0 +1,27 @@ +tag: +- 
african_flores_tasks +- flores_afr-eng +- flores_afr-eng_prompt_3 +- afrobench_MT_tasks +dataset_path: facebook/flores +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +validation_split: dev +fewshot_split: dev +test_split: devtest +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + until: + - "**" + - + do_sample: false + temperature: 0.0 +repeats: 1 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_ace_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_ace_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ee5f12704a3a7f03a52aba093e01e335a98729ff --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_ace_Arab-eng_Latn.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: ace_Arab-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Acehnese (Arabic script) and English linguist, translate the following\ + \ Acehnese (Arabic script) sentences to English \nAcehnese (Arabic script): {{sentence_ace_Arab}}\n\ + English: " +include: flores +task: flores_ace_Arab-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_ace_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_ace_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e1d70ba341b8e30123cfd2885f570e5050c359a5 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_ace_Latn-eng_Latn.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: ace_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Acehnese (Latin script) and English linguist, translate the following\ + \ Acehnese (Latin script) sentences to English \nAcehnese (Latin script): {{sentence_ace_Latn}}\n\ + 
English: " +include: flores +task: flores_ace_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_acq_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_acq_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8cda39626e72a3df883f1b74c314e8c275fa4fbb --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_acq_Arab-eng_Latn.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: acq_Arab-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Ta’izzi-Adeni Arabic and English linguist, translate the following\ + \ Ta’izzi-Adeni Arabic sentences to English \nTa’izzi-Adeni Arabic: {{sentence_acq_Arab}}\n\ + English: " +include: flores +task: flores_acq_Arab-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_aeb_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_aeb_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..97f8ef2c91bd0255f2b887ff6b1acb75fc1c0487 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_aeb_Arab-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: aeb_Arab-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Tunisian Arabic and English linguist, translate the following Tunisian\ + \ Arabic sentences to English \nTunisian Arabic: {{sentence_aeb_Arab}}\nEnglish: " +include: flores +task: flores_aeb_Arab-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_afr_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_afr_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e228cb9c66858d173835016566cd1f4731038120 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_afr_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ 
+# Generated by utils.py +dataset_name: afr_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Afrikaans and English linguist, translate the following Afrikaans\ + \ sentences to English \nAfrikaans: {{sentence_afr_Latn}}\nEnglish: " +include: flores +task: flores_afr_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_aka_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_aka_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6d6fc38582c415828023478f33f2925427a68cbb --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_aka_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: aka_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Akan and English linguist, translate the following Akan sentences\ + \ to English \nAkan: {{sentence_aka_Latn}}\nEnglish: " +include: flores +task: flores_aka_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_amh_Ethi-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_amh_Ethi-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..58f33f9a13c5c9840fd4dcdcdd12c664daf60878 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_amh_Ethi-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: amh_Ethi-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Amharic and English linguist, translate the following Amharic sentences\ + \ to English \nAmharic: {{sentence_amh_Ethi}}\nEnglish: " +include: flores +task: flores_amh_Ethi-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_ary_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_ary_Arab-eng_Latn.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..3006ebf72c0088340346a3a7b8f140da84211048 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_ary_Arab-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ary_Arab-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Moroccan Arabic and English linguist, translate the following Moroccan\ + \ Arabic sentences to English \nMoroccan Arabic: {{sentence_ary_Arab}}\nEnglish: " +include: flores +task: flores_ary_Arab-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_arz_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_arz_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..46cc0a18d4633b7032c7c179606845941f595e8b --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_arz_Arab-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: arz_Arab-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Egyptian Arabic and English linguist, translate the following Egyptian\ + \ Arabic sentences to English \nEgyptian Arabic: {{sentence_arz_Arab}}\nEnglish: " +include: flores +task: flores_arz_Arab-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_bam_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_bam_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c50a8dfa4ae3b2a99c2bb40f64749fb22c8928ac --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_bam_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: bam_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Bambara and English linguist, translate the following Bambara sentences\ + \ to English \nBambara: {{sentence_bam_Latn}}\nEnglish: " +include: flores +task: flores_bam_Latn-eng_Latn_prompt_3 
diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_ban_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_ban_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..86f2eed3fef3b2aefef9f0a7e640310d054e3fc9 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_ban_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ban_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Balinese and English linguist, translate the following Balinese\ + \ sentences to English \nBalinese: {{sentence_ban_Latn}}\nEnglish: " +include: flores +task: flores_ban_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_bem_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_bem_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..55c32fe9c5e3f4321b6c3145d862d23f48da233b --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_bem_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: bem_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Bemba and English linguist, translate the following Bemba sentences\ + \ to English \nBemba: {{sentence_bem_Latn}}\nEnglish: " +include: flores +task: flores_bem_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_cjk_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_cjk_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..642cd4dda88f9c7e38092fbc650e8340ff8998a2 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_cjk_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: cjk_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Chokwe and English 
linguist, translate the following Chokwe sentences\ + \ to English \nChokwe: {{sentence_cjk_Latn}}\nEnglish: " +include: flores +task: flores_cjk_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_dik_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_dik_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8005a642241e9ba4e9255a224f6c7d641553edd7 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_dik_Latn-eng_Latn.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: dik_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Southwestern Dinka and English linguist, translate the following\ + \ Southwestern Dinka sentences to English \nSouthwestern Dinka: {{sentence_dik_Latn}}\n\ + English: " +include: flores +task: flores_dik_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_dyu_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_dyu_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a99efc0867c365186db75a8f04a0fbfa741f91c2 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_dyu_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: dyu_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Dyula and English linguist, translate the following Dyula sentences\ + \ to English \nDyula: {{sentence_dyu_Latn}}\nEnglish: " +include: flores +task: flores_dyu_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_ewe_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_ewe_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..77133ad60cc57992882213908c1abe2300fae291 --- /dev/null +++ 
b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_ewe_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ewe_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Ewe and English linguist, translate the following Ewe sentences\ + \ to English \nEwe: {{sentence_ewe_Latn}}\nEnglish: " +include: flores +task: flores_ewe_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_fon_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_fon_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..436bf4ac3e5319e8cf2da5791607f8d4f7564eca --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_fon_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: fon_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Fon and English linguist, translate the following Fon sentences\ + \ to English \nFon: {{sentence_fon_Latn}}\nEnglish: " +include: flores +task: flores_fon_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_fra_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_fra_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b10c46e3226342c3a01c90f881df8575049eb6b6 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_fra_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: fra_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a French and English linguist, translate the following French sentences\ + \ to English \nFrench: {{sentence_fra_Latn}}\nEnglish: " +include: flores +task: flores_fra_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_fuv_Latn-eng_Latn.yaml 
b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_fuv_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ffcbd3c04f1f6fd608e11286be4d88c079890a88 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_fuv_Latn-eng_Latn.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: fuv_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Nigerian Fulfulde and English linguist, translate the following\ + \ Nigerian Fulfulde sentences to English \nNigerian Fulfulde: {{sentence_fuv_Latn}}\n\ + English: " +include: flores +task: flores_fuv_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_gaz_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_gaz_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..703cd3517a81e172683fb43b91ddbb4ca7db500e --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_gaz_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: gaz_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Oromo and English linguist, translate the following Oromo sentences\ + \ to English \nOromo: {{sentence_gaz_Latn}}\nEnglish: " +include: flores +task: flores_gaz_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_hau_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_hau_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7527bf78ebc88167a99707eb3101b1c350e5c991 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_hau_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: hau_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Hausa and English linguist, translate the following Hausa sentences\ + \ to English \nHausa: 
{{sentence_hau_Latn}}\nEnglish: " +include: flores +task: flores_hau_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_ibo_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_ibo_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7705911a67b2584a9d1afc3cd5c4294a37a22ece --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_ibo_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ibo_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Igbo and English linguist, translate the following Igbo sentences\ + \ to English \nIgbo: {{sentence_ibo_Latn}}\nEnglish: " +include: flores +task: flores_ibo_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_kab_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_kab_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ec406c5e0fbf8f5b41b17e432586c00f8383eabd --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_kab_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kab_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Kabyle and English linguist, translate the following Kabyle sentences\ + \ to English \nKabyle: {{sentence_kab_Latn}}\nEnglish: " +include: flores +task: flores_kab_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_kam_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_kam_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ed27b6d79c71b5c1f4690cd409a86aabf9901124 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_kam_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: 
kam_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Kamba and English linguist, translate the following Kamba sentences\ + \ to English \nKamba: {{sentence_kam_Latn}}\nEnglish: " +include: flores +task: flores_kam_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_kbp_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_kbp_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5c1a0961e08908cf1e846a87e7ddce3641e80ead --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_kbp_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kbp_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Kabiyè and English linguist, translate the following Kabiyè sentences\ + \ to English \nKabiyè: {{sentence_kbp_Latn}}\nEnglish: " +include: flores +task: flores_kbp_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_kea_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_kea_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..67dd9e73fa327338fecec80728ac645f78996b92 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_kea_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kea_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Kabuverdianu and English linguist, translate the following Kabuverdianu\ + \ sentences to English \nKabuverdianu: {{sentence_kea_Latn}}\nEnglish: " +include: flores +task: flores_kea_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_kik_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_kik_Latn-eng_Latn.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..14a6be5dfd5c86c44c8f3acb23af0f36d5445ead --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_kik_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kik_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Kikuyu and English linguist, translate the following Kikuyu sentences\ + \ to English \nKikuyu: {{sentence_kik_Latn}}\nEnglish: " +include: flores +task: flores_kik_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_kin_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_kin_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8bb14aedf4e95de93a3d4da32a07b19bf854b697 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_kin_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kin_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Kinyarwanda and English linguist, translate the following Kinyarwanda\ + \ sentences to English \nKinyarwanda: {{sentence_kin_Latn}}\nEnglish: " +include: flores +task: flores_kin_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_kmb_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_kmb_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c31ede4dfe8449d2a1c8e84b74eeee0fdc908b78 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_kmb_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kmb_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Kimbundu and English linguist, translate the following Kimbundu\ + \ sentences to English \nKimbundu: {{sentence_kmb_Latn}}\nEnglish: " +include: flores +task: flores_kmb_Latn-eng_Latn_prompt_3 diff --git 
a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_knc_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_knc_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c8c7f8095e11d407d75360ee5e4794e45fc17eeb --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_knc_Arab-eng_Latn.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: knc_Arab-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Central Kanuri (Arabic script) and English linguist, translate\ + \ the following Central Kanuri (Arabic script) sentences to English \nCentral Kanuri\ + \ (Arabic script): {{sentence_knc_Arab}}\nEnglish: " +include: flores +task: flores_knc_Arab-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_knc_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_knc_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9621de73d33ea37feeb3a4face5dbe50812bf9ca --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_knc_Latn-eng_Latn.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: knc_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Central Kanuri (Latin script) and English linguist, translate the\ + \ following Central Kanuri (Latin script) sentences to English \nCentral Kanuri\ + \ (Latin script): {{sentence_knc_Latn}}\nEnglish: " +include: flores +task: flores_knc_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_kon_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_kon_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..54ede3a6e2c0280761a3af529cc0a8fe82d2f518 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_kon_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# 
Generated by utils.py +dataset_name: kon_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Kikongo and English linguist, translate the following Kikongo sentences\ + \ to English \nKikongo: {{sentence_kon_Latn}}\nEnglish: " +include: flores +task: flores_kon_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_lin_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_lin_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ea7e736d949726bf2296091e4309a12d493dbe4f --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_lin_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lin_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Lingala and English linguist, translate the following Lingala sentences\ + \ to English \nLingala: {{sentence_lin_Latn}}\nEnglish: " +include: flores +task: flores_lin_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_lua_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_lua_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..327f014489f7502ac062537c72d4846a7099b64d --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_lua_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lua_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Luba-Kasai and English linguist, translate the following Luba-Kasai\ + \ sentences to English \nLuba-Kasai: {{sentence_lua_Latn}}\nEnglish: " +include: flores +task: flores_lua_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_lug_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_lug_Latn-eng_Latn.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..9bfa92fa280f98278f7735634231cb99d44bc71f --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_lug_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lug_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Luganda and English linguist, translate the following Luganda sentences\ + \ to English \nLuganda: {{sentence_lug_Latn}}\nEnglish: " +include: flores +task: flores_lug_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_luo_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_luo_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a66fded383a914454aed5e904278e60bf85d1e62 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_luo_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: luo_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Luo and English linguist, translate the following Luo sentences\ + \ to English \nLuo: {{sentence_luo_Latn}}\nEnglish: " +include: flores +task: flores_luo_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_mos_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_mos_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e428853bf22859e37bbd10dcd4e113188b7519ef --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_mos_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: mos_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Mossi and English linguist, translate the following Mossi sentences\ + \ to English \nMossi: {{sentence_mos_Latn}}\nEnglish: " +include: flores +task: flores_mos_Latn-eng_Latn_prompt_3 diff --git 
a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_nso_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_nso_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..054aa409b729cc70d70b1d53a49945f92096f4b9 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_nso_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: nso_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Northern Sotho and English linguist, translate the following Northern\ + \ Sotho sentences to English \nNorthern Sotho: {{sentence_nso_Latn}}\nEnglish: " +include: flores +task: flores_nso_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_nus_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_nus_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a3e0d1e3ac8ff35a2b0c89f37cb2697a6d15299a --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_nus_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: nus_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Nuer and English linguist, translate the following Nuer sentences\ + \ to English \nNuer: {{sentence_nus_Latn}}\nEnglish: " +include: flores +task: flores_nus_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_nya_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_nya_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e23c57c6807991e1b509c0a03a7c93a23eac3015 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_nya_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: nya_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Nyanja and English 
linguist, translate the following Nyanja sentences\ + \ to English \nNyanja: {{sentence_nya_Latn}}\nEnglish: " +include: flores +task: flores_nya_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_plt_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_plt_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3ddfd864c3830d4921e1fcda79155c733809b305 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_plt_Latn-eng_Latn.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: plt_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Plateau Malagasy and English linguist, translate the following\ + \ Plateau Malagasy sentences to English \nPlateau Malagasy: {{sentence_plt_Latn}}\n\ + English: " +include: flores +task: flores_plt_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_run_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_run_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..64a82f716b71950711e6b055a6e30f45356a082c --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_run_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: run_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Rundi and English linguist, translate the following Rundi sentences\ + \ to English \nRundi: {{sentence_run_Latn}}\nEnglish: " +include: flores +task: flores_run_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_sag_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_sag_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..48408f94054fee78fe7c3be6460de563e9e60f0a --- /dev/null +++ 
b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_sag_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sag_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Sango and English linguist, translate the following Sango sentences\ + \ to English \nSango: {{sentence_sag_Latn}}\nEnglish: " +include: flores +task: flores_sag_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_sna_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_sna_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ff1626419b69a8e94349bfd63093ad33914bbcec --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_sna_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sna_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Shona and English linguist, translate the following Shona sentences\ + \ to English \nShona: {{sentence_sna_Latn}}\nEnglish: " +include: flores +task: flores_sna_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_som_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_som_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7e27e2a5b3d1754f4c47755a1b92c7ac95938e67 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_som_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: som_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Somali and English linguist, translate the following Somali sentences\ + \ to English \nSomali: {{sentence_som_Latn}}\nEnglish: " +include: flores +task: flores_som_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_sot_Latn-eng_Latn.yaml 
b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_sot_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cc70b6f62b317e27cd8c00d95d89e103945028dc --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_sot_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sot_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Southern Sotho and English linguist, translate the following Southern\ + \ Sotho sentences to English \nSouthern Sotho: {{sentence_sot_Latn}}\nEnglish: " +include: flores +task: flores_sot_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_ssw_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_ssw_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0cd61ae8e8ceb6b9a0f985457f26d25961272036 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_ssw_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ssw_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Swati and English linguist, translate the following Swati sentences\ + \ to English \nSwati: {{sentence_ssw_Latn}}\nEnglish: " +include: flores +task: flores_ssw_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_sun_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_sun_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..000108f77ea1a0b578c1ca980ed0d74490f24fdf --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_sun_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sun_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Sundanese and English linguist, translate the following Sundanese\ + \ sentences to English \nSundanese: 
{{sentence_sun_Latn}}\nEnglish: " +include: flores +task: flores_sun_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_swh_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_swh_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1c81805c1f22b9e6c6bd55ba925fa7dfb80f0cf1 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_swh_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: swh_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Swahili and English linguist, translate the following Swahili sentences\ + \ to English \nSwahili: {{sentence_swh_Latn}}\nEnglish: " +include: flores +task: flores_swh_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_taq_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_taq_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6febb3004bc3f39f94287ac41a9883d95f056fe1 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_taq_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: taq_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Tamasheq and English linguist, translate the following Tamasheq\ + \ sentences to English \nTamasheq: {{sentence_taq_Latn}}\nEnglish: " +include: flores +task: flores_taq_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_taq_Tfng-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_taq_Tfng-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6290ab94d3be2e52750f9af5900d7c27f27cb5af --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_taq_Tfng-eng_Latn.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: 
taq_Tfng-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Tamasheq (Tifinagh script) and English linguist, translate the\ + \ following Tamasheq (Tifinagh script) sentences to English \nTamasheq (Tifinagh\ + \ script): {{sentence_taq_Tfng}}\nEnglish: " +include: flores +task: flores_taq_Tfng-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_tir_Ethi-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_tir_Ethi-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..60133a3b735a0b1d91901bdd7f1ef2122b0f0f03 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_tir_Ethi-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tir_Ethi-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Tigrinya and English linguist, translate the following Tigrinya\ + \ sentences to English \nTigrinya: {{sentence_tir_Ethi}}\nEnglish: " +include: flores +task: flores_tir_Ethi-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_tsn_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_tsn_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..40417bde77b4a0ede2e9c87c56668be101579f3b --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_tsn_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tsn_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Setswana and English linguist, translate the following Setswana\ + \ sentences to English \nSetswana: {{sentence_tsn_Latn}}\nEnglish: " +include: flores +task: flores_tsn_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_tso_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_tso_Latn-eng_Latn.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..56d4632500b86964d0d665f1827cd129bc508d63 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_tso_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tso_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Tsonga and English linguist, translate the following Tsonga sentences\ + \ to English \nTsonga: {{sentence_tso_Latn}}\nEnglish: " +include: flores +task: flores_tso_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_tum_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_tum_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cc4bb541f7a5692a067ad75f5e3a86490487cc70 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_tum_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tum_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Tumbuka and English linguist, translate the following Tumbuka sentences\ + \ to English \nTumbuka: {{sentence_tum_Latn}}\nEnglish: " +include: flores +task: flores_tum_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_twi_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_twi_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4cc0d674c8ced1f005eef5a94c87a886a6f176aa --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_twi_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: twi_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Twi and English linguist, translate the following Twi sentences\ + \ to English \nTwi: {{sentence_twi_Latn}}\nEnglish: " +include: flores +task: flores_twi_Latn-eng_Latn_prompt_3 diff --git 
a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_tzm_Tfng-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_tzm_Tfng-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d3575ccb2a766a722cbc88880f34656af8cdb3a4 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_tzm_Tfng-eng_Latn.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: tzm_Tfng-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Central Atlas Tamazight and English linguist, translate the following\ + \ Central Atlas Tamazight sentences to English \nCentral Atlas Tamazight: {{sentence_tzm_Tfng}}\n\ + English: " +include: flores +task: flores_tzm_Tfng-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_umb_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_umb_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e7df76cf07cb4bf8f772721136fd4d92280b3820 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_umb_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: umb_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Umbundu and English linguist, translate the following Umbundu sentences\ + \ to English \nUmbundu: {{sentence_umb_Latn}}\nEnglish: " +include: flores +task: flores_umb_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_wol_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_wol_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..22275ca15cd1829db481c0c77363a10649be101f --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_wol_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: wol_Latn-eng_Latn +doc_to_target: sentence_eng_Latn 
+doc_to_text: "As a Wolof and English linguist, translate the following Wolof sentences\ + \ to English \nWolof: {{sentence_wol_Latn}}\nEnglish: " +include: flores +task: flores_wol_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_xho_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_xho_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..85ae368b6efa8dab3a8a2b110e438b043ef3c74f --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_xho_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: xho_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Xhosa and English linguist, translate the following Xhosa sentences\ + \ to English \nXhosa: {{sentence_xho_Latn}}\nEnglish: " +include: flores +task: flores_xho_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_yor_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_yor_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5bbd8eb967ea1f4c6afe09bef65e379c4fed9c25 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_yor_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: yor_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Yoruba and English linguist, translate the following Yoruba sentences\ + \ to English \nYoruba: {{sentence_yor_Latn}}\nEnglish: " +include: flores +task: flores_yor_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_zul_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_zul_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ea2c2edb8439fffb452187b86ed1690501b20b3d --- /dev/null +++ 
b/lm_eval/tasks/afrobench/flores/prompt_3/african-english/flores_zul_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: zul_Latn-eng_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Zulu and English linguist, translate the following Zulu sentences\ + \ to English \nZulu: {{sentence_zul_Latn}}\nEnglish: " +include: flores +task: flores_zul_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores new file mode 100644 index 0000000000000000000000000000000000000000..ac7dc1651e4729ae0357c6d958745400ddc35ea1 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores @@ -0,0 +1,27 @@ +tag: +- african_flores_tasks +- flores_eng-afr +- flores_eng-afr_prompt_3 +- afrobench_MT_tasks +dataset_path: facebook/flores +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +validation_split: dev +fewshot_split: dev +test_split: devtest +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + until: + - "**" + - + do_sample: false + temperature: 0.0 +repeats: 1 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-ace_Arab.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-ace_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..53cf711fa19132b8668d1c4a6024e1b96f54751b --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-ace_Arab.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: eng_Latn-ace_Arab +doc_to_target: sentence_ace_Arab +doc_to_text: "As a Acehnese (Arabic script) and English linguist, translate the following\ + \ English sentences to Acehnese (Arabic script) \nEnglish: {{sentence_eng_Latn}}\ + \ \nAcehnese (Arabic script): " +include: 
flores +task: flores_eng_Latn-ace_Arab_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-ace_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-ace_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..766b7c30061e8adfd3e4827052fba7160483ae4a --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-ace_Latn.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: eng_Latn-ace_Latn +doc_to_target: sentence_ace_Latn +doc_to_text: "As a Acehnese (Latin script) and English linguist, translate the following\ + \ English sentences to Acehnese (Latin script) \nEnglish: {{sentence_eng_Latn}}\ + \ \nAcehnese (Latin script): " +include: flores +task: flores_eng_Latn-ace_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-acq_Arab.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-acq_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e809c866eb602e76defc6c3fca983e02bc213a52 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-acq_Arab.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: eng_Latn-acq_Arab +doc_to_target: sentence_acq_Arab +doc_to_text: "As a Ta’izzi-Adeni Arabic and English linguist, translate the following\ + \ English sentences to Ta’izzi-Adeni Arabic \nEnglish: {{sentence_eng_Latn}} \n\ + Ta’izzi-Adeni Arabic: " +include: flores +task: flores_eng_Latn-acq_Arab_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-aeb_Arab.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-aeb_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9e8263fe6af0b65e5c935c7d56146b6940c6850b --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-aeb_Arab.yaml @@ 
-0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-aeb_Arab +doc_to_target: sentence_aeb_Arab +doc_to_text: "As a Tunisian Arabic and English linguist, translate the following English\ + \ sentences to Tunisian Arabic \nEnglish: {{sentence_eng_Latn}} \nTunisian Arabic: " +include: flores +task: flores_eng_Latn-aeb_Arab_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-afr_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-afr_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..86421c268959192fae2dcbc19a1a4b935d6bff29 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-afr_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-afr_Latn +doc_to_target: sentence_afr_Latn +doc_to_text: "As a Afrikaans and English linguist, translate the following English\ + \ sentences to Afrikaans \nEnglish: {{sentence_eng_Latn}} \nAfrikaans: " +include: flores +task: flores_eng_Latn-afr_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-aka_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-aka_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3373390566a317a7438e216cea67b926d5dd20fa --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-aka_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-aka_Latn +doc_to_target: sentence_aka_Latn +doc_to_text: "As a Akan and English linguist, translate the following English sentences\ + \ to Akan \nEnglish: {{sentence_eng_Latn}} \nAkan: " +include: flores +task: flores_eng_Latn-aka_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-amh_Ethi.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-amh_Ethi.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..ba3e0116586dfb106bc57103dc685ddd8856570d --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-amh_Ethi.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-amh_Ethi +doc_to_target: sentence_amh_Ethi +doc_to_text: "As a Amharic and English linguist, translate the following English sentences\ + \ to Amharic \nEnglish: {{sentence_eng_Latn}} \nAmharic: " +include: flores +task: flores_eng_Latn-amh_Ethi_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-ary_Arab.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-ary_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c732756a2ea06e33114c117c630f9b3fccab32fe --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-ary_Arab.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-ary_Arab +doc_to_target: sentence_ary_Arab +doc_to_text: "As a Moroccan Arabic and English linguist, translate the following English\ + \ sentences to Moroccan Arabic \nEnglish: {{sentence_eng_Latn}} \nMoroccan Arabic: " +include: flores +task: flores_eng_Latn-ary_Arab_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-arz_Arab.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-arz_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f11bc38a2dc1c1cc565979a47b51b5aad9bc830e --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-arz_Arab.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-arz_Arab +doc_to_target: sentence_arz_Arab +doc_to_text: "As a Egyptian Arabic and English linguist, translate the following English\ + \ sentences to Egyptian Arabic \nEnglish: {{sentence_eng_Latn}} \nEgyptian Arabic: " +include: flores +task: 
flores_eng_Latn-arz_Arab_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-bam_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-bam_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c762962832885fe21c75986f7ce006789217dbd4 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-bam_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-bam_Latn +doc_to_target: sentence_bam_Latn +doc_to_text: "As a Bambara and English linguist, translate the following English sentences\ + \ to Bambara \nEnglish: {{sentence_eng_Latn}} \nBambara: " +include: flores +task: flores_eng_Latn-bam_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-ban_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-ban_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..601aecf5cebebdb6572fadf8f82d2963b9b87d5c --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-ban_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-ban_Latn +doc_to_target: sentence_ban_Latn +doc_to_text: "As a Balinese and English linguist, translate the following English\ + \ sentences to Balinese \nEnglish: {{sentence_eng_Latn}} \nBalinese: " +include: flores +task: flores_eng_Latn-ban_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-bem_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-bem_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fadabdb9356f28fda88a68e69646a0fd60141e9c --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-bem_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-bem_Latn +doc_to_target: sentence_bem_Latn 
+doc_to_text: "As a Bemba and English linguist, translate the following English sentences\ + \ to Bemba \nEnglish: {{sentence_eng_Latn}} \nBemba: " +include: flores +task: flores_eng_Latn-bem_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-cjk_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-cjk_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c522831373d25e05103918ba43f36183106cc509 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-cjk_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-cjk_Latn +doc_to_target: sentence_cjk_Latn +doc_to_text: "As a Chokwe and English linguist, translate the following English sentences\ + \ to Chokwe \nEnglish: {{sentence_eng_Latn}} \nChokwe: " +include: flores +task: flores_eng_Latn-cjk_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-dik_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-dik_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..acfeb83ad758573c632ca1b3e9e08f190b86fa30 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-dik_Latn.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: eng_Latn-dik_Latn +doc_to_target: sentence_dik_Latn +doc_to_text: "As a Southwestern Dinka and English linguist, translate the following\ + \ English sentences to Southwestern Dinka \nEnglish: {{sentence_eng_Latn}} \nSouthwestern\ + \ Dinka: " +include: flores +task: flores_eng_Latn-dik_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-dyu_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-dyu_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..796dc6d2f633c5baba22d2cce8592f0f01e3fe42 --- /dev/null 
+++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-dyu_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-dyu_Latn +doc_to_target: sentence_dyu_Latn +doc_to_text: "As a Dyula and English linguist, translate the following English sentences\ + \ to Dyula \nEnglish: {{sentence_eng_Latn}} \nDyula: " +include: flores +task: flores_eng_Latn-dyu_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-ewe_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-ewe_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..31a07891820793360f26b2d093e98b5982816ea6 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-ewe_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-ewe_Latn +doc_to_target: sentence_ewe_Latn +doc_to_text: "As a Ewe and English linguist, translate the following English sentences\ + \ to Ewe \nEnglish: {{sentence_eng_Latn}} \nEwe: " +include: flores +task: flores_eng_Latn-ewe_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-fon_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-fon_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6cdc7308d63891ed0fc65e779e394b71eafb3bb1 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-fon_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-fon_Latn +doc_to_target: sentence_fon_Latn +doc_to_text: "As a Fon and English linguist, translate the following English sentences\ + \ to Fon \nEnglish: {{sentence_eng_Latn}} \nFon: " +include: flores +task: flores_eng_Latn-fon_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-fra_Latn.yaml 
b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-fra_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3896879db152bc1e583fd24f5823321d0f6eda4d --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-fra_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-fra_Latn +doc_to_target: sentence_fra_Latn +doc_to_text: "As a French and English linguist, translate the following English sentences\ + \ to French \nEnglish: {{sentence_eng_Latn}} \nFrench: " +include: flores +task: flores_eng_Latn-fra_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-fuv_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-fuv_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6b63249be8c1e81e837e9a024dd19ecd822f748b --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-fuv_Latn.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: eng_Latn-fuv_Latn +doc_to_target: sentence_fuv_Latn +doc_to_text: "As a Nigerian Fulfulde and English linguist, translate the following\ + \ English sentences to Nigerian Fulfulde \nEnglish: {{sentence_eng_Latn}} \nNigerian\ + \ Fulfulde: " +include: flores +task: flores_eng_Latn-fuv_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-gaz_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-gaz_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..95cde87c38c66448967d595f60709c2f908af5f0 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-gaz_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-gaz_Latn +doc_to_target: sentence_gaz_Latn +doc_to_text: "As a Oromo and English linguist, translate the following English sentences\ + \ to Oromo 
\nEnglish: {{sentence_eng_Latn}} \nOromo: " +include: flores +task: flores_eng_Latn-gaz_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-hau_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-hau_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eec82e34503bb64fcab1c90cf507499760f95a15 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-hau_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-hau_Latn +doc_to_target: sentence_hau_Latn +doc_to_text: "As a Hausa and English linguist, translate the following English sentences\ + \ to Hausa \nEnglish: {{sentence_eng_Latn}} \nHausa: " +include: flores +task: flores_eng_Latn-hau_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-ibo_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-ibo_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..838990b364097652e9ba4ed68726147e4424d05e --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-ibo_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-ibo_Latn +doc_to_target: sentence_ibo_Latn +doc_to_text: "As a Igbo and English linguist, translate the following English sentences\ + \ to Igbo \nEnglish: {{sentence_eng_Latn}} \nIgbo: " +include: flores +task: flores_eng_Latn-ibo_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-kab_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-kab_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..16888ad8f28b64a7bb9715fdf8f193e18ce06072 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-kab_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: 
eng_Latn-kab_Latn +doc_to_target: sentence_kab_Latn +doc_to_text: "As a Kabyle and English linguist, translate the following English sentences\ + \ to Kabyle \nEnglish: {{sentence_eng_Latn}} \nKabyle: " +include: flores +task: flores_eng_Latn-kab_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-kam_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-kam_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d48c52d16017b2e1241afd154539148d3f0d0ae4 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-kam_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-kam_Latn +doc_to_target: sentence_kam_Latn +doc_to_text: "As a Kamba and English linguist, translate the following English sentences\ + \ to Kamba \nEnglish: {{sentence_eng_Latn}} \nKamba: " +include: flores +task: flores_eng_Latn-kam_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-kbp_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-kbp_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c992a28f7e168e3753e378631fa6ce716e7ee69e --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-kbp_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-kbp_Latn +doc_to_target: sentence_kbp_Latn +doc_to_text: "As a Kabiyè and English linguist, translate the following English sentences\ + \ to Kabiyè \nEnglish: {{sentence_eng_Latn}} \nKabiyè: " +include: flores +task: flores_eng_Latn-kbp_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-kea_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-kea_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d8ce1b502edea3c9f00fe89dbf9dc382010e4bff --- 
/dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-kea_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-kea_Latn +doc_to_target: sentence_kea_Latn +doc_to_text: "As a Kabuverdianu and English linguist, translate the following English\ + \ sentences to Kabuverdianu \nEnglish: {{sentence_eng_Latn}} \nKabuverdianu: " +include: flores +task: flores_eng_Latn-kea_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-kik_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-kik_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fc7975c2b23bb486ead2962f28064a5fcab6102f --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-kik_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-kik_Latn +doc_to_target: sentence_kik_Latn +doc_to_text: "As a Kikuyu and English linguist, translate the following English sentences\ + \ to Kikuyu \nEnglish: {{sentence_eng_Latn}} \nKikuyu: " +include: flores +task: flores_eng_Latn-kik_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-kin_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-kin_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1e2b91d461378cb7c8ff098d237037eefdcacc03 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-kin_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-kin_Latn +doc_to_target: sentence_kin_Latn +doc_to_text: "As a Kinyarwanda and English linguist, translate the following English\ + \ sentences to Kinyarwanda \nEnglish: {{sentence_eng_Latn}} \nKinyarwanda: " +include: flores +task: flores_eng_Latn-kin_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-kmb_Latn.yaml 
b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-kmb_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..270f29b629e6f1f06da31ba154d977b0281fd63b --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-kmb_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-kmb_Latn +doc_to_target: sentence_kmb_Latn +doc_to_text: "As a Kimbundu and English linguist, translate the following English\ + \ sentences to Kimbundu \nEnglish: {{sentence_eng_Latn}} \nKimbundu: " +include: flores +task: flores_eng_Latn-kmb_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-knc_Arab.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-knc_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bd2994d36152fc1dcb4a7a2561cc41982dd6fed1 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-knc_Arab.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: eng_Latn-knc_Arab +doc_to_target: sentence_knc_Arab +doc_to_text: "As a Central Kanuri (Arabic script) and English linguist, translate\ + \ the following English sentences to Central Kanuri (Arabic script) \nEnglish: {{sentence_eng_Latn}}\ + \ \nCentral Kanuri (Arabic script): " +include: flores +task: flores_eng_Latn-knc_Arab_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-knc_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-knc_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..262d0c1f3b8efc51c35e7154f27a9a4e6ed1405f --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-knc_Latn.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: eng_Latn-knc_Latn +doc_to_target: sentence_knc_Latn +doc_to_text: "As a Central Kanuri (Latin script) and English 
linguist, translate the\ + \ following English sentences to Central Kanuri (Latin script) \nEnglish: {{sentence_eng_Latn}}\ + \ \nCentral Kanuri (Latin script): " +include: flores +task: flores_eng_Latn-knc_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-kon_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-kon_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ae9e1201808061f32c0e9d9260b8d2900f7bd7d4 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-kon_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-kon_Latn +doc_to_target: sentence_kon_Latn +doc_to_text: "As a Kikongo and English linguist, translate the following English sentences\ + \ to Kikongo \nEnglish: {{sentence_eng_Latn}} \nKikongo: " +include: flores +task: flores_eng_Latn-kon_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-lin_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-lin_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0945c697c27b39ed91cff296dd162735f0629f4a --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-lin_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-lin_Latn +doc_to_target: sentence_lin_Latn +doc_to_text: "As a Lingala and English linguist, translate the following English sentences\ + \ to Lingala \nEnglish: {{sentence_eng_Latn}} \nLingala: " +include: flores +task: flores_eng_Latn-lin_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-lua_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-lua_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ff92a2cf381a3a4c94a5543901be39a647d24eb7 --- /dev/null +++ 
b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-lua_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-lua_Latn +doc_to_target: sentence_lua_Latn +doc_to_text: "As a Luba-Kasai and English linguist, translate the following English\ + \ sentences to Luba-Kasai \nEnglish: {{sentence_eng_Latn}} \nLuba-Kasai: " +include: flores +task: flores_eng_Latn-lua_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-lug_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-lug_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4dfc626b9fdbcde3de0383b5d512365137da00b5 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-lug_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-lug_Latn +doc_to_target: sentence_lug_Latn +doc_to_text: "As a Luganda and English linguist, translate the following English sentences\ + \ to Luganda \nEnglish: {{sentence_eng_Latn}} \nLuganda: " +include: flores +task: flores_eng_Latn-lug_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-luo_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-luo_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..803ed75d8b732c81f859285f974cd216afb86784 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-luo_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-luo_Latn +doc_to_target: sentence_luo_Latn +doc_to_text: "As a Luo and English linguist, translate the following English sentences\ + \ to Luo \nEnglish: {{sentence_eng_Latn}} \nLuo: " +include: flores +task: flores_eng_Latn-luo_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-mos_Latn.yaml 
b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-mos_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0e959db1653eff6ca0054ec5032144a96c2c5713 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-mos_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-mos_Latn +doc_to_target: sentence_mos_Latn +doc_to_text: "As a Mossi and English linguist, translate the following English sentences\ + \ to Mossi \nEnglish: {{sentence_eng_Latn}} \nMossi: " +include: flores +task: flores_eng_Latn-mos_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-nso_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-nso_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..44839d82cac5af78f74b2f382d36cbd74f93baa6 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-nso_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-nso_Latn +doc_to_target: sentence_nso_Latn +doc_to_text: "As a Northern Sotho and English linguist, translate the following English\ + \ sentences to Northern Sotho \nEnglish: {{sentence_eng_Latn}} \nNorthern Sotho: " +include: flores +task: flores_eng_Latn-nso_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-nus_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-nus_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..387e4341f0761727d3e07a8748291aac574c727f --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-nus_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-nus_Latn +doc_to_target: sentence_nus_Latn +doc_to_text: "As a Nuer and English linguist, translate the following English sentences\ + \ to Nuer \nEnglish: 
{{sentence_eng_Latn}} \nNuer: " +include: flores +task: flores_eng_Latn-nus_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-nya_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-nya_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9311e264e1f7e617a22c52e7ac969b1001f7c5e7 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-nya_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-nya_Latn +doc_to_target: sentence_nya_Latn +doc_to_text: "As a Nyanja and English linguist, translate the following English sentences\ + \ to Nyanja \nEnglish: {{sentence_eng_Latn}} \nNyanja: " +include: flores +task: flores_eng_Latn-nya_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-plt_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-plt_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..afc81158cbed0e268746ee51c1d4e1071f4315e0 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-plt_Latn.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: eng_Latn-plt_Latn +doc_to_target: sentence_plt_Latn +doc_to_text: "As a Plateau Malagasy and English linguist, translate the following\ + \ English sentences to Plateau Malagasy \nEnglish: {{sentence_eng_Latn}} \nPlateau\ + \ Malagasy: " +include: flores +task: flores_eng_Latn-plt_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-run_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-run_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..519700cd32de76f039a6f1d3ce16a4c539278334 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-run_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by 
utils.py +dataset_name: eng_Latn-run_Latn +doc_to_target: sentence_run_Latn +doc_to_text: "As a Rundi and English linguist, translate the following English sentences\ + \ to Rundi \nEnglish: {{sentence_eng_Latn}} \nRundi: " +include: flores +task: flores_eng_Latn-run_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-sag_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-sag_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa99b16137861e1e9fb4f19669dbb71977fd3cc1 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-sag_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-sag_Latn +doc_to_target: sentence_sag_Latn +doc_to_text: "As a Sango and English linguist, translate the following English sentences\ + \ to Sango \nEnglish: {{sentence_eng_Latn}} \nSango: " +include: flores +task: flores_eng_Latn-sag_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-sna_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-sna_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fd7ac49ac5854133223a599569981f4c27d19a21 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-sna_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-sna_Latn +doc_to_target: sentence_sna_Latn +doc_to_text: "As a Shona and English linguist, translate the following English sentences\ + \ to Shona \nEnglish: {{sentence_eng_Latn}} \nShona: " +include: flores +task: flores_eng_Latn-sna_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-som_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-som_Latn.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..17870addf00ed2c4f0c4e126fc094a06aabc8027 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-som_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-som_Latn +doc_to_target: sentence_som_Latn +doc_to_text: "As a Somali and English linguist, translate the following English sentences\ + \ to Somali \nEnglish: {{sentence_eng_Latn}} \nSomali: " +include: flores +task: flores_eng_Latn-som_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-sot_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-sot_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a45cf383057f37f504f799d7cb241ec61274fd83 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-sot_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-sot_Latn +doc_to_target: sentence_sot_Latn +doc_to_text: "As a Southern Sotho and English linguist, translate the following English\ + \ sentences to Southern Sotho \nEnglish: {{sentence_eng_Latn}} \nSouthern Sotho: " +include: flores +task: flores_eng_Latn-sot_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-ssw_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-ssw_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0dbd162772b8938aef4f05da00ac3da0ce3be530 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-ssw_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-ssw_Latn +doc_to_target: sentence_ssw_Latn +doc_to_text: "As a Swati and English linguist, translate the following English sentences\ + \ to Swati \nEnglish: {{sentence_eng_Latn}} \nSwati: " +include: flores +task: flores_eng_Latn-ssw_Latn_prompt_3 diff --git 
a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-sun_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-sun_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0f8f6339450e8af7318e570290370a21037ba98d --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-sun_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-sun_Latn +doc_to_target: sentence_sun_Latn +doc_to_text: "As a Sundanese and English linguist, translate the following English\ + \ sentences to Sundanese \nEnglish: {{sentence_eng_Latn}} \nSundanese: " +include: flores +task: flores_eng_Latn-sun_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-swh_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-swh_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..20971c5383cbcce97b3743262adc35c9d2dfadcf --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-swh_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-swh_Latn +doc_to_target: sentence_swh_Latn +doc_to_text: "As a Swahili and English linguist, translate the following English sentences\ + \ to Swahili \nEnglish: {{sentence_eng_Latn}} \nSwahili: " +include: flores +task: flores_eng_Latn-swh_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-taq_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-taq_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bdb06f77b78b9ca296eb960fc62a63a94e990fd8 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-taq_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-taq_Latn +doc_to_target: sentence_taq_Latn +doc_to_text: "As a Tamasheq and English 
linguist, translate the following English\ + \ sentences to Tamasheq \nEnglish: {{sentence_eng_Latn}} \nTamasheq: " +include: flores +task: flores_eng_Latn-taq_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-taq_Tfng.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-taq_Tfng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d690651ddb21eaf3022475e98dc8f5ddf72f073b --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-taq_Tfng.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: eng_Latn-taq_Tfng +doc_to_target: sentence_taq_Tfng +doc_to_text: "As a Tamasheq (Tifinagh script) and English linguist, translate the\ + \ following English sentences to Tamasheq (Tifinagh script) \nEnglish: {{sentence_eng_Latn}}\ + \ \nTamasheq (Tifinagh script): " +include: flores +task: flores_eng_Latn-taq_Tfng_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-tir_Ethi.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-tir_Ethi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c6b3ba347ab935d9c87a48262a9cafb259985ea0 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-tir_Ethi.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-tir_Ethi +doc_to_target: sentence_tir_Ethi +doc_to_text: "As a Tigrinya and English linguist, translate the following English\ + \ sentences to Tigrinya \nEnglish: {{sentence_eng_Latn}} \nTigrinya: " +include: flores +task: flores_eng_Latn-tir_Ethi_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-tsn_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-tsn_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..845626f5f347436a0fcd2c04fe3446cb806da44e --- /dev/null +++ 
b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-tsn_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-tsn_Latn +doc_to_target: sentence_tsn_Latn +doc_to_text: "As a Setswana and English linguist, translate the following English\ + \ sentences to Setswana \nEnglish: {{sentence_eng_Latn}} \nSetswana: " +include: flores +task: flores_eng_Latn-tsn_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-tso_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-tso_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..958411f89ea39c77cc329ce5f14795761ec1f1a9 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-tso_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-tso_Latn +doc_to_target: sentence_tso_Latn +doc_to_text: "As a Tsonga and English linguist, translate the following English sentences\ + \ to Tsonga \nEnglish: {{sentence_eng_Latn}} \nTsonga: " +include: flores +task: flores_eng_Latn-tso_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-tum_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-tum_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..95e6efa7dbdd8ea987bd680954e69e97742c452d --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-tum_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-tum_Latn +doc_to_target: sentence_tum_Latn +doc_to_text: "As a Tumbuka and English linguist, translate the following English sentences\ + \ to Tumbuka \nEnglish: {{sentence_eng_Latn}} \nTumbuka: " +include: flores +task: flores_eng_Latn-tum_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-twi_Latn.yaml 
b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-twi_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0dcb20543ccff51461bc2af582fcdfaa5855d25f --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-twi_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-twi_Latn +doc_to_target: sentence_twi_Latn +doc_to_text: "As a Twi and English linguist, translate the following English sentences\ + \ to Twi \nEnglish: {{sentence_eng_Latn}} \nTwi: " +include: flores +task: flores_eng_Latn-twi_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-tzm_Tfng.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-tzm_Tfng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..887344c67c0fe6cd1658c8172b55a252bfc9f12b --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-tzm_Tfng.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: eng_Latn-tzm_Tfng +doc_to_target: sentence_tzm_Tfng +doc_to_text: "As a Central Atlas Tamazight and English linguist, translate the following\ + \ English sentences to Central Atlas Tamazight \nEnglish: {{sentence_eng_Latn}}\ + \ \nCentral Atlas Tamazight: " +include: flores +task: flores_eng_Latn-tzm_Tfng_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-umb_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-umb_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b8c4adc0139f6f2d9ce4fcd910e3f558b96df78d --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-umb_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-umb_Latn +doc_to_target: sentence_umb_Latn +doc_to_text: "As a Umbundu and English linguist, translate the following English sentences\ + \ to 
Umbundu \nEnglish: {{sentence_eng_Latn}} \nUmbundu: " +include: flores +task: flores_eng_Latn-umb_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-wol_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-wol_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..66ad25794e67d54123f2a63ffad5e98d12c6ce59 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-wol_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-wol_Latn +doc_to_target: sentence_wol_Latn +doc_to_text: "As a Wolof and English linguist, translate the following English sentences\ + \ to Wolof \nEnglish: {{sentence_eng_Latn}} \nWolof: " +include: flores +task: flores_eng_Latn-wol_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-xho_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-xho_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8cd2fe08ec7bcd000182a7a9f080e739b6d82289 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-xho_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-xho_Latn +doc_to_target: sentence_xho_Latn +doc_to_text: "As a Xhosa and English linguist, translate the following English sentences\ + \ to Xhosa \nEnglish: {{sentence_eng_Latn}} \nXhosa: " +include: flores +task: flores_eng_Latn-xho_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-yor_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-yor_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..09562458138acb1d146e5319816b01e2205351ce --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-yor_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py 
+dataset_name: eng_Latn-yor_Latn +doc_to_target: sentence_yor_Latn +doc_to_text: "As a Yoruba and English linguist, translate the following English sentences\ + \ to Yoruba \nEnglish: {{sentence_eng_Latn}} \nYoruba: " +include: flores +task: flores_eng_Latn-yor_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-zul_Latn.yaml b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-zul_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..15b41952e52b3acfee85294c372ecb954f8b37cc --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/english-african/flores_eng_Latn-zul_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn-zul_Latn +doc_to_target: sentence_zul_Latn +doc_to_text: "As a Zulu and English linguist, translate the following English sentences\ + \ to Zulu \nEnglish: {{sentence_eng_Latn}} \nZulu: " +include: flores +task: flores_eng_Latn-zul_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/flores/prompt_3/flores b/lm_eval/tasks/afrobench/flores/prompt_3/flores new file mode 100644 index 0000000000000000000000000000000000000000..74f9f33eb22662bec79709bd64d8d31f3fb8eae0 --- /dev/null +++ b/lm_eval/tasks/afrobench/flores/prompt_3/flores @@ -0,0 +1,24 @@ +tag: +- flores_tasks +- flores_afr-eng +dataset_path: facebook/flores +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +validation_split: dev +fewshot_split: dev +test_split: devtest +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +repeats: 1 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/injongointent/README.md b/lm_eval/tasks/afrobench/injongointent/README.md new file mode 100644 index 0000000000000000000000000000000000000000..641877cb7c01a5b19791b20c95a246753ddee75a --- /dev/null +++ 
b/lm_eval/tasks/afrobench/injongointent/README.md @@ -0,0 +1,23 @@ +# + +## Paper +Title: `INJONGO: A Multicultural Intent Detection and Slot-filling Dataset for 16 African Languages` + +Paper Link: https://arxiv.org/abs/2502.09814 + +## Abstract +>Slot-filling and intent detection are well-established tasks in Conversational AI. However, current large-scale benchmarks for these tasks often exclude evaluations of low-resource languages and rely on translations from English benchmarks, thereby predominantly reflecting Western-centric concepts. In this paper, we introduce Injongo -- a multicultural, open-source benchmark dataset for 16 African languages with utterances generated by native speakers across diverse domains, including banking, travel, home, and dining. Through extensive experiments, we benchmark the fine-tuning multilingual transformer models and the prompting large language models (LLMs), and show the advantage of leveraging African-cultural utterances over Western-centric utterances for improving cross-lingual transfer from the English language. Experimental results reveal that current LLMs struggle with the slot-filling task, with GPT-4o achieving an average performance of 26 F1-score. In contrast, intent detection performance is notably better, with an average accuracy of 70.6%, though it still falls behind the fine-tuning baselines. Compared to the English language, GPT-4o and fine-tuning baselines perform similarly on intent detection, achieving an accuracy of approximately 81%. Our findings suggest that the performance of LLMs is still behind for many low-resource African languages, and more work is needed to further improve their downstream performance. + +### Citation + +``` +@misc{yu2025injongomulticulturalintentdetection, + title={INJONGO: A Multicultural Intent Detection and Slot-filling Dataset for 16 African Languages}, + author={Hao Yu and Jesujoba O. 
Alabi and Andiswa Bukula and Jian Yun Zhuang and En-Shiun Annie Lee and Tadesse Kebede Guge and Israel Abebe Azime and Happy Buzaaba and Blessing Kudzaishe Sibanda and Godson K. Kalipe and Jonathan Mukiibi and Salomon Kabongo Kabenamualu and Mmasibidi Setaka and Lolwethu Ndolela and Nkiruka Odu and Rooweither Mabuya and Shamsuddeen Hassan Muhammad and Salomey Osei and Sokhar Samb and Juliet W. Murage and Dietrich Klakow and David Ifeoluwa Adelani}, + year={2025}, + eprint={2502.09814}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2502.09814}, +} +``` diff --git a/lm_eval/tasks/afrobench/injongointent/gen_utils.py b/lm_eval/tasks/afrobench/injongointent/gen_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..112041999df20d26a31becc30633720a16457b18 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/gen_utils.py @@ -0,0 +1,159 @@ +import argparse +import os + +import yaml + + +class FunctionTag: + def __init__(self, value): + self.value = value + + +def prompt_func(mode, lang, intent): + prompt_map = { + "prompt_1": "Given the text: '{{text}}', determine the correct intent from the following list: " + f"[{', '.join(intent)}]. Only output one intent from the list.", + "prompt_2": "Analyze the text: '{{text}}'. Choose the most appropriate intent from these options: " + f"[{', '.join(intent)}]. Respond with only the selected intent.", + "prompt_3": "You are a linguistic analyst trained to understand user intent. Based on the text: '{{text}}', " + f"choose the intent that best matches from this list: [{', '.join(intent)}]. Return only the intent.", + "prompt_4": f"You are a {lang} linguistic analyst trained to understand {lang} user intent. Based on the {lang}" + "text: '{{text}}', choose the intent that best matches from this list: " + f"[{', '.join(intent)}]. Return only the intent.", + "prompt_5": f"The following text is in {lang}: '{{{{text}}}}'. 
Given the list of intents: [{', '.join(intent)}], " + "identify the intent expressed in the text. Return only the identified intent.", + } + return prompt_map[mode] + + +def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: + """ + Generate a yaml file for each language. + + :param output_dir: The directory to output the files to. + :param overwrite: Whether to overwrite files if they already exist. + """ + err = [] + languages = { + "amh": "Amharic", + "ewe": "Ewe", + "hau": "Hausa", + "ibo": "Igbo", + "kin": "Kinyarwanda", + "lin": "Lingala", + "lug": "Luganda", + "orm": "Oromo", + "sna": "Shona", + "sot": "Sotho", + "swa": "Swahili", + "twi": "Twi", + "wol": "Wolof", + "xho": "Xhosa", + "yor": "Yoruba", + "zul": "Zulu", + "eng": "English", + } + + intents = [ + "alarm", + "balance", + "bill_balance", + "book_flight", + "book_hotel", + "calendar_update", + "cancel_reservation", + "car_rental", + "confirm_reservation", + "cook_time", + "exchange_rate", + "food_last", + "freeze_account", + "ingredients_list", + "interest_rate", + "international_visa", + "make_call", + "meal_suggestion", + "min_payment", + "pay_bill", + "pin_change", + "play_music", + "plug_type", + "recipe", + "restaurant_reservation", + "restaurant_reviews", + "restaurant_suggestion", + "share_location", + "shopping_list_update", + "spending_history", + "text", + "time", + "timezone", + "transactions", + "transfer", + "translate", + "travel_notification", + "travel_suggestion", + "update_playlist", + "weather", + ] + + for lang in languages.keys(): + try: + file_name = f"injongointent_{lang}.yaml" + task_name = f"injongointent_{lang}_{mode}" + yaml_template = "injongointent" + yaml_details = { + "include": yaml_template, + "task": task_name, + "dataset_name": lang, + "doc_to_text": prompt_func(mode, languages[lang], intents), + } + os.makedirs(f"{output_dir}/{mode}", exist_ok=True) + with open( + f"{output_dir}/{mode}/{file_name}", + "w" if overwrite else "x", + encoding="utf8", 
+ ) as f: + f.write("# Generated by utils.py\n") + yaml.dump( + yaml_details, + f, + allow_unicode=True, + ) + except FileExistsError: + err.append(file_name) + + if len(err) > 0: + raise FileExistsError( + "Files were not created because they already exist (use --overwrite flag):" + f" {', '.join(err)}" + ) + + +def main() -> None: + """Parse CLI args and generate language-specific yaml files.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--overwrite", + default=True, + action="store_true", + help="Overwrite files if they already exist", + ) + parser.add_argument( + "--output-dir", + default="./", + help="Directory to write yaml files to", + ) + parser.add_argument( + "--mode", + default="prompt_3", + choices=["prompt_1", "prompt_2", "prompt_3", "prompt_4", "prompt_5"], + help="Prompt number", + ) + args = parser.parse_args() + + gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite, mode=args.mode) + + +if __name__ == "__main__": + main() diff --git a/lm_eval/tasks/afrobench/injongointent/injongointent.yaml b/lm_eval/tasks/afrobench/injongointent/injongointent.yaml new file mode 100644 index 0000000000000000000000000000000000000000..220f4c514f0afb8ec9105d54c90f834b4fd57780 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/injongointent.yaml @@ -0,0 +1,13 @@ +group: injongointent +task: + - injongointent_prompt_1 + - injongointent_prompt_2 + - injongointent_prompt_3 + - injongointent_prompt_4 + - injongointent_prompt_5 +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 1 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent new file mode 100644 index 0000000000000000000000000000000000000000..a77bc5c95941392779b960df6ad26ebebe5ba96d --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent @@ -0,0 +1,75 @@ +tag: +- injongointent_tasks +- injongointent_prompt_1 
+dataset_path: masakhane/InjongoIntent +dataset_name: null +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_target: intent +doc_to_choice: + - alarm + - balance + - bill_balance + - book_flight + - book_hotel + - calendar_update + - cancel_reservation + - car_rental + - confirm_reservation + - cook_time + - exchange_rate + - food_last + - freeze_account + - ingredients_list + - interest_rate + - international_visa + - make_call + - meal_suggestion + - min_payment + - pay_bill + - pin_change + - play_music + - plug_type + - recipe + - restaurant_reservation + - restaurant_reviews + - restaurant_suggestion + - share_location + - shopping_list_update + - spending_history + - text + - time + - timezone + - transactions + - transfer + - translate + - travel_notification + - travel_suggestion + - update_playlist + - weather +should_decontaminate: true +doc_to_decontamination_query: text +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_amh.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0b3a3ee270683d5cc57a6e6ce81a3fe971f6c04e --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_amh.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: 'Given the text: ''{{text}}'', determine the correct intent from the + following list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, 
car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Only + output one intent from the list.' +include: injongointent +task: injongointent_amh_prompt_1 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_eng.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..240c37d5f1cd4197314c51532ab25ab2e915e2ad --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_eng.yaml @@ -0,0 +1,16 @@ +# Generated by utils.py +dataset_name: eng +validation_split: train +test_split: test +fewshot_split: train +doc_to_text: 'Given the text: ''{{text}}'', determine the correct intent from the + following list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Only + output one intent from the list.' 
+include: injongointent +task: injongointent_eng_prompt_1 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_ewe.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c08d8bb0c151a812b1cd6d5131e4a0d1664f8725 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_ewe.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_text: 'Given the text: ''{{text}}'', determine the correct intent from the + following list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Only + output one intent from the list.' 
+include: injongointent +task: injongointent_ewe_prompt_1 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_hau.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9e1338c72cda04bbbbca713f83dd6aa715f19014 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_hau.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: 'Given the text: ''{{text}}'', determine the correct intent from the + following list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Only + output one intent from the list.' 
+include: injongointent +task: injongointent_hau_prompt_1 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_ibo.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e4a956d23c414332d353f9c7d04ac3ce876831ed --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_ibo.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: 'Given the text: ''{{text}}'', determine the correct intent from the + following list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Only + output one intent from the list.' 
+include: injongointent +task: injongointent_ibo_prompt_1 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_kin.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f55d787a3f4952afcbb3f70d9432bfc7dbf0a84e --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_kin.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: 'Given the text: ''{{text}}'', determine the correct intent from the + following list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Only + output one intent from the list.' 
+include: injongointent +task: injongointent_kin_prompt_1 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_lin.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2cc08df484bf58f9eaf2d498074eb1ac5dc72338 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_lin.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: lin +doc_to_text: 'Given the text: ''{{text}}'', determine the correct intent from the + following list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Only + output one intent from the list.' 
+include: injongointent +task: injongointent_lin_prompt_1 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_lug.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b1a421577bbe8ff9ce34b00da45fdb3efcb22e9d --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_lug.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: 'Given the text: ''{{text}}'', determine the correct intent from the + following list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Only + output one intent from the list.' 
+include: injongointent +task: injongointent_lug_prompt_1 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_orm.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2b95a4e7b9cb3afe8c745cbd62a94c3fad6a5314 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_orm.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: 'Given the text: ''{{text}}'', determine the correct intent from the + following list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Only + output one intent from the list.' 
+include: injongointent +task: injongointent_orm_prompt_1 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_sna.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6cbf0105abe8b3157df4c6898e3873fb25beba28 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_sna.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: 'Given the text: ''{{text}}'', determine the correct intent from the + following list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Only + output one intent from the list.' 
+include: injongointent +task: injongointent_sna_prompt_1 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_sot.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ad3b4497f8dbbb678048dc9dfa8aa8894dd241fe --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_sot.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: sot +doc_to_text: 'Given the text: ''{{text}}'', determine the correct intent from the + following list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Only + output one intent from the list.' 
+include: injongointent +task: injongointent_sot_prompt_1 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_swa.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fc3d797c05c018baf599dcee28bccc0dd5c5ab72 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_swa.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: 'Given the text: ''{{text}}'', determine the correct intent from the + following list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Only + output one intent from the list.' 
+include: injongointent +task: injongointent_swa_prompt_1 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_twi.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..73fc61c7e6ab11fbf71d8842818f020f147b5443 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_twi.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: 'Given the text: ''{{text}}'', determine the correct intent from the + following list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Only + output one intent from the list.' 
+include: injongointent +task: injongointent_twi_prompt_1 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_wol.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7d359d2f8e4cb91028705cc8c76842c9b5e78c3c --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_wol.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: 'Given the text: ''{{text}}'', determine the correct intent from the + following list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Only + output one intent from the list.' 
+include: injongointent +task: injongointent_wol_prompt_1 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_xho.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4d9c173aac2832665724358697d59d3bf8f38e56 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_xho.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: 'Given the text: ''{{text}}'', determine the correct intent from the + following list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Only + output one intent from the list.' 
+include: injongointent +task: injongointent_xho_prompt_1 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_yor.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..682e01c12972c8b9a98e53711b307ebbf62676fa --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_yor.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: 'Given the text: ''{{text}}'', determine the correct intent from the + following list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Only + output one intent from the list.' 
+include: injongointent +task: injongointent_yor_prompt_1 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_zul.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1d38a78141e1912ff995f6d87c0753f317ec6ad0 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_1/injongointent_zul.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: 'Given the text: ''{{text}}'', determine the correct intent from the + following list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Only + output one intent from the list.' 
+include: injongointent +task: injongointent_zul_prompt_1 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_1/utils.py b/lm_eval/tasks/afrobench/injongointent/prompt_1/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_1/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent new file mode 100644 index 0000000000000000000000000000000000000000..dfcb82678a61524c08bfd2d7e2d2ec0a50330f27 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent @@ -0,0 +1,75 @@ +tag: +- injongointent_tasks +- injongointent_prompt_2 +dataset_path: masakhane/InjongoIntent +dataset_name: null +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_target: intent +doc_to_choice: + - alarm + - balance + - bill_balance + - book_flight + - book_hotel + - calendar_update + - cancel_reservation + - car_rental + - confirm_reservation + - cook_time + - exchange_rate + - food_last + - freeze_account + - ingredients_list + - interest_rate + - international_visa + - make_call + - meal_suggestion + - min_payment + - pay_bill + - pin_change + - play_music + - plug_type + - recipe + - restaurant_reservation + - restaurant_reviews + - restaurant_suggestion + - share_location + - shopping_list_update + - spending_history + - text + - time + - timezone + - transactions + - transfer + - translate + - travel_notification + - travel_suggestion + - update_playlist + - weather +should_decontaminate: true +doc_to_decontamination_query: text +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true 
+ regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_amh.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7c1b2189ac768bef8c6263ccc41a003cd00ee6d8 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_amh.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: 'Analyze the text: ''{{text}}''. Choose the most appropriate intent from + these options: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Respond + with only the selected intent.' +include: injongointent +task: injongointent_amh_prompt_2 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_eng.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cc03705d5868b6c67b9973937f4bba59f08bd8c7 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_eng.yaml @@ -0,0 +1,16 @@ +# Generated by utils.py +dataset_name: eng +validation_split: train +test_split: test +fewshot_split: train +doc_to_text: 'Analyze the text: ''{{text}}''. 
Choose the most appropriate intent from + these options: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Respond + with only the selected intent.' +include: injongointent +task: injongointent_eng_prompt_2 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_ewe.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..58eb914472609d29836edb92f5c676deac50166a --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_ewe.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_text: 'Analyze the text: ''{{text}}''. Choose the most appropriate intent from + these options: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Respond + with only the selected intent.' 
+include: injongointent +task: injongointent_ewe_prompt_2 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_hau.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e7745369ac6af719ea71e9f4e3032bcc7b66a8ac --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_hau.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: 'Analyze the text: ''{{text}}''. Choose the most appropriate intent from + these options: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Respond + with only the selected intent.' +include: injongointent +task: injongointent_hau_prompt_2 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_ibo.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b47052d71829c170849a07957a0412b8f21bddb2 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_ibo.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: 'Analyze the text: ''{{text}}''. 
Choose the most appropriate intent from + these options: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Respond + with only the selected intent.' +include: injongointent +task: injongointent_ibo_prompt_2 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_kin.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..457935674bd84bb39e96d8ba000497430963f983 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_kin.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: 'Analyze the text: ''{{text}}''. Choose the most appropriate intent from + these options: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Respond + with only the selected intent.' 
+include: injongointent +task: injongointent_kin_prompt_2 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_lin.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..54e7fcb71bb91ce4272dbd4723b7e2059da71c12 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_lin.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: lin +doc_to_text: 'Analyze the text: ''{{text}}''. Choose the most appropriate intent from + these options: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Respond + with only the selected intent.' +include: injongointent +task: injongointent_lin_prompt_2 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_lug.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..96aa42fc9e2bed859ca386091267ca48b9566399 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_lug.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: 'Analyze the text: ''{{text}}''. 
Choose the most appropriate intent from + these options: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Respond + with only the selected intent.' +include: injongointent +task: injongointent_lug_prompt_2 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_orm.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..872f96c542cd697bef3b0f92294e9247cedb458a --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_orm.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: 'Analyze the text: ''{{text}}''. Choose the most appropriate intent from + these options: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Respond + with only the selected intent.' 
+include: injongointent +task: injongointent_orm_prompt_2 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_sna.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a62dfe34a4611319a56bc06f7bb01c447ac7ad6f --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_sna.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: 'Analyze the text: ''{{text}}''. Choose the most appropriate intent from + these options: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Respond + with only the selected intent.' +include: injongointent +task: injongointent_sna_prompt_2 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_sot.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e9ca6a5675529c7bb3701d332c20eb1c2af19d53 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_sot.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: sot +doc_to_text: 'Analyze the text: ''{{text}}''. 
Choose the most appropriate intent from + these options: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Respond + with only the selected intent.' +include: injongointent +task: injongointent_sot_prompt_2 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_swa.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..339f66ac8d569245f6e8cab2f3456fcea24713d4 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_swa.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: 'Analyze the text: ''{{text}}''. Choose the most appropriate intent from + these options: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Respond + with only the selected intent.' 
+include: injongointent +task: injongointent_swa_prompt_2 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_twi.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b758bce0493c722c0d995cdbf8cc5e4b409e4af9 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_twi.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: 'Analyze the text: ''{{text}}''. Choose the most appropriate intent from + these options: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Respond + with only the selected intent.' +include: injongointent +task: injongointent_twi_prompt_2 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_wol.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9b573a444b22b2e34ed10ee9f75a914f16177bfc --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_wol.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: 'Analyze the text: ''{{text}}''. 
Choose the most appropriate intent from + these options: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Respond + with only the selected intent.' +include: injongointent +task: injongointent_wol_prompt_2 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_xho.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f2c02205fdb7b9701096089c26d6813fff802dd7 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_xho.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: 'Analyze the text: ''{{text}}''. Choose the most appropriate intent from + these options: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Respond + with only the selected intent.' 
+include: injongointent +task: injongointent_xho_prompt_2 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_yor.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c821736809b4a37d94dedc82e6943594585ce35 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_yor.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: 'Analyze the text: ''{{text}}''. Choose the most appropriate intent from + these options: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Respond + with only the selected intent.' +include: injongointent +task: injongointent_yor_prompt_2 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_zul.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c8a541b66bfb14f692c1230871f2482452eb7347 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_2/injongointent_zul.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: 'Analyze the text: ''{{text}}''. 
Choose the most appropriate intent from + these options: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Respond + with only the selected intent.' +include: injongointent +task: injongointent_zul_prompt_2 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_2/utils.py b/lm_eval/tasks/afrobench/injongointent/prompt_2/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_2/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent new file mode 100644 index 0000000000000000000000000000000000000000..afdf43cfc10b75238debbd5dbab36ac493872025 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent @@ -0,0 +1,75 @@ +tag: +- injongointent_tasks +- injongointent_prompt_3 +dataset_path: masakhane/InjongoIntent +dataset_name: null +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_target: intent +doc_to_choice: + - alarm + - balance + - bill_balance + - book_flight + - book_hotel + - calendar_update + - cancel_reservation + - car_rental + - confirm_reservation + - cook_time + - exchange_rate + - food_last + - freeze_account + - ingredients_list + - interest_rate + - international_visa + - 
make_call + - meal_suggestion + - min_payment + - pay_bill + - pin_change + - play_music + - plug_type + - recipe + - restaurant_reservation + - restaurant_reviews + - restaurant_suggestion + - share_location + - shopping_list_update + - spending_history + - text + - time + - timezone + - transactions + - transfer + - translate + - travel_notification + - travel_suggestion + - update_playlist + - weather +should_decontaminate: true +doc_to_decontamination_query: text +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_amh.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7bd62c5b00eedcfb1cd254ae679426955cacb8a0 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_amh.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: 'You are a linguistic analyst trained to understand user intent. 
Based + on the text: ''{{text}}'', choose the intent that best matches from this list: [alarm, + balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather]. Return only the intent.' +include: injongointent +task: injongointent_amh_prompt_3 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_eng.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..258f0cfab4a4dc9ee8f7f8baceb2c2bac35c0c02 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_eng.yaml @@ -0,0 +1,16 @@ +# Generated by utils.py +dataset_name: eng +validation_split: train +test_split: test +fewshot_split: train +doc_to_text: 'You are a linguistic analyst trained to understand user intent. Based + on the text: ''{{text}}'', choose the intent that best matches from this list: [alarm, + balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather]. 
Return only the intent.' +include: injongointent +task: injongointent_eng_prompt_3 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_ewe.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..12688cc9d97dcacc50419a91707ac2f5fa47363b --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_ewe.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_text: 'You are a linguistic analyst trained to understand user intent. Based + on the text: ''{{text}}'', choose the intent that best matches from this list: [alarm, + balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather]. Return only the intent.' +include: injongointent +task: injongointent_ewe_prompt_3 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_hau.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8414a09bae932411fd51cfbcf9d5efbbb2d93a45 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_hau.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: 'You are a linguistic analyst trained to understand user intent. 
Based + on the text: ''{{text}}'', choose the intent that best matches from this list: [alarm, + balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather]. Return only the intent.' +include: injongointent +task: injongointent_hau_prompt_3 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_ibo.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8f254438388e95b551fd9344758851b0a9fc8768 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_ibo.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: 'You are a linguistic analyst trained to understand user intent. Based + on the text: ''{{text}}'', choose the intent that best matches from this list: [alarm, + balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather]. Return only the intent.' 
+include: injongointent +task: injongointent_ibo_prompt_3 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_kin.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b946cf004f775642ad8f0b9d5995820b97187f0a --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_kin.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: 'You are a linguistic analyst trained to understand user intent. Based + on the text: ''{{text}}'', choose the intent that best matches from this list: [alarm, + balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather]. Return only the intent.' +include: injongointent +task: injongointent_kin_prompt_3 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_lin.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4a485d5ce807f0f9200ccdf535d5427697561bbd --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_lin.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: lin +doc_to_text: 'You are a linguistic analyst trained to understand user intent. 
Based + on the text: ''{{text}}'', choose the intent that best matches from this list: [alarm, + balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather]. Return only the intent.' +include: injongointent +task: injongointent_lin_prompt_3 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_lug.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..71376ec3f8a3ae742cc694ea0d19c57ad9187b0b --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_lug.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: 'You are a linguistic analyst trained to understand user intent. Based + on the text: ''{{text}}'', choose the intent that best matches from this list: [alarm, + balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather]. Return only the intent.' 
+include: injongointent +task: injongointent_lug_prompt_3 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_orm.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..706f3a908a221dd3bc876adae4ddc40b6b4cb6ea --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_orm.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: 'You are a linguistic analyst trained to understand user intent. Based + on the text: ''{{text}}'', choose the intent that best matches from this list: [alarm, + balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather]. Return only the intent.' +include: injongointent +task: injongointent_orm_prompt_3 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_sna.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f4aca73782fbcc8516983e588271bb336f674f2b --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_sna.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: 'You are a linguistic analyst trained to understand user intent. 
Based + on the text: ''{{text}}'', choose the intent that best matches from this list: [alarm, + balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather]. Return only the intent.' +include: injongointent +task: injongointent_sna_prompt_3 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_sot.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..57e27afab1edb9509d4f9901bea7fc114f249a09 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_sot.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: sot +doc_to_text: 'You are a linguistic analyst trained to understand user intent. Based + on the text: ''{{text}}'', choose the intent that best matches from this list: [alarm, + balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather]. Return only the intent.' 
+include: injongointent +task: injongointent_sot_prompt_3 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_swa.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6cb4886d1f1d9ba9c87cbc78a436dad83d72487e --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_swa.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: 'You are a linguistic analyst trained to understand user intent. Based + on the text: ''{{text}}'', choose the intent that best matches from this list: [alarm, + balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather]. Return only the intent.' +include: injongointent +task: injongointent_swa_prompt_3 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_twi.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e8623bf33bcc89d2b25390b487617159263327f6 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_twi.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: 'You are a linguistic analyst trained to understand user intent. 
Based + on the text: ''{{text}}'', choose the intent that best matches from this list: [alarm, + balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather]. Return only the intent.' +include: injongointent +task: injongointent_twi_prompt_3 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_wol.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..afc3cf4a907143eb7a13dca4c4783a0c715e1524 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_wol.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: 'You are a linguistic analyst trained to understand user intent. Based + on the text: ''{{text}}'', choose the intent that best matches from this list: [alarm, + balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather]. Return only the intent.' 
+include: injongointent +task: injongointent_wol_prompt_3 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_xho.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9f41aa561fbd6a6da39e9f90d93c4fa909ffbf72 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_xho.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: 'You are a linguistic analyst trained to understand user intent. Based + on the text: ''{{text}}'', choose the intent that best matches from this list: [alarm, + balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather]. Return only the intent.' +include: injongointent +task: injongointent_xho_prompt_3 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_yor.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3a5d5686de20ff32a1a4207ea2711d2176464df5 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_yor.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: 'You are a linguistic analyst trained to understand user intent. 
Based + on the text: ''{{text}}'', choose the intent that best matches from this list: [alarm, + balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather]. Return only the intent.' +include: injongointent +task: injongointent_yor_prompt_3 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_zul.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f857ff065b13b0eb050107b884a7b57f795dc779 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_3/injongointent_zul.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: 'You are a linguistic analyst trained to understand user intent. Based + on the text: ''{{text}}'', choose the intent that best matches from this list: [alarm, + balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather]. Return only the intent.' 
+include: injongointent +task: injongointent_zul_prompt_3 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_3/utils.py b/lm_eval/tasks/afrobench/injongointent/prompt_3/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_3/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent new file mode 100644 index 0000000000000000000000000000000000000000..5d5c05ae113bdcef17764decefc59f335ddb3ba3 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent @@ -0,0 +1,75 @@ +tag: +- injongointent_tasks +- injongointent_prompt_4 +dataset_path: masakhane/InjongoIntent +dataset_name: null +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_target: intent +doc_to_choice: + - alarm + - balance + - bill_balance + - book_flight + - book_hotel + - calendar_update + - cancel_reservation + - car_rental + - confirm_reservation + - cook_time + - exchange_rate + - food_last + - freeze_account + - ingredients_list + - interest_rate + - international_visa + - make_call + - meal_suggestion + - min_payment + - pay_bill + - pin_change + - play_music + - plug_type + - recipe + - restaurant_reservation + - restaurant_reviews + - restaurant_suggestion + - share_location + - shopping_list_update + - spending_history + - text + - time + - timezone + - transactions + - transfer + - translate + - travel_notification + - travel_suggestion + - update_playlist + - weather +should_decontaminate: true +doc_to_decontamination_query: text +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true 
+ regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_amh.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aa14ee5b178f7577c036039b089678bcfa697a04 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_amh.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: 'You are an Amharic linguistic analyst trained to understand Amharic user + intent. Based on the Amharic text: ''{{text}}'', choose the intent that best matches + from this list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Return + only the intent.' 
+include: injongointent +task: injongointent_amh_prompt_4 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_eng.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..853e64965251e37e59efe72554557b2a378e358f --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_eng.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: eng +validation_split: train +test_split: test +fewshot_split: train +doc_to_text: 'You are an English linguistic analyst trained to understand English user + intent. Based on the English text: ''{{text}}'', choose the intent that best matches + from this list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Return + only the intent.' +include: injongointent +task: injongointent_eng_prompt_4 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_ewe.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f61a3db57d2bcf2af8cf30dc65ac08b896013fbe --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_ewe.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_text: 'You are an Ewe linguistic analyst trained to understand Ewe user intent. 
+ Based on the Ewe text: ''{{text}}'', choose the intent that best matches from this + list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather]. Return only the intent.' +include: injongointent +task: injongointent_ewe_prompt_4 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_hau.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fdef34cb847fbc8008ed496826bc2cb79361a546 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_hau.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: 'You are a Hausa linguistic analyst trained to understand Hausa user + intent. Based on the Hausa text: ''{{text}}'', choose the intent that best matches + from this list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Return + only the intent.' 
+include: injongointent +task: injongointent_hau_prompt_4 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_ibo.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..23b59831ed97b56ed983d26871d155b1f72a176b --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_ibo.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: 'You are an Igbo linguistic analyst trained to understand Igbo user intent. + Based on the Igbo text: ''{{text}}'', choose the intent that best matches from this + list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather]. Return only the intent.' +include: injongointent +task: injongointent_ibo_prompt_4 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_kin.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..28f05aeb00423bb80f2f070763f3f22345ae4776 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_kin.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: 'You are a Kinyarwanda linguistic analyst trained to understand Kinyarwanda + user intent. 
Based on the Kinyarwandatext: ''{{text}}'', choose the intent that + best matches from this list: [alarm, balance, bill_balance, book_flight, book_hotel, + calendar_update, cancel_reservation, car_rental, confirm_reservation, cook_time, + exchange_rate, food_last, freeze_account, ingredients_list, interest_rate, international_visa, + make_call, meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, + recipe, restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Return + only the intent.' +include: injongointent +task: injongointent_kin_prompt_4 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_lin.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..df991d89146112468bae83c7c5fe87eef307dc49 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_lin.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: lin +doc_to_text: 'You are a Lingala linguistic analyst trained to understand Lingala user + intent. Based on the Lingalatext: ''{{text}}'', choose the intent that best matches + from this list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Return + only the intent.' 
+include: injongointent +task: injongointent_lin_prompt_4 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_lug.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c1abb66edb31a6a76e987e665f91f630fe8d3416 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_lug.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: 'You are a Luganda linguistic analyst trained to understand Luganda user + intent. Based on the Luganda text: ''{{text}}'', choose the intent that best matches + from this list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Return + only the intent.' +include: injongointent +task: injongointent_lug_prompt_4 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_orm.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..195ff4a232e782d38bae93b33648535934ff07e4 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_orm.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: 'You are an Oromo linguistic analyst trained to understand Oromo user + intent. 
Based on the Oromotext: ''{{text}}'', choose the intent that best matches + from this list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Return + only the intent.' +include: injongointent +task: injongointent_orm_prompt_4 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_sna.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..23d066c3d8c8b41e7184f867ba492d58a8736d82 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_sna.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: 'You are a Shona linguistic analyst trained to understand Shona user + intent. Based on the Shonatext: ''{{text}}'', choose the intent that best matches + from this list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Return + only the intent.' 
+include: injongointent +task: injongointent_sna_prompt_4 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_sot.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..82102a21e2e7bb0a3d57a6fff4dd678b587d1d95 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_sot.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: sot +doc_to_text: 'You are a Sotho linguistic analyst trained to understand Sotho user + intent. Based on the Sothotext: ''{{text}}'', choose the intent that best matches + from this list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Return + only the intent.' +include: injongointent +task: injongointent_sot_prompt_4 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_swa.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..031ffbb40ceba3233e5f396e38a523f2bca81ad9 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_swa.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: 'You are a Swahili linguistic analyst trained to understand Swahili user + intent. 
Based on the Swahili text: ''{{text}}'', choose the intent that best matches + from this list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Return + only the intent.' +include: injongointent +task: injongointent_swa_prompt_4 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_twi.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a569b3cec8f6858d041e9d0c0c876720558b808b --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_twi.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: 'You are a Twi linguistic analyst trained to understand Twi user intent. + Based on the Twi text: ''{{text}}'', choose the intent that best matches from this + list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather]. Return only the intent.' 
+include: injongointent +task: injongointent_twi_prompt_4 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_wol.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a55398ab4161ec54e971863cd1b4bfb41330b093 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_wol.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: 'You are a Wolof linguistic analyst trained to understand Wolof user + intent. Based on the Woloftext: ''{{text}}'', choose the intent that best matches + from this list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Return + only the intent.' +include: injongointent +task: injongointent_wol_prompt_4 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_xho.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d773a1756e17eb5cd21b3bc550b5847acaae671c --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_xho.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: 'You are a Xhosa linguistic analyst trained to understand Xhosa user + intent. 
Based on the Xhosatext: ''{{text}}'', choose the intent that best matches + from this list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Return + only the intent.' +include: injongointent +task: injongointent_xho_prompt_4 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_yor.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..af01d9f3e8efc1d57003194dd0201dd76bd76fbf --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_yor.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: 'You are a Yoruba linguistic analyst trained to understand Yoruba user + intent. Based on the Yorubatext: ''{{text}}'', choose the intent that best matches + from this list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather]. Return + only the intent.' 
+include: injongointent +task: injongointent_yor_prompt_4 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_zul.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3b6e5aace303095eccb19b8a011559555cee7eb2 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_4/injongointent_zul.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: 'You are a Zulu linguistic analyst trained to understand Zulu user intent. + Based on the Zulu text: ''{{text}}'', choose the intent that best matches from this + list: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather]. Return only the intent.' 
+include: injongointent +task: injongointent_zul_prompt_4 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_4/utils.py b/lm_eval/tasks/afrobench/injongointent/prompt_4/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_4/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent new file mode 100644 index 0000000000000000000000000000000000000000..0012857bdaa787ad8bf9ba345330844c8b266a8e --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent @@ -0,0 +1,75 @@ +tag: +- injongointent_tasks +- injongointent_prompt_5 +dataset_path: masakhane/InjongoIntent +dataset_name: null +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_target: intent +doc_to_choice: + - alarm + - balance + - bill_balance + - book_flight + - book_hotel + - calendar_update + - cancel_reservation + - car_rental + - confirm_reservation + - cook_time + - exchange_rate + - food_last + - freeze_account + - ingredients_list + - interest_rate + - international_visa + - make_call + - meal_suggestion + - min_payment + - pay_bill + - pin_change + - play_music + - plug_type + - recipe + - restaurant_reservation + - restaurant_reviews + - restaurant_suggestion + - share_location + - shopping_list_update + - spending_history + - text + - time + - timezone + - transactions + - transfer + - translate + - travel_notification + - travel_suggestion + - update_playlist + - weather +should_decontaminate: true +doc_to_decontamination_query: text +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true 
+ regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_amh.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a6623a9387a98d2d21791f51d0a8690b756c9a4 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_amh.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: 'The following text is in Amharic: ''{{text}}''. Given the list of intents: + [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather], identify the intent expressed in the + text. Return only the identified intent.' +include: injongointent +task: injongointent_amh_prompt_5 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_eng.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4dcbebbbd00cf8a3abb7f5e848d3a6b8520d46a3 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_eng.yaml @@ -0,0 +1,16 @@ +# Generated by utils.py +dataset_name: eng +validation_split: train +test_split: test +fewshot_split: train +doc_to_text: 'The following text is in English: ''{{text}}''. 
Given the list of intents: + [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather], identify the intent expressed in the + text. Return only the identified intent.' +include: injongointent +task: injongointent_eng_prompt_5 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_ewe.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0cab84252dcbfdc4d1eefcd2f4aa36b031edd7ba --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_ewe.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_text: 'The following text is in Ewe: ''{{text}}''. Given the list of intents: + [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather], identify the intent expressed in the + text. Return only the identified intent.' 
+include: injongointent +task: injongointent_ewe_prompt_5 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_hau.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b6275db8383eddada828f7cb1963d554b4f8d658 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_hau.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: 'The following text is in Hausa: ''{{text}}''. Given the list of intents: + [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather], identify the intent expressed in the + text. Return only the identified intent.' +include: injongointent +task: injongointent_hau_prompt_5 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_ibo.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..518ec898411de5d141ac16137578e710fc7bb60e --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_ibo.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: 'The following text is in Igbo: ''{{text}}''. 
Given the list of intents: + [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather], identify the intent expressed in the + text. Return only the identified intent.' +include: injongointent +task: injongointent_ibo_prompt_5 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_kin.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..348535c679af3c08da162c812fbfd700557e8326 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_kin.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: 'The following text is in Kinyarwanda: ''{{text}}''. Given the list of + intents: [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, + cancel_reservation, car_rental, confirm_reservation, cook_time, exchange_rate, food_last, + freeze_account, ingredients_list, interest_rate, international_visa, make_call, + meal_suggestion, min_payment, pay_bill, pin_change, play_music, plug_type, recipe, + restaurant_reservation, restaurant_reviews, restaurant_suggestion, share_location, + shopping_list_update, spending_history, text, time, timezone, transactions, transfer, + translate, travel_notification, travel_suggestion, update_playlist, weather], identify + the intent expressed in the text. Return only the identified intent.' 
+include: injongointent +task: injongointent_kin_prompt_5 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_lin.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..75bbf4ec5935505c554d5c7b58934188d1591532 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_lin.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: lin +doc_to_text: 'The following text is in Lingala: ''{{text}}''. Given the list of intents: + [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather], identify the intent expressed in the + text. Return only the identified intent.' +include: injongointent +task: injongointent_lin_prompt_5 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_lug.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..49b7f6faddd5ad59c079de1a1986e33ef4a29311 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_lug.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: 'The following text is in Luganda: ''{{text}}''. 
Given the list of intents: + [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather], identify the intent expressed in the + text. Return only the identified intent.' +include: injongointent +task: injongointent_lug_prompt_5 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_orm.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..72a7686934892b2f21f3098b302dc601050b394b --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_orm.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: 'The following text is in Oromo: ''{{text}}''. Given the list of intents: + [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather], identify the intent expressed in the + text. Return only the identified intent.' 
+include: injongointent +task: injongointent_orm_prompt_5 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_sna.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8931b65ce15999ea1215cf420a671baed32a51c2 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_sna.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: 'The following text is in Shona: ''{{text}}''. Given the list of intents: + [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather], identify the intent expressed in the + text. Return only the identified intent.' +include: injongointent +task: injongointent_sna_prompt_5 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_sot.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5a8d0328e75acaead01f4e08fb4c6ed26eed6314 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_sot.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: sot +doc_to_text: 'The following text is in Sotho: ''{{text}}''. 
Given the list of intents: + [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather], identify the intent expressed in the + text. Return only the identified intent.' +include: injongointent +task: injongointent_sot_prompt_5 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_swa.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1da6be32a6c64ec6583c70a2f0cbaa3d6aa3d435 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_swa.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: 'The following text is in Swahili: ''{{text}}''. Given the list of intents: + [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather], identify the intent expressed in the + text. Return only the identified intent.' 
+include: injongointent +task: injongointent_swa_prompt_5 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_twi.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cc78ae4f60d64d9d30a5bf52064fda54e0bce9ef --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_twi.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: 'The following text is in Twi: ''{{text}}''. Given the list of intents: + [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather], identify the intent expressed in the + text. Return only the identified intent.' +include: injongointent +task: injongointent_twi_prompt_5 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_wol.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9c71483ec36660b62bc945b4ebf89fdb77034e85 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_wol.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: 'The following text is in Wolof: ''{{text}}''. 
Given the list of intents: + [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather], identify the intent expressed in the + text. Return only the identified intent.' +include: injongointent +task: injongointent_wol_prompt_5 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_xho.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8d8b543fc5cac462b31a9ae806aa3ce0f60b6ba9 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_xho.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: 'The following text is in Xhosa: ''{{text}}''. Given the list of intents: + [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather], identify the intent expressed in the + text. Return only the identified intent.' 
+include: injongointent +task: injongointent_xho_prompt_5 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_yor.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cbe285688f78a2ec210740e0294e4a3c6fde8dd7 --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_yor.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: 'The following text is in Yoruba: ''{{text}}''. Given the list of intents: + [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather], identify the intent expressed in the + text. Return only the identified intent.' +include: injongointent +task: injongointent_yor_prompt_5 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_zul.yaml b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7ba384db5a8b2f7f17601ad7fff0bed8327ecc1c --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_5/injongointent_zul.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: 'The following text is in Zulu: ''{{text}}''. 
Given the list of intents: + [alarm, balance, bill_balance, book_flight, book_hotel, calendar_update, cancel_reservation, + car_rental, confirm_reservation, cook_time, exchange_rate, food_last, freeze_account, + ingredients_list, interest_rate, international_visa, make_call, meal_suggestion, + min_payment, pay_bill, pin_change, play_music, plug_type, recipe, restaurant_reservation, + restaurant_reviews, restaurant_suggestion, share_location, shopping_list_update, + spending_history, text, time, timezone, transactions, transfer, translate, travel_notification, + travel_suggestion, update_playlist, weather], identify the intent expressed in the + text. Return only the identified intent.' +include: injongointent +task: injongointent_zul_prompt_5 diff --git a/lm_eval/tasks/afrobench/injongointent/prompt_5/utils.py b/lm_eval/tasks/afrobench/injongointent/prompt_5/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/injongointent/prompt_5/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/mafand/README.md b/lm_eval/tasks/afrobench/mafand/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9e7eea17598d29defff07bb37c4f47efaa446547 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/README.md @@ -0,0 +1,73 @@ +# + +## Paper +Title: `A Few Thousand Translations Go a Long Way! Leveraging Pre-trained Models for African News Translation` + +Paper Link: https://aclanthology.org/2022.naacl-main.223/ + +## Abstract +>Recent advances in the pre-training of language models leverage large-scale datasets to create multilingual models. However, low-resource languages are mostly left out in these datasets. This is primarily because many widely spoken languages are not well represented on the web and therefore excluded from the large-scale crawls used to create datasets. 
Furthermore, downstream users of these models are restricted to the selection of languages originally chosen for pre-training. This work investigates how to optimally leverage existing pre-trained models to create low-resource translation systems for 16 African languages. We focus on two questions: 1) How can pre-trained models be used for languages not included in the initial pre-training? and 2) How can the resulting translation models effectively transfer to new domains? To answer these questions, we create a new African news corpus covering 16 languages, of which eight languages are not part of any existing evaluation dataset. We demonstrate that the most effective strategy for transferring both to additional languages and to additional domains is to fine-tune large pre-trained models on small quantities of high-quality translation data. + +HomePage: https://github.com/masakhane-io/lafand-mt + +### Citation + +``` +@inproceedings{adelani-etal-2022-thousand, + title = "A Few Thousand Translations Go a Long Way! Leveraging Pre-trained Models for {A}frican News Translation", + author = "Adelani, David and + Alabi, Jesujoba and + Fan, Angela and + Kreutzer, Julia and + Shen, Xiaoyu and + Reid, Machel and + Ruiter, Dana and + Klakow, Dietrich and + Nabende, Peter and + Chang, Ernie and + Gwadabe, Tajuddeen and + Sackey, Freshia and + Dossou, Bonaventure F. P. 
and + Emezue, Chris and + Leong, Colin and + Beukman, Michael and + Muhammad, Shamsuddeen and + Jarso, Guyo and + Yousuf, Oreen and + Niyongabo Rubungo, Andre and + Hacheme, Gilles and + Wairagala, Eric Peter and + Nasir, Muhammad Umair and + Ajibade, Benjamin and + Ajayi, Tunde and + Gitau, Yvonne and + Abbott, Jade and + Ahmed, Mohamed and + Ochieng, Millicent and + Aremu, Anuoluwapo and + Ogayo, Perez and + Mukiibi, Jonathan and + Ouoba Kabore, Fatoumata and + Kalipe, Godson and + Mbaye, Derguene and + Tapo, Allahsera Auguste and + Memdjokam Koagne, Victoire and + Munkoh-Buabeng, Edwin and + Wagner, Valencia and + Abdulmumin, Idris and + Awokoya, Ayodele and + Buzaaba, Happy and + Sibanda, Blessing and + Bukula, Andiswa and + Manthalu, Sam", + booktitle = "Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies", + month = jul, + year = "2022", + address = "Seattle, United States", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.naacl-main.223", + doi = "10.18653/v1/2022.naacl-main.223", + pages = "3053--3070", + abstract = "Recent advances in the pre-training for language models leverage large-scale datasets to create multilingual models. However, low-resource languages are mostly left out in these datasets. This is primarily because many widely spoken languages that are not well represented on the web and therefore excluded from the large-scale crawls for datasets. Furthermore, downstream users of these models are restricted to the selection of languages originally chosen for pre-training. This work investigates how to optimally leverage existing pre-trained models to create low-resource translation systems for 16 African languages. We focus on two questions: 1) How can pre-trained models be used for languages not included in the initial pretraining? 
and 2) How can the resulting translation models effectively transfer to new domains? To answer these questions, we create a novel African news corpus covering 16 languages, of which eight languages are not part of any existing evaluation dataset. We demonstrate that the most effective strategy for transferring both additional languages and additional domains is to leverage small quantities of high-quality translation data to fine-tune large pre-trained models.", +} +``` diff --git a/lm_eval/tasks/afrobench/mafand/gen_utils.py b/lm_eval/tasks/afrobench/mafand/gen_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c260a321a419b3013545b738850af5f796a1bc32 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/gen_utils.py @@ -0,0 +1,147 @@ +import argparse +import os + +import yaml + + +class FunctionTag: + def __init__(self, value): + self.value = value + + +def prompt_func(mode, lang, lang_dict): + language_column_name = f"{lang}_text" + prompt_map = { + "prompt_1": "You are an advanced Translator, a specialized assistant designed to translate documents from " + f"{lang_dict[lang]} into English. \nYour main goal is to ensure translations are grammatically " + f"correct and human-oriented. \n{lang_dict[lang]}: {{{{{language_column_name}}}}} \nEnglish: ", + "prompt_1_reverse": "You are an advanced Translator, a specialized assistant designed to translate documents " + f"from English into {lang_dict[lang]}. \nYour main goal is to ensure translations are " + f"grammatically correct and human-oriented. " + f"\nEnglish: {{eng_text}} \n{lang_dict[lang]}: ", + "prompt_2": f"{lang_dict[lang]} sentence: {{{{{language_column_name}}}}} \nEnglish sentence: ", + "prompt_2_reverse": "English sentence: {{eng_text}} " + f"\n{lang_dict[lang]} sentence: ", + "prompt_3": f"You are a translation expert. 
Translate the following {lang_dict[lang]} sentences to English \n" + f"{lang_dict[lang]} sentence: {{{{{language_column_name}}}}}\nEnglish sentence: ", + "prompt_3_reverse": f"You are a translation expert. Translate the following English sentences to " + f"{lang_dict[lang]} " + "\nEnglish sentence: {{eng_text}} " + f"\n{lang_dict[lang]} sentence: ", + } + return prompt_map[mode] + + +def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str, reverse: bool) -> None: + """ + Generate a yaml file for each language. + + :param output_dir: The directory to output the files to. + :param overwrite: Whether to overwrite files if they already exist. + """ + err = [] + languages = { + "amh": "Amharic", + "bam": "Bambara", + "bbj": "Gbomala", + "ewe": "Ewe", + "fon": "Fon", + "hau": "Hausa", + "ibo": "Igbo", + "kin": "Kinyarwanda", + "lug": "Luganda", + "luo": "Luo", + "mos": "Mossi", + "nya": "Chichewa", + "pcm": "Nigerian Pidgin", + "sna": "Shona", + "swa": "Swahili", + "tsn": "Setswana", + "twi": "Twi", + "wol": "Wolof", + "xho": "Xhosa", + "yor": "Yoruba", + "zul": "Zulu", + } + + french_langs = ["bam", "bbj", "ewe", "fon", "wol", "mos"] + + for lang in languages.keys(): + try: + norm_lang = f"{lang}-en" if lang not in french_langs else f"{lang}-fr" + reverse_lang = f"en-{lang}" if lang not in french_langs else f"fr-{lang}" + dataset_name = norm_lang if reverse else reverse_lang + file_name = f"mafand_{dataset_name}.yaml" + task_name = f"mafand_{dataset_name}_{mode}" + yaml_template = "mafand" + yaml_details = { + "include": yaml_template, + "task": task_name, + "dataset_name": reverse_lang, + } + file_dir = ( + f"{output_dir}/{mode}/african-english" + if reverse + else f"{output_dir}/{mode}/english-african" + ) + os.makedirs(file_dir, exist_ok=True) + with open( + f"{file_dir}/{file_name}", + "w" if overwrite else "x", + encoding="utf8", + ) as f: + f.write("# Generated by utils.py\n") + yaml.dump( + yaml_details, + f, + allow_unicode=True, + ) + + except 
FileExistsError: + err.append(file_name) + + if len(err) > 0: + raise FileExistsError( + "Files were not created because they already exist (use --overwrite flag):" + f" {', '.join(err)}" + ) + + +def main() -> None: + """Parse CLI args and generate language-specific yaml files.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--overwrite", + default=True, + action="store_true", + help="Overwrite files if they already exist", + ) + parser.add_argument( + "--output-dir", + default="./", + help="Directory to write yaml files to", + ) + parser.add_argument( + "--mode", + default="prompt_3", + choices=["prompt_1", "prompt_2", "prompt_3"], + help="Prompt number", + ) + parser.add_argument( + "--reverse", + default=True, + choices=[True, False], + help="Reverse the translation direction", + ) + args = parser.parse_args() + + gen_lang_yamls( + output_dir=args.output_dir, + overwrite=args.overwrite, + mode=args.mode, + reverse=args.reverse, + ) + + +if __name__ == "__main__": + main() diff --git a/lm_eval/tasks/afrobench/mafand/mafand.yaml b/lm_eval/tasks/afrobench/mafand/mafand.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ef8619addf5df6be4d59e31540c6ed6663e6b189 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/mafand.yaml @@ -0,0 +1,14 @@ +group: mafand +task: + - mafand_eng-afr_prompt_1 + - mafand_eng-afr_prompt_2 + - mafand_eng-afr_prompt_3 + - mafand_afr-eng_prompt_1 + - mafand_afr-eng_prompt_2 + - mafand_afr-eng_prompt_3 +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand new file mode 100644 index 0000000000000000000000000000000000000000..4f2047be0877cfda1e41acc76e491478c1f8f8f0 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand @@ -0,0 +1,28 @@ +tag: +- mafand_tasks +- mafand_afr-eng +- 
mafand_afr-eng_prompt_1 +- afrobench_MT_tasks +dataset_path: masakhane/mafand +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +validation_split: validation +fewshot_split: validation +test_split: test +doc_to_target: !function utils.get_target +doc_to_text: !function utils.create_text_prompt_1 +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +repeats: 1 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_amh-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_amh-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..95e87fd8aeb3df35fd529338e719683805a78f18 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_amh-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-amh +include: mafand +task: mafand_amh-en_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_bam-fr.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_bam-fr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dbc612ac327d46f46c4df459d558c8429d2089dd --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_bam-fr.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-bam +include: mafand +task: mafand_bam-fr_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_bbj-fr.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_bbj-fr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..abe64f9b2a1dbf14bcf60f9f4f80df24f65821ed --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_bbj-fr.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-bbj +include: mafand +task: 
mafand_bbj-fr_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_ewe-fr.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_ewe-fr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ecd9b38bcdfda7354569bc002e3ed10ff573449f --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_ewe-fr.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-ewe +include: mafand +task: mafand_ewe-fr_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_fon-fr.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_fon-fr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..705cfbb855261a2cc14d841bde89661fa5d6be75 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_fon-fr.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-fon +include: mafand +task: mafand_fon-fr_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_hau-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_hau-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3b84d9cecabfd92350a9ab63585f38fcfff328d7 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_hau-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-hau +include: mafand +task: mafand_hau-en_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_ibo-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_ibo-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d78c91bb1fc61797c9554b9cf8acb7c099e53919 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_ibo-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-ibo +include: mafand +task: mafand_ibo-en_prompt_1 diff --git 
a/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_kin-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_kin-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..954c036e80de22ad705888d66705e58b2a15f689 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_kin-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-kin +include: mafand +task: mafand_kin-en_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_lug-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_lug-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..671c072adc79858fc9eb1f2c32b99420b26fcb95 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_lug-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-lug +include: mafand +task: mafand_lug-en_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_luo-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_luo-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a1d5965f08d62f2a8147a39878f844d714800ade --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_luo-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-luo +include: mafand +task: mafand_luo-en_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_mos-fr.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_mos-fr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..da085707bc8ec37730535867ad2faa80e23bfa20 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_mos-fr.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-mos +include: mafand +task: mafand_mos-fr_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_nya-en.yaml 
b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_nya-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3bc2426687b5784602b1bc1bcb64c453d79d39ab --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_nya-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-nya +include: mafand +task: mafand_nya-en_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_pcm-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_pcm-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f6fb5dee6d393fbe6a6172e347e382cee2aecfb5 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_pcm-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-pcm +include: mafand +task: mafand_pcm-en_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_sna-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_sna-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..283517d6ac4f9f4c8e462a63002d11a681c438b3 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_sna-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-sna +include: mafand +task: mafand_sna-en_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_swa-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_swa-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..476bba42d82461b241b2b8a396baefe20c154206 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_swa-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-swa +include: mafand +task: mafand_swa-en_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_tsn-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_tsn-en.yaml 
new file mode 100644 index 0000000000000000000000000000000000000000..a94c5b6ea0f870db19b4fc3c692be95a3fc6d455 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_tsn-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-tsn +include: mafand +task: mafand_tsn-en_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_twi-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_twi-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7f5883b12d6875133699a8158dd2151c8c784981 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_twi-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-twi +include: mafand +task: mafand_twi-en_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_wol-fr.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_wol-fr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bb887188000da8e83aeed287619ed1cd2375e18a --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_wol-fr.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-wol +include: mafand +task: mafand_wol-fr_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_xho-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_xho-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a0561b4157c15f0fa5f8f3876ab86d07087a5a0a --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_xho-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-xho +include: mafand +task: mafand_xho-en_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_yor-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_yor-en.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..ec97ae7d3c4abc88a4774dc69cb32e2d7dced32d --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_yor-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-yor +include: mafand +task: mafand_yor-en_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_zul-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_zul-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9649d772d259a5519e87ae522f80b3d591f1b2be --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/mafand_zul-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-zul +include: mafand +task: mafand_zul-en_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/utils.py b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0df3a329824d44fa94eb830ae943fa30dd32bab7 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/african-english/utils.py @@ -0,0 +1,121 @@ +languages = { + "amh": "Amharic", + "bam": "Bambara", + "bbj": "Gbomala", + "ewe": "Ewe", + "fon": "Fon", + "hau": "Hausa", + "ibo": "Igbo", + "kin": "Kinyarwanda", + "lug": "Luganda", + "luo": "Luo", + "mos": "Mossi", + "nya": "Chichewa", + "pcm": "Nigerian Pidgin", + "sna": "Shona", + "swa": "Swahili", + "tsn": "Setswana", + "twi": "Twi", + "wol": "Wolof", + "xho": "Xhosa", + "yor": "Yoruba", + "zul": "Zulu", +} + + +def get_target(doc): + target = ( + doc["translation"]["en"] + if "en" in doc["translation"].keys() + else doc["translation"]["fr"] + ) + return target + + +def get_target_reverse(doc): + target_key = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + target = doc["translation"][target_key] + return target + + +def create_text_prompt_1(doc): + source_key = [key for key in doc["translation"].keys() if key not in 
["en", "fr"]][ + 0 + ] + source_sentence = doc["translation"][source_key] + source_lang = "English" if "en" in doc["translation"].keys() else "French" + prompt = ( + "You are an advanced Translator, a specialized assistant designed to translate documents from " + f"{languages[source_key]} into {source_lang}. \nYour main goal is to ensure translations are grammatically " + f"correct and human-oriented. \n{languages[source_key]}: {source_sentence} \n{source_lang}: " + ) + return prompt + + +def create_reverse_prompt_1(doc): + target_lang = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_key = "en" if "en" in doc["translation"].keys() else "fr" + source_lang = "English" if source_key == "en" else "French" + source_sentence = doc["translation"][source_key] + prompt = ( + "You are an advanced Translator, a specialized assistant designed to translate documents from " + f"{source_lang} into {languages[target_lang]}. \nYour main goal is to ensure translations are " + f"grammatically correct and human-oriented. 
\n{source_lang}: {source_sentence} \n{languages[target_lang]}: " + ) + return prompt + + +def create_text_prompt_2(doc): + source_key = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_sentence = doc["translation"][source_key] + source_lang = "English" if "en" in doc["translation"].keys() else "French" + prompt = ( + f"{languages[source_key]} sentence: {source_sentence} \n{source_lang} sentence: ", + ) + return prompt + + +def create_reverse_prompt_2(doc): + target_lang = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_key = "en" if "en" in doc["translation"].keys() else "fr" + source_lang = "English" if source_key == "en" else "French" + source_sentence = doc["translation"][source_key] + prompt = ( + f"{source_lang} sentence: {source_sentence} \n{languages[target_lang]} sentence: \n", + ) + return prompt + + +def create_text_prompt_3(doc): + source_key = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_sentence = doc["translation"][source_key] + source_lang = "English" if "en" in doc["translation"].keys() else "French" + prompt = ( + f"You are a translation expert. Translate the following {languages[source_key]} sentences " + f"to {source_lang}. \n{languages[source_key]} sentence: {source_sentence}\n{source_lang} sentence: " + ) + return prompt + + +def create_reverse_prompt_3(doc): + target_lang = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_key = "en" if "en" in doc["translation"].keys() else "fr" + source_lang = "English" if source_key == "en" else "French" + source_sentence = doc["translation"][source_key] + prompt = ( + f"You are a translation expert. 
Translate the following {source_lang} sentence into {languages[target_lang]}\n" + f"{source_lang} sentence: {source_sentence}\n{languages[target_lang]} sentence: " + ) + return prompt diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand new file mode 100644 index 0000000000000000000000000000000000000000..1d004556267924e0120e95fb516fee72a5d3eb1d --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand @@ -0,0 +1,28 @@ +tag: +- mafand_tasks +- mafand_eng-afr +- mafand_eng-afr_prompt_1 +- afrobench_MT_tasks +dataset_path: masakhane/mafand +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +validation_split: validation +fewshot_split: validation +test_split: test +doc_to_target: !function utils.get_target_reverse +doc_to_text: !function utils.create_reverse_prompt_1 +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +repeats: 1 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-amh.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8ef9ab288615c42f1c4c04538f0b51e3d2245ac1 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-amh +include: mafand +task: mafand_en-amh_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-hau.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0ea577781deda41a7217dbd757158789a80dafc6 --- /dev/null +++ 
b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-hau +include: mafand +task: mafand_en-hau_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-ibo.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..88af221fb31e583d9238c95c05dffb44306a617e --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-ibo +include: mafand +task: mafand_en-ibo_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-kin.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0c415f511ad246eb99509f4bedb979810dfcf20d --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-kin +include: mafand +task: mafand_en-kin_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-lug.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..94070e8986b0f44eecb1097c673c7daf4aec8067 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-lug.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-lug +include: mafand +task: mafand_en-lug_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-luo.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-luo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fc6b15c67f2919dca92ef84d058a93da6e0271a1 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-luo.yaml @@ 
-0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-luo +include: mafand +task: mafand_en-luo_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-nya.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-nya.yaml new file mode 100644 index 0000000000000000000000000000000000000000..225a46474ff2c694da98558bfaf665bb8e102248 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-nya.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-nya +include: mafand +task: mafand_en-nya_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-pcm.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-pcm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..69380c7182bdfc8f8e25c2ee16b3ced4a5c05d49 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-pcm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-pcm +include: mafand +task: mafand_en-pcm_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-sna.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..634d988fd46f0a98b6e5b76e4a42f897edb4871d --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-sna.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-sna +include: mafand +task: mafand_en-sna_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-swa.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bfbf259cbbe8d6d740dc7522f6d8bd6542c790a1 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-swa +include: mafand 
+task: mafand_en-swa_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-tsn.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-tsn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..faa99ddf5186515253279585aa1439569d7de9b3 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-tsn.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-tsn +include: mafand +task: mafand_en-tsn_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-twi.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a9294975da5385d196db9abc9f22d087a2c9cce0 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-twi +include: mafand +task: mafand_en-twi_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-xho.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..244f5cabd9901020a136322842ec47fb181faed6 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-xho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-xho +include: mafand +task: mafand_en-xho_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-yor.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aa3189779577ecbc58f80ff4973a63d81ea1c6a6 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-yor +include: mafand +task: mafand_en-yor_prompt_1 diff --git 
a/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-zul.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6afdc0c5648d8fec0be6d2d2fca0b9416dccffab --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_en-zul.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-zul +include: mafand +task: mafand_en-zul_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_fr-bam.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_fr-bam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7c21d96f275a14b81225b3979044a23909bfb023 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_fr-bam.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-bam +include: mafand +task: mafand_fr-bam_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_fr-bbj.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_fr-bbj.yaml new file mode 100644 index 0000000000000000000000000000000000000000..76cf07507625677c85772e988ac66741508efbc6 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_fr-bbj.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-bbj +include: mafand +task: mafand_fr-bbj_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_fr-ewe.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_fr-ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0c7bd6671b7ba6da2a869cf12862046b632396e6 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_fr-ewe.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-ewe +include: mafand +task: mafand_fr-ewe_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_fr-fon.yaml 
b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_fr-fon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..737d68eba79405276a96151a33786387c33cc148 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_fr-fon.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-fon +include: mafand +task: mafand_fr-fon_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_fr-mos.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_fr-mos.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9186a5b9f6670f5187284981d547d91631cc94f6 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_fr-mos.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-mos +include: mafand +task: mafand_fr-mos_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_fr-wol.yaml b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_fr-wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6e29f5fb98a138f3a131c62b685abc25b60e3f69 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/mafand_fr-wol.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-wol +include: mafand +task: mafand_fr-wol_prompt_1 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/utils.py b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0df3a329824d44fa94eb830ae943fa30dd32bab7 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_1/english-african/utils.py @@ -0,0 +1,121 @@ +languages = { + "amh": "Amharic", + "bam": "Bambara", + "bbj": "Gbomala", + "ewe": "Ewe", + "fon": "Fon", + "hau": "Hausa", + "ibo": "Igbo", + "kin": "Kinyarwanda", + "lug": "Luganda", + "luo": "Luo", + "mos": "Mossi", + "nya": "Chichewa", + "pcm": "Nigerian Pidgin", + "sna": "Shona", + 
"swa": "Swahili", + "tsn": "Setswana", + "twi": "Twi", + "wol": "Wolof", + "xho": "Xhosa", + "yor": "Yoruba", + "zul": "Zulu", +} + + +def get_target(doc): + target = ( + doc["translation"]["en"] + if "en" in doc["translation"].keys() + else doc["translation"]["fr"] + ) + return target + + +def get_target_reverse(doc): + target_key = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + target = doc["translation"][target_key] + return target + + +def create_text_prompt_1(doc): + source_key = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_sentence = doc["translation"][source_key] + source_lang = "English" if "en" in doc["translation"].keys() else "French" + prompt = ( + "You are an advanced Translator, a specialized assistant designed to translate documents from " + f"{languages[source_key]} into {source_lang}. \nYour main goal is to ensure translations are grammatically " + f"correct and human-oriented. \n{languages[source_key]}: {source_sentence} \n{source_lang}: " + ) + return prompt + + +def create_reverse_prompt_1(doc): + target_lang = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_key = "en" if "en" in doc["translation"].keys() else "fr" + source_lang = "English" if source_key == "en" else "French" + source_sentence = doc["translation"][source_key] + prompt = ( + "You are an advanced Translator, a specialized assistant designed to translate documents from " + f"{source_lang} into {languages[target_lang]}. \nYour main goal is to ensure translations are " + f"grammatically correct and human-oriented. 
\n{source_lang}: {source_sentence} \n{languages[target_lang]}: " + ) + return prompt + + +def create_text_prompt_2(doc): + source_key = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_sentence = doc["translation"][source_key] + source_lang = "English" if "en" in doc["translation"].keys() else "French" + prompt = ( + f"{languages[source_key]} sentence: {source_sentence} \n{source_lang} sentence: ", + ) + return prompt + + +def create_reverse_prompt_2(doc): + target_lang = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_key = "en" if "en" in doc["translation"].keys() else "fr" + source_lang = "English" if source_key == "en" else "French" + source_sentence = doc["translation"][source_key] + prompt = ( + f"{source_lang} sentence: {source_sentence} \n{languages[target_lang]} sentence: \n", + ) + return prompt + + +def create_text_prompt_3(doc): + source_key = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_sentence = doc["translation"][source_key] + source_lang = "English" if "en" in doc["translation"].keys() else "French" + prompt = ( + f"You are a translation expert. Translate the following {languages[source_key]} sentences " + f"to {source_lang}. \n{languages[source_key]} sentence: {source_sentence}\n{source_lang} sentence: " + ) + return prompt + + +def create_reverse_prompt_3(doc): + target_lang = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_key = "en" if "en" in doc["translation"].keys() else "fr" + source_lang = "English" if source_key == "en" else "French" + source_sentence = doc["translation"][source_key] + prompt = ( + f"You are a translation expert. 
Translate the following {source_lang} sentence into {languages[target_lang]}\n" + f"{source_lang} sentence: {source_sentence}\n{languages[target_lang]} sentence: " + ) + return prompt diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand new file mode 100644 index 0000000000000000000000000000000000000000..eb7ad9883115b42626fd30d223fa90ff6f133384 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand @@ -0,0 +1,28 @@ +tag: +- mafand_tasks +- mafand_afr-eng +- mafand_afr-eng_prompt_3 +- afrobench_MT_tasks +dataset_path: masakhane/mafand +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +validation_split: validation +fewshot_split: validation +test_split: test +doc_to_target: !function utils.get_target +doc_to_text: !function utils.create_text_prompt_2 +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +repeats: 1 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_amh-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_amh-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6db544cb5e17e87e6ebcf606ad59ad9a7435f338 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_amh-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-amh +include: mafand +task: mafand_amh-en_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_bam-fr.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_bam-fr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0a9f3b3ac09958dc4253d982dfe1c872aefafd7e --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_bam-fr.yaml 
@@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-bam +include: mafand +task: mafand_bam-fr_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_bbj-fr.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_bbj-fr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b0b42b23787e88bd5ff0d000c1806655e26f65cf --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_bbj-fr.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-bbj +include: mafand +task: mafand_bbj-fr_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_ewe-fr.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_ewe-fr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..457c0d1945bfa65f5d5f0e1ccffc30c11aadd452 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_ewe-fr.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-ewe +include: mafand +task: mafand_ewe-fr_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_fon-fr.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_fon-fr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..84263d5a9ca41877f02d680660706eaeba28fea2 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_fon-fr.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-fon +include: mafand +task: mafand_fon-fr_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_hau-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_hau-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..05c31a4670a9300c1175dc2f7e109c024b146301 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_hau-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-hau +include: mafand 
+task: mafand_hau-en_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_ibo-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_ibo-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3cb4a5b897c2d143ef31d667c9d87fc35f40caa0 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_ibo-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-ibo +include: mafand +task: mafand_ibo-en_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_kin-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_kin-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e3e1acf9a92a63ab80dbc0314adebfa59e449490 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_kin-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-kin +include: mafand +task: mafand_kin-en_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_lug-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_lug-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eb68279d783b71ff94742507d4ed1571a03b6b51 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_lug-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-lug +include: mafand +task: mafand_lug-en_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_luo-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_luo-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..12f199473ee6e4ab1f6a2d3ae6e69eb04ba6a399 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_luo-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-luo +include: mafand +task: mafand_luo-en_prompt_2 diff --git 
a/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_mos-fr.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_mos-fr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a723701d3250aa78f7631a3dcfc450301201bf73 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_mos-fr.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-mos +include: mafand +task: mafand_mos-fr_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_nya-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_nya-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..24569f00825bb4a2e0419e290ae4ff1bc7e0d312 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_nya-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-nya +include: mafand +task: mafand_nya-en_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_pcm-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_pcm-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7ec0c66a0a5c7fadbc7b9ec715fe53596d4c2b51 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_pcm-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-pcm +include: mafand +task: mafand_pcm-en_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_sna-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_sna-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3bf99b9955378aa6a5930d9783ed33dfea96ac95 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_sna-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-sna +include: mafand +task: mafand_sna-en_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_swa-en.yaml 
b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_swa-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eb2ada0bba76e9c83a99d8060ac8f4146a1462c7 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_swa-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-swa +include: mafand +task: mafand_swa-en_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_tsn-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_tsn-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d16e7e94c5b251eff2adece451f89c7af71fbc30 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_tsn-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-tsn +include: mafand +task: mafand_tsn-en_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_twi-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_twi-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..267337c177fc06e777d62c3a2286084d79266a4c --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_twi-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-twi +include: mafand +task: mafand_twi-en_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_wol-fr.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_wol-fr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f6c67bd8d6df8d81d4d4088b6bbf8e04498afef6 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_wol-fr.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-wol +include: mafand +task: mafand_wol-fr_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_xho-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_xho-en.yaml 
new file mode 100644 index 0000000000000000000000000000000000000000..fd1960d0efbd42ba25132c938944692fbf63b92f --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_xho-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-xho +include: mafand +task: mafand_xho-en_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_yor-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_yor-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cb7241ad2cc2fc2f889bab56c4ad3e233a2d2165 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_yor-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-yor +include: mafand +task: mafand_yor-en_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_zul-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_zul-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d44db7a2eeefcc7dc0218f08327ceaee4a6e351a --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/mafand_zul-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-zul +include: mafand +task: mafand_zul-en_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/utils.py b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0df3a329824d44fa94eb830ae943fa30dd32bab7 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/african-english/utils.py @@ -0,0 +1,121 @@ +languages = { + "amh": "Amharic", + "bam": "Bambara", + "bbj": "Gbomala", + "ewe": "Ewe", + "fon": "Fon", + "hau": "Hausa", + "ibo": "Igbo", + "kin": "Kinyarwanda", + "lug": "Luganda", + "luo": "Luo", + "mos": "Mossi", + "nya": "Chichewa", + "pcm": "Nigerian Pidgin", + "sna": "Shona", + "swa": "Swahili", + "tsn": "Setswana", + "twi": "Twi", + "wol": "Wolof", + 
"xho": "Xhosa", + "yor": "Yoruba", + "zul": "Zulu", +} + + +def get_target(doc): + target = ( + doc["translation"]["en"] + if "en" in doc["translation"].keys() + else doc["translation"]["fr"] + ) + return target + + +def get_target_reverse(doc): + target_key = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + target = doc["translation"][target_key] + return target + + +def create_text_prompt_1(doc): + source_key = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_sentence = doc["translation"][source_key] + source_lang = "English" if "en" in doc["translation"].keys() else "French" + prompt = ( + "You are an advanced Translator, a specialized assistant designed to translate documents from " + f"{languages[source_key]} into {source_lang}. \nYour main goal is to ensure translations are grammatically " + f"correct and human-oriented. \n{languages[source_key]}: {source_sentence} \n{source_lang}: " + ) + return prompt + + +def create_reverse_prompt_1(doc): + target_lang = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_key = "en" if "en" in doc["translation"].keys() else "fr" + source_lang = "English" if source_key == "en" else "French" + source_sentence = doc["translation"][source_key] + prompt = ( + "You are an advanced Translator, a specialized assistant designed to translate documents from " + f"{source_lang} into {languages[target_lang]}. \nYour main goal is to ensure translations are " + f"grammatically correct and human-oriented. 
\n{source_lang}: {source_sentence} \n{languages[target_lang]}: " + ) + return prompt + + +def create_text_prompt_2(doc): + source_key = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_sentence = doc["translation"][source_key] + source_lang = "English" if "en" in doc["translation"].keys() else "French" + prompt = ( + f"{languages[source_key]} sentence: {source_sentence} \n{source_lang} sentence: ", + ) + return prompt + + +def create_reverse_prompt_2(doc): + target_lang = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_key = "en" if "en" in doc["translation"].keys() else "fr" + source_lang = "English" if source_key == "en" else "French" + source_sentence = doc["translation"][source_key] + prompt = ( + f"{source_lang} sentence: {source_sentence} \n{languages[target_lang]} sentence: \n", + ) + return prompt + + +def create_text_prompt_3(doc): + source_key = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_sentence = doc["translation"][source_key] + source_lang = "English" if "en" in doc["translation"].keys() else "French" + prompt = ( + f"You are a translation expert. Translate the following {languages[source_key]} sentences " + f"to {source_lang}. \n{languages[source_key]} sentence: {source_sentence}\n{source_lang} sentence: " + ) + return prompt + + +def create_reverse_prompt_3(doc): + target_lang = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_key = "en" if "en" in doc["translation"].keys() else "fr" + source_lang = "English" if source_key == "en" else "French" + source_sentence = doc["translation"][source_key] + prompt = ( + f"You are a translation expert. 
Translate the following {source_lang} sentence into {languages[target_lang]}\n" + f"{source_lang} sentence: {source_sentence}\n{languages[target_lang]} sentence: " + ) + return prompt diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand new file mode 100644 index 0000000000000000000000000000000000000000..35548392e118f0625d9adae32849389d5239cd3e --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand @@ -0,0 +1,28 @@ +tag: +- mafand_tasks +- mafand_eng-afr +- mafand_eng-afr_prompt_2 +- afrobench_MT_tasks +dataset_path: masakhane/mafand +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +validation_split: validation +fewshot_split: validation +test_split: test +doc_to_target: !function utils.get_target_reverse +doc_to_text: !function utils.create_reverse_prompt_2 +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +repeats: 1 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-amh.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..09c21d215e4cab74dda2b30ee110f55dcf2cbcbb --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-amh +include: mafand +task: mafand_en-amh_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-hau.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e9a91c76499bf37863e6b55c71ff88205b1e2599 --- /dev/null +++ 
b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-hau +include: mafand +task: mafand_en-hau_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-ibo.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..568a845e403663c3d24e30383cab8eebe0d37151 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-ibo +include: mafand +task: mafand_en-ibo_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-kin.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..09425f64afdb68817d0f462545e67f5a1e2d5f07 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-kin +include: mafand +task: mafand_en-kin_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-lug.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..13c91d36a6e5e0037a301c88fcf814309e29590f --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-lug.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-lug +include: mafand +task: mafand_en-lug_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-luo.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-luo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..41bb09363d1c100ffea32d38eb2085400bec018b --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-luo.yaml @@ 
-0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-luo +include: mafand +task: mafand_en-luo_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-nya.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-nya.yaml new file mode 100644 index 0000000000000000000000000000000000000000..90a728107edf37bddd1d4eb80bcc6ddfaa49572e --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-nya.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-nya +include: mafand +task: mafand_en-nya_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-pcm.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-pcm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..73229c4f18ac24014cf15f161454910a921e1d02 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-pcm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-pcm +include: mafand +task: mafand_en-pcm_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-sna.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac37187170a032226e1b33dd87fb09c7b9952cf1 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-sna.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-sna +include: mafand +task: mafand_en-sna_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-swa.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..21d9fc0e71589cb2e1c832a6b82e6d8fb5288b89 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-swa +include: mafand 
+task: mafand_en-swa_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-tsn.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-tsn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b3dd43626d00774a8e9bd6e432322e4d608887ba --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-tsn.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-tsn +include: mafand +task: mafand_en-tsn_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-twi.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5502ffa4e5547433d7cba66191456178c5d3377e --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-twi +include: mafand +task: mafand_en-twi_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-xho.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c8c1ffee3415a1373a1b583819e874f968b5cfee --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-xho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-xho +include: mafand +task: mafand_en-xho_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-yor.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..89c070c7c5e9f6d53fde695f156408f37038242d --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-yor +include: mafand +task: mafand_en-yor_prompt_2 diff --git 
a/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-zul.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e54725404ba481e668156618a97f0b11d1a1fb31 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_en-zul.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-zul +include: mafand +task: mafand_en-zul_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_fr-bam.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_fr-bam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..15c6e981d1cb878e26a08755f5bf1fd7629f2525 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_fr-bam.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-bam +include: mafand +task: mafand_fr-bam_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_fr-bbj.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_fr-bbj.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4f5101a752865bf641a150be2aff3b449239d3f8 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_fr-bbj.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-bbj +include: mafand +task: mafand_fr-bbj_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_fr-ewe.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_fr-ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..29d4214cf433e56e8f2d479299cf413e2d211d34 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_fr-ewe.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-ewe +include: mafand +task: mafand_fr-ewe_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_fr-fon.yaml 
b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_fr-fon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9710db5b341b91663a782945da57a3a21ae3c1ae --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_fr-fon.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-fon +include: mafand +task: mafand_fr-fon_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_fr-mos.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_fr-mos.yaml new file mode 100644 index 0000000000000000000000000000000000000000..682fb19c479f7f4a49a810c1f665c20439e4ee3e --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_fr-mos.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-mos +include: mafand +task: mafand_fr-mos_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_fr-wol.yaml b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_fr-wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3740ca9b73557e472f867b7b5c33131a441e18fa --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/mafand_fr-wol.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-wol +include: mafand +task: mafand_fr-wol_prompt_2 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/utils.py b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0df3a329824d44fa94eb830ae943fa30dd32bab7 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_2/english-african/utils.py @@ -0,0 +1,121 @@ +languages = { + "amh": "Amharic", + "bam": "Bambara", + "bbj": "Gbomala", + "ewe": "Ewe", + "fon": "Fon", + "hau": "Hausa", + "ibo": "Igbo", + "kin": "Kinyarwanda", + "lug": "Luganda", + "luo": "Luo", + "mos": "Mossi", + "nya": "Chichewa", + "pcm": "Nigerian Pidgin", + "sna": "Shona", + 
"swa": "Swahili", + "tsn": "Setswana", + "twi": "Twi", + "wol": "Wolof", + "xho": "Xhosa", + "yor": "Yoruba", + "zul": "Zulu", +} + + +def get_target(doc): + target = ( + doc["translation"]["en"] + if "en" in doc["translation"].keys() + else doc["translation"]["fr"] + ) + return target + + +def get_target_reverse(doc): + target_key = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + target = doc["translation"][target_key] + return target + + +def create_text_prompt_1(doc): + source_key = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_sentence = doc["translation"][source_key] + source_lang = "English" if "en" in doc["translation"].keys() else "French" + prompt = ( + "You are an advanced Translator, a specialized assistant designed to translate documents from " + f"{languages[source_key]} into {source_lang}. \nYour main goal is to ensure translations are grammatically " + f"correct and human-oriented. \n{languages[source_key]}: {source_sentence} \n{source_lang}: " + ) + return prompt + + +def create_reverse_prompt_1(doc): + target_lang = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_key = "en" if "en" in doc["translation"].keys() else "fr" + source_lang = "English" if source_key == "en" else "French" + source_sentence = doc["translation"][source_key] + prompt = ( + "You are an advanced Translator, a specialized assistant designed to translate documents from " + f"{source_lang} into {languages[target_lang]}. \nYour main goal is to ensure translations are " + f"grammatically correct and human-oriented. 
\n{source_lang}: {source_sentence} \n{languages[target_lang]}: " + ) + return prompt + + +def create_text_prompt_2(doc): + source_key = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_sentence = doc["translation"][source_key] + source_lang = "English" if "en" in doc["translation"].keys() else "French" + prompt = ( + f"{languages[source_key]} sentence: {source_sentence} \n{source_lang} sentence: ", + ) + return prompt + + +def create_reverse_prompt_2(doc): + target_lang = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_key = "en" if "en" in doc["translation"].keys() else "fr" + source_lang = "English" if source_key == "en" else "French" + source_sentence = doc["translation"][source_key] + prompt = ( + f"{source_lang} sentence: {source_sentence} \n{languages[target_lang]} sentence: \n", + ) + return prompt + + +def create_text_prompt_3(doc): + source_key = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_sentence = doc["translation"][source_key] + source_lang = "English" if "en" in doc["translation"].keys() else "French" + prompt = ( + f"You are a translation expert. Translate the following {languages[source_key]} sentences " + f"to {source_lang}. \n{languages[source_key]} sentence: {source_sentence}\n{source_lang} sentence: " + ) + return prompt + + +def create_reverse_prompt_3(doc): + target_lang = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_key = "en" if "en" in doc["translation"].keys() else "fr" + source_lang = "English" if source_key == "en" else "French" + source_sentence = doc["translation"][source_key] + prompt = ( + f"You are a translation expert. 
Translate the following {source_lang} sentence into {languages[target_lang]}\n" + f"{source_lang} sentence: {source_sentence}\n{languages[target_lang]} sentence: " + ) + return prompt diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand new file mode 100644 index 0000000000000000000000000000000000000000..eb7ad9883115b42626fd30d223fa90ff6f133384 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand @@ -0,0 +1,28 @@ +tag: +- mafand_tasks +- mafand_afr-eng +- mafand_afr-eng_prompt_3 +- afrobench_MT_tasks +dataset_path: masakhane/mafand +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +validation_split: validation +fewshot_split: validation +test_split: test +doc_to_target: !function utils.get_target +doc_to_text: !function utils.create_text_prompt_3 +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +repeats: 1 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_amh-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_amh-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..856318b5ad1b19fde25dd12ee3a2fc712b053b1a --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_amh-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-amh +include: mafand +task: mafand_amh-en_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_bam-fr.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_bam-fr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bed4252375b82e34d27c96adf368217957e33063 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_bam-fr.yaml
@@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-bam +include: mafand +task: mafand_bam-fr_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_bbj-fr.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_bbj-fr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1170c266f4b14550499e7ddf930c50714560ec66 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_bbj-fr.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-bbj +include: mafand +task: mafand_bbj-fr_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_ewe-fr.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_ewe-fr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..39a345cb6b6b86684e67978bd52c0e3071302a04 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_ewe-fr.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-ewe +include: mafand +task: mafand_ewe-fr_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_fon-fr.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_fon-fr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1b464bb913f4888d825a2ab2aa2d566ef5d422d4 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_fon-fr.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-fon +include: mafand +task: mafand_fon-fr_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_hau-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_hau-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9c0b0f15fe011d52ff4bbd167e23453a621e2928 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_hau-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-hau +include: mafand 
+task: mafand_hau-en_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_ibo-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_ibo-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2f78f55a3b27a4461efd85e741503ead06e4bcce --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_ibo-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-ibo +include: mafand +task: mafand_ibo-en_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_kin-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_kin-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..254b22be3883110224aee429313cf2636993e675 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_kin-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-kin +include: mafand +task: mafand_kin-en_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_lug-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_lug-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ad19b3c85e6aae28a01c7e3476f8492e804c6d83 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_lug-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-lug +include: mafand +task: mafand_lug-en_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_luo-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_luo-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a3a367493d5da3804d2eda16eba676886765939e --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_luo-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-luo +include: mafand +task: mafand_luo-en_prompt_3 diff --git 
a/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_mos-fr.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_mos-fr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4ea419312925c06760ddc6f6efbf343594f2b932 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_mos-fr.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-mos +include: mafand +task: mafand_mos-fr_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_nya-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_nya-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..de9ec930a1b738673b91ed89916c17109b159e66 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_nya-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-nya +include: mafand +task: mafand_nya-en_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_pcm-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_pcm-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..95ad3380334ced0554dc21ab4e6a07f23352f51d --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_pcm-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-pcm +include: mafand +task: mafand_pcm-en_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_sna-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_sna-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2d86ccc3ad64220d354d4e9e230ee585e556b32c --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_sna-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-sna +include: mafand +task: mafand_sna-en_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_swa-en.yaml 
b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_swa-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3c70f2e3e77f4ee94edd757311e8b37816911104 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_swa-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-swa +include: mafand +task: mafand_swa-en_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_tsn-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_tsn-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0ee8f4152a80d5af1bfd4b792167021d81e37284 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_tsn-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-tsn +include: mafand +task: mafand_tsn-en_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_twi-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_twi-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a37d2395a4721c6553a878f6693221cbac6a22a6 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_twi-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-twi +include: mafand +task: mafand_twi-en_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_wol-fr.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_wol-fr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ed778cbe6690aafa7abd33fad50ef41fc25dbaea --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_wol-fr.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-wol +include: mafand +task: mafand_wol-fr_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_xho-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_xho-en.yaml 
new file mode 100644 index 0000000000000000000000000000000000000000..93e9e2fee5b39b755b9e060697e509a5f63d58e0 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_xho-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-xho +include: mafand +task: mafand_xho-en_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_yor-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_yor-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..78301f7e658cd489eb6433f3ac5a10a0f0cde49b --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_yor-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-yor +include: mafand +task: mafand_yor-en_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_zul-en.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_zul-en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..06177d14ec829fe74a12030d88e06a5fee7bc9a0 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/mafand_zul-en.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-zul +include: mafand +task: mafand_zul-en_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/utils.py b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0df3a329824d44fa94eb830ae943fa30dd32bab7 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/african-english/utils.py @@ -0,0 +1,121 @@ +languages = { + "amh": "Amharic", + "bam": "Bambara", + "bbj": "Gbomala", + "ewe": "Ewe", + "fon": "Fon", + "hau": "Hausa", + "ibo": "Igbo", + "kin": "Kinyarwanda", + "lug": "Luganda", + "luo": "Luo", + "mos": "Mossi", + "nya": "Chichewa", + "pcm": "Nigerian Pidgin", + "sna": "Shona", + "swa": "Swahili", + "tsn": "Setswana", + "twi": "Twi", + "wol": "Wolof", + 
"xho": "Xhosa", + "yor": "Yoruba", + "zul": "Zulu", +} + + +def get_target(doc): + target = ( + doc["translation"]["en"] + if "en" in doc["translation"].keys() + else doc["translation"]["fr"] + ) + return target + + +def get_target_reverse(doc): + target_key = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + target = doc["translation"][target_key] + return target + + +def create_text_prompt_1(doc): + source_key = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_sentence = doc["translation"][source_key] + source_lang = "English" if "en" in doc["translation"].keys() else "French" + prompt = ( + "You are an advanced Translator, a specialized assistant designed to translate documents from " + f"{languages[source_key]} into {source_lang}. \nYour main goal is to ensure translations are grammatically " + f"correct and human-oriented. \n{languages[source_key]}: {source_sentence} \n{source_lang}: " + ) + return prompt + + +def create_reverse_prompt_1(doc): + target_lang = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_key = "en" if "en" in doc["translation"].keys() else "fr" + source_lang = "English" if source_key == "en" else "French" + source_sentence = doc["translation"][source_key] + prompt = ( + "You are an advanced Translator, a specialized assistant designed to translate documents from " + f"{source_lang} into {languages[target_lang]}. \nYour main goal is to ensure translations are " + f"grammatically correct and human-oriented. 
\n{source_lang}: {source_sentence} \n{languages[target_lang]}: " + ) + return prompt + + +def create_text_prompt_2(doc): + source_key = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_sentence = doc["translation"][source_key] + source_lang = "English" if "en" in doc["translation"].keys() else "French" + prompt = ( + f"{languages[source_key]} sentence: {source_sentence} \n{source_lang} sentence: ", + ) + return prompt + + +def create_reverse_prompt_2(doc): + target_lang = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_key = "en" if "en" in doc["translation"].keys() else "fr" + source_lang = "English" if source_key == "en" else "French" + source_sentence = doc["translation"][source_key] + prompt = ( + f"{source_lang} sentence: {source_sentence} \n{languages[target_lang]} sentence: \n", + ) + return prompt + + +def create_text_prompt_3(doc): + source_key = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_sentence = doc["translation"][source_key] + source_lang = "English" if "en" in doc["translation"].keys() else "French" + prompt = ( + f"You are a translation expert. Translate the following {languages[source_key]} sentences " + f"to {source_lang}. \n{languages[source_key]} sentence: {source_sentence}\n{source_lang} sentence: " + ) + return prompt + + +def create_reverse_prompt_3(doc): + target_lang = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_key = "en" if "en" in doc["translation"].keys() else "fr" + source_lang = "English" if source_key == "en" else "French" + source_sentence = doc["translation"][source_key] + prompt = ( + f"You are a translation expert. 
Translate the following {source_lang} sentence into {languages[target_lang]}\n" + f"{source_lang} sentence: {source_sentence}\n{languages[target_lang]} sentence: " + ) + return prompt diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand new file mode 100644 index 0000000000000000000000000000000000000000..9a59654e4feba09f57277b539633c2b0efda291e --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand @@ -0,0 +1,28 @@ +tag: +- mafand_tasks +- mafand_eng-afr +- mafand_eng-afr_prompt_3 +- afrobench_MT_tasks +dataset_path: masakhane/mafand +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +validation_split: validation +fewshot_split: validation +test_split: test +doc_to_target: !function utils.get_target_reverse +doc_to_text: !function utils.create_reverse_prompt_3 +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +repeats: 1 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-amh.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..10872430688882c53e5f2dba5aea70d1194c6020 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-amh.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-amh +include: mafand +task: mafand_en-amh_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-hau.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f64e68162a76b1a0cc33f44d62a0f66b5dc099e0 --- /dev/null +++ 
b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-hau.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-hau +include: mafand +task: mafand_en-hau_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-ibo.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..72df05e551444a9e82de940171c646797f1c18d6 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-ibo.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-ibo +include: mafand +task: mafand_en-ibo_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-kin.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..44c48678e89c3dd7e72d310fd050b5a1d3bc6092 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-kin.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-kin +include: mafand +task: mafand_en-kin_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-lug.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2beae91b569db6fc361da97e0879e854af005e4d --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-lug.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-lug +include: mafand +task: mafand_en-lug_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-luo.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-luo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b4c1aa8becf693cb16d4d1be2820707e521a1052 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-luo.yaml @@ 
-0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-luo +include: mafand +task: mafand_en-luo_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-nya.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-nya.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eee7af0ce8ed105fa08769793ad816c2f4d17318 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-nya.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-nya +include: mafand +task: mafand_en-nya_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-pcm.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-pcm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6e60642562403f3b86d2e13fb3ea48368dc84883 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-pcm.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-pcm +include: mafand +task: mafand_en-pcm_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-sna.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..82abd862535426ee078cd45f057c792643062981 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-sna.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-sna +include: mafand +task: mafand_en-sna_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-swa.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8a7135ff6921556928338e52e3dbcc8afa731025 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-swa.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-swa +include: mafand 
+task: mafand_en-swa_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-tsn.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-tsn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b976b5fd24007928854afe3b533ffd8b66ea0851 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-tsn.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-tsn +include: mafand +task: mafand_en-tsn_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-twi.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..53345a2668eccb93b11568c22f6d218598f20ba2 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-twi.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-twi +include: mafand +task: mafand_en-twi_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-xho.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4eba7f6994b577b4769b6b74a2848ecc0b3b0fb0 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-xho.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-xho +include: mafand +task: mafand_en-xho_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-yor.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9b20e9f920deb657630ef0aa5aa6a443934f0519 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-yor.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-yor +include: mafand +task: mafand_en-yor_prompt_3 diff --git 
a/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-zul.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cb5280b995b0754ae6a4f6cd33bfc82350d0e8cb --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_en-zul.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: en-zul +include: mafand +task: mafand_en-zul_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_fr-bam.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_fr-bam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3e94be00dea4c013238a543cc3ceeb2982b92ce4 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_fr-bam.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-bam +include: mafand +task: mafand_fr-bam_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_fr-bbj.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_fr-bbj.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9170a6b500239357dacf736b06044dd17aff5b25 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_fr-bbj.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-bbj +include: mafand +task: mafand_fr-bbj_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_fr-ewe.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_fr-ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7139c81fd0c991bf9ee6a24013a34a8b0b700efc --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_fr-ewe.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-ewe +include: mafand +task: mafand_fr-ewe_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_fr-fon.yaml 
b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_fr-fon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b42292ce56e6abc7127e9988e5619e0eee3d56ab --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_fr-fon.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-fon +include: mafand +task: mafand_fr-fon_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_fr-mos.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_fr-mos.yaml new file mode 100644 index 0000000000000000000000000000000000000000..044047c346abcb945443b2b500eef7bd32f2caad --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_fr-mos.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-mos +include: mafand +task: mafand_fr-mos_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_fr-wol.yaml b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_fr-wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9fc1bca3b94f64489d26632c70c022558e7793b3 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/mafand_fr-wol.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: fr-wol +include: mafand +task: mafand_fr-wol_prompt_3 diff --git a/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/utils.py b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0df3a329824d44fa94eb830ae943fa30dd32bab7 --- /dev/null +++ b/lm_eval/tasks/afrobench/mafand/prompt_3/english-african/utils.py @@ -0,0 +1,121 @@ +languages = { + "amh": "Amharic", + "bam": "Bambara", + "bbj": "Gbomala", + "ewe": "Ewe", + "fon": "Fon", + "hau": "Hausa", + "ibo": "Igbo", + "kin": "Kinyarwanda", + "lug": "Luganda", + "luo": "Luo", + "mos": "Mossi", + "nya": "Chichewa", + "pcm": "Nigerian Pidgin", + "sna": "Shona", + 
"swa": "Swahili", + "tsn": "Setswana", + "twi": "Twi", + "wol": "Wolof", + "xho": "Xhosa", + "yor": "Yoruba", + "zul": "Zulu", +} + + +def get_target(doc): + target = ( + doc["translation"]["en"] + if "en" in doc["translation"].keys() + else doc["translation"]["fr"] + ) + return target + + +def get_target_reverse(doc): + target_key = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + target = doc["translation"][target_key] + return target + + +def create_text_prompt_1(doc): + source_key = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_sentence = doc["translation"][source_key] + source_lang = "English" if "en" in doc["translation"].keys() else "French" + prompt = ( + "You are an advanced Translator, a specialized assistant designed to translate documents from " + f"{languages[source_key]} into {source_lang}. \nYour main goal is to ensure translations are grammatically " + f"correct and human-oriented. \n{languages[source_key]}: {source_sentence} \n{source_lang}: " + ) + return prompt + + +def create_reverse_prompt_1(doc): + target_lang = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_key = "en" if "en" in doc["translation"].keys() else "fr" + source_lang = "English" if source_key == "en" else "French" + source_sentence = doc["translation"][source_key] + prompt = ( + "You are an advanced Translator, a specialized assistant designed to translate documents from " + f"{source_lang} into {languages[target_lang]}. \nYour main goal is to ensure translations are " + f"grammatically correct and human-oriented. 
\n{source_lang}: {source_sentence} \n{languages[target_lang]}: " + ) + return prompt + + +def create_text_prompt_2(doc): + source_key = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_sentence = doc["translation"][source_key] + source_lang = "English" if "en" in doc["translation"].keys() else "French" + prompt = ( + f"{languages[source_key]} sentence: {source_sentence} \n{source_lang} sentence: ", + ) + return prompt + + +def create_reverse_prompt_2(doc): + target_lang = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_key = "en" if "en" in doc["translation"].keys() else "fr" + source_lang = "English" if source_key == "en" else "French" + source_sentence = doc["translation"][source_key] + prompt = ( + f"{source_lang} sentence: {source_sentence} \n{languages[target_lang]} sentence: \n", + ) + return prompt + + +def create_text_prompt_3(doc): + source_key = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_sentence = doc["translation"][source_key] + source_lang = "English" if "en" in doc["translation"].keys() else "French" + prompt = ( + f"You are a translation expert. Translate the following {languages[source_key]} sentences " + f"to {source_lang}. \n{languages[source_key]} sentence: {source_sentence}\n{source_lang} sentence: " + ) + return prompt + + +def create_reverse_prompt_3(doc): + target_lang = [key for key in doc["translation"].keys() if key not in ["en", "fr"]][ + 0 + ] + source_key = "en" if "en" in doc["translation"].keys() else "fr" + source_lang = "English" if source_key == "en" else "French" + source_sentence = doc["translation"][source_key] + prompt = ( + f"You are a translation expert. 
Translate the following {source_lang} sentence into {languages[target_lang]}\n" + f"{source_lang} sentence: {source_sentence}\n{languages[target_lang]} sentence: " + ) + return prompt diff --git a/lm_eval/tasks/afrobench/masakhaner/README.md b/lm_eval/tasks/afrobench/masakhaner/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ca96648e07bb2c54fbf0c79d968b2ef4cb6aba75 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/README.md @@ -0,0 +1,76 @@ +# + +## Paper +Title: `MasakhaNER 2.0: Africa-centric Transfer Learning for Named Entity Recognition` + +Paper Link: https://aclanthology.org/2022.emnlp-main.298/ + +## Abstract +>African languages are spoken by over a billion people, but they are under-represented in NLP research and development. Multiple challenges exist, including the limited availability of annotated training and evaluation datasets as well as the lack of understanding of which settings, languages, and recently proposed methods like cross-lingual transfer will be effective. In this paper, we aim to move towards solutions for these challenges, focusing on the task of named entity recognition (NER). We present the creation of the largest to-date human-annotated NER dataset for 20 African languages. We study the behaviour of state-of-the-art cross-lingual transfer methods in an Africa-centric setting, empirically demonstrating that the choice of source transfer language significantly affects performance. While much previous work defaults to using English as the source language, our results show that choosing the best transfer language improves zero-shot F1 scores by an average of 14% over 20 languages as compared to using English. 
+ +HomePage: https://github.com/masakhane-io/masakhane-ner + +### Citation + +``` +@inproceedings{adelani-etal-2022-masakhaner, + title = "{M}asakha{NER} 2.0: {A}frica-centric Transfer Learning for Named Entity Recognition", + author = "Adelani, David Ifeoluwa and + Neubig, Graham and + Ruder, Sebastian and + Rijhwani, Shruti and + Beukman, Michael and + Palen-Michel, Chester and + Lignos, Constantine and + Alabi, Jesujoba O. and + Muhammad, Shamsuddeen H. and + Nabende, Peter and + Dione, Cheikh M. Bamba and + Bukula, Andiswa and + Mabuya, Rooweither and + Dossou, Bonaventure F. P. and + Sibanda, Blessing and + Buzaaba, Happy and + Mukiibi, Jonathan and + Kalipe, Godson and + Mbaye, Derguene and + Taylor, Amelia and + Kabore, Fatoumata and + Emezue, Chris Chinenye and + Aremu, Anuoluwapo and + Ogayo, Perez and + Gitau, Catherine and + Munkoh-Buabeng, Edwin and + Memdjokam Koagne, Victoire and + Tapo, Allahsera Auguste and + Macucwa, Tebogo and + Marivate, Vukosi and + Mboning, Elvis and + Gwadabe, Tajuddeen and + Adewumi, Tosin and + Ahia, Orevaoghene and + Nakatumba-Nabende, Joyce and + Mokono, Neo L. and + Ezeani, Ignatius and + Chukwuneke, Chiamaka and + Adeyemi, Mofetoluwa and + Hacheme, Gilles Q. and + Abdulmumim, Idris and + Ogundepo, Odunayo and + Yousuf, Oreen and + Moteu Ngoli, Tatiana and + Klakow, Dietrich", + editor = "Goldberg, Yoav and + Kozareva, Zornitsa and + Zhang, Yue", + booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing", + month = dec, + year = "2022", + address = "Abu Dhabi, United Arab Emirates", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.emnlp-main.298/", + doi = "10.18653/v1/2022.emnlp-main.298", + pages = "4488--4508", + abstract = "African languages are spoken by over a billion people, but they are under-represented in NLP research and development. 
Multiple challenges exist, including the limited availability of annotated training and evaluation datasets as well as the lack of understanding of which settings, languages, and recently proposed methods like cross-lingual transfer will be effective. In this paper, we aim to move towards solutions for these challenges, focusing on the task of named entity recognition (NER). We present the creation of the largest to-date human-annotated NER dataset for 20 African languages. We study the behaviour of state-of-the-art cross-lingual transfer methods in an Africa-centric setting, empirically demonstrating that the choice of source transfer language significantly affects performance. While much previous work defaults to using English as the source language, our results show that choosing the best transfer language improves zero-shot F1 scores by an average of 14{\%} over 20 languages as compared to using English." +} +``` diff --git a/lm_eval/tasks/afrobench/masakhaner/gen_utils.py b/lm_eval/tasks/afrobench/masakhaner/gen_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4d1012021f567ab02ccdd6259788e00ea1f759e9 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/gen_utils.py @@ -0,0 +1,138 @@ +import argparse +import os + +import yaml + + +class FunctionTag: + def __init__(self, value): + self.value = value + + +def prompt_func(mode, lang): + prompt_map = { + "prompt_1": "Named entities refers to names of location, organisation and personal name. \n For example, " + "'David is an employee of Amazon and he is visiting New York next week to see Esther' will be \n" + "PERSON: David $ ORGANIZATION: Amazon $ LOCATION: New York $ PERSON: Esther \n\n" + "Ensure the output strictly follows the format: label: entity $ label: entity, with each unique " + "entity on a separate label line, avoiding grouped entities (e.g., avoid LOC: entity, entity) or " + "irrelevant entries like none. 
\n\nText: {{text}} \n" + "Return only the output", + "prompt_2": "You are working as a named entity recognition expert and your task is to label a given text " + "with named entity labels. Your task is to identify and label any named entities present in the " + "text. The named entity labels that you will be using are PER (person), LOC (location), " + "ORG (organization) and DATE (date). Label multi-word entities as a single named entity. " + "For words which are not part of any named entity, do not return any value for it. \n" + "Ensure the output strictly follows the format: label: entity $$ label: entity, with each unique " + "entity on a separate label line, avoiding grouped entities (e.g., avoid LOC: entity, entity) or " + "irrelevant entries like none. Return only the output \n\nText: {{text}}", + "prompt_3": f"You are a Named Entity Recognition expert in {lang} language. \nExtract all named entities from " + f"the following {lang} text and categorize them into PERSON, LOCATION, ORGANIZATION, or DATE. " + f"Ensure the output strictly follows the format: label: entity $$ label: entity, with each unique " + "entity on a separate label line, avoiding grouped entities (e.g., avoid LOC: entity, entity) or " + "irrelevant entries like none. Return only the output \n\nText: {{text}}", + "prompt_4": f"As a {lang} linguist, label all named entities in the {lang} text below with the categories: " + "PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output strictly follows the format: label: " + "entity $$ label: entity, with each unique entity on a separate label line, avoiding grouped " + "entities (e.g., avoid LOC: entity, entity) or irrelevant entries like none. Return only the " + "output. \n\nText: {{text}}", + "prompt_5": "Provide a concise list of named entities in the text below. Use the following labels: " + "PERSON, LOCATION, ORGANIZATION, and DATE. 
Ensure the output strictly follows the format: label: " + "entity $$ label: entity, with each unique entity on a separate label line, avoiding grouped " + "entities (e.g., avoid LOC: entity, entity) or irrelevant entries like none. Return only the " + "output. \n\nText: {{text}}", + } + return prompt_map[mode] + + +def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: + """ + Generate a yaml file for each language. + + :param output_dir: The directory to output the files to. + :param overwrite: Whether to overwrite files if they already exist. + """ + err = [] + languages = { + "am": "Amharic", + "bm": "Bambara", + "bbj": "Ghomala", + "ee": "Ewe", + "ha": "Hausa", + "ig": "Igbo", + "rw": "Kinyarwanda", + "lg": "Luganda", + "luo": "Luo", + "mos": "Mossi", + "ny": "Chichewa", + "pcm": "Nigerian Pidgin", + "sn": "chiShona", + "sw": "Kiswahili", + "tn": "Setswana", + "tw": "Twi", + "wo": "Wolof", + "xh": "isiXhosa", + "yo": "Yoruba", + "zu": "isiZulu", + } + + for lang in languages.keys(): + try: + file_name = f"masakhaner_{lang}.yaml" + task_name = f"masakhaner_{lang}_{mode}" + yaml_template = "masakhaner" + yaml_details = { + "include": yaml_template, + "task": task_name, + "dataset_name": lang, + "doc_to_text": prompt_func(mode, languages[lang]), + } + os.makedirs(f"{output_dir}/{mode}", exist_ok=True) + with open( + f"{output_dir}/{mode}/{file_name}", + "w" if overwrite else "x", + encoding="utf8", + ) as f: + f.write("# Generated by utils.py\n") + yaml.dump( + yaml_details, + f, + allow_unicode=True, + ) + except FileExistsError: + err.append(file_name) + + if len(err) > 0: + raise FileExistsError( + "Files were not created because they already exist (use --overwrite flag):" + f" {', '.join(err)}" + ) + + +def main() -> None: + """Parse CLI args and generate language-specific yaml files.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--overwrite", + default=True, + action="store_true", + help="Overwrite files if they already 
exist", + ) + parser.add_argument( + "--output-dir", + default="./", + help="Directory to write yaml files to", + ) + parser.add_argument( + "--mode", + default="prompt_1", + choices=["prompt_1", "prompt_2", "prompt_3", "prompt_4", "prompt_5"], + help="Prompt number", + ) + args = parser.parse_args() + + gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite, mode=args.mode) + + +if __name__ == "__main__": + main() diff --git a/lm_eval/tasks/afrobench/masakhaner/masakhaner.yaml b/lm_eval/tasks/afrobench/masakhaner/masakhaner.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b0d374e80c43cc8831d167887e386f3500773b48 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/masakhaner.yaml @@ -0,0 +1,13 @@ +group: masakhaner +task: + - masakhaner_prompt_1 + - masakhaner_prompt_2 + - masakhaner_prompt_3 + - masakhaner_prompt_4 + - masakhaner_prompt_5 +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 1 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner new file mode 100644 index 0000000000000000000000000000000000000000..706eb36644524b2aaa10b686ce120048e6322390 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner @@ -0,0 +1,26 @@ +tag: +- masakhaner_tasks +- masakhaner_prompt_1 +dataset_path: masakhane/masakhaner-x +dataset_name: null +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +generation_kwargs: + do_sample: false + until: + - + - <|im_end|> +validation_split: validation +test_split: test +fewshot_split: train +doc_to_target: target +filter_list: + - name: flexible-extract + filter: + - function: format_span +metric_list: + - metric: f1 + aggregation: !function utils.span_f1_agg + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_am.yaml 
b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_am.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b2128752f754eb8c46f7608c89dbefd7a3800480 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_am.yaml @@ -0,0 +1,11 @@ +# Generated by utils.py +dataset_name: am +doc_to_text: "Named entities refers to names of location, organisation and personal\ + \ name. \n For example, 'David is an employee of Amazon and he is visiting New York\ + \ next week to see Esther' will be \nPERSON: David $ ORGANIZATION: Amazon $ LOCATION:\ + \ New York $ PERSON: Esther \n\nEnsure the output strictly follows the format: label:\ + \ entity $ label: entity, with each unique entity on a separate label line, avoiding\ + \ grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries like\ + \ none. \n\nText: {{text}} \nReturn only the output" +include: masakhaner +task: masakhaner_am_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_bbj.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_bbj.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3f3a72bdc0257aedee8787af75a70c14560bfc53 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_bbj.yaml @@ -0,0 +1,11 @@ +# Generated by utils.py +dataset_name: bbj +doc_to_text: "Named entities refers to names of location, organisation and personal\ + \ name. \n For example, 'David is an employee of Amazon and he is visiting New York\ + \ next week to see Esther' will be \nPERSON: David $ ORGANIZATION: Amazon $ LOCATION:\ + \ New York $ PERSON: Esther \n\nEnsure the output strictly follows the format: label:\ + \ entity $ label: entity, with each unique entity on a separate label line, avoiding\ + \ grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries like\ + \ none. 
\n\nText: {{text}} \nReturn only the output" +include: masakhaner +task: masakhaner_bbj_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_bm.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_bm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c38bdee947c2d34a4a7797eae1251fc476be0f53 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_bm.yaml @@ -0,0 +1,11 @@ +# Generated by utils.py +dataset_name: bm +doc_to_text: "Named entities refers to names of location, organisation and personal\ + \ name. \n For example, 'David is an employee of Amazon and he is visiting New York\ + \ next week to see Esther' will be \nPERSON: David $ ORGANIZATION: Amazon $ LOCATION:\ + \ New York $ PERSON: Esther \n\nEnsure the output strictly follows the format: label:\ + \ entity $ label: entity, with each unique entity on a separate label line, avoiding\ + \ grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries like\ + \ none. \n\nText: {{text}} \nReturn only the output" +include: masakhaner +task: masakhaner_bm_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_ee.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_ee.yaml new file mode 100644 index 0000000000000000000000000000000000000000..97903908e30cfe376626f7c668d5d9862593d73e --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_ee.yaml @@ -0,0 +1,11 @@ +# Generated by utils.py +dataset_name: ee +doc_to_text: "Named entities refers to names of location, organisation and personal\ + \ name. 
\n For example, 'David is an employee of Amazon and he is visiting New York\ + \ next week to see Esther' will be \nPERSON: David $ ORGANIZATION: Amazon $ LOCATION:\ + \ New York $ PERSON: Esther \n\nEnsure the output strictly follows the format: label:\ + \ entity $ label: entity, with each unique entity on a separate label line, avoiding\ + \ grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries like\ + \ none. \n\nText: {{text}} \nReturn only the output" +include: masakhaner +task: masakhaner_ee_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_ha.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_ha.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ad11710407bda79b9561cd05725c21eb945ca292 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_ha.yaml @@ -0,0 +1,11 @@ +# Generated by utils.py +dataset_name: ha +doc_to_text: "Named entities refers to names of location, organisation and personal\ + \ name. \n For example, 'David is an employee of Amazon and he is visiting New York\ + \ next week to see Esther' will be \nPERSON: David $ ORGANIZATION: Amazon $ LOCATION:\ + \ New York $ PERSON: Esther \n\nEnsure the output strictly follows the format: label:\ + \ entity $ label: entity, with each unique entity on a separate label line, avoiding\ + \ grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries like\ + \ none. 
\n\nText: {{text}} \nReturn only the output" +include: masakhaner +task: masakhaner_ha_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_ig.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_ig.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0f06c0655595ae81b985266e4e629b080d49130c --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_ig.yaml @@ -0,0 +1,11 @@ +# Generated by utils.py +dataset_name: ig +doc_to_text: "Named entities refers to names of location, organisation and personal\ + \ name. \n For example, 'David is an employee of Amazon and he is visiting New York\ + \ next week to see Esther' will be \nPERSON: David $ ORGANIZATION: Amazon $ LOCATION:\ + \ New York $ PERSON: Esther \n\nEnsure the output strictly follows the format: label:\ + \ entity $ label: entity, with each unique entity on a separate label line, avoiding\ + \ grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries like\ + \ none. \n\nText: {{text}} \nReturn only the output" +include: masakhaner +task: masakhaner_ig_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_lg.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_lg.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1823b20f63e1e4d2bcf5edc1427be758c3d16a62 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_lg.yaml @@ -0,0 +1,11 @@ +# Generated by utils.py +dataset_name: lg +doc_to_text: "Named entities refers to names of location, organisation and personal\ + \ name. 
\n For example, 'David is an employee of Amazon and he is visiting New York\ + \ next week to see Esther' will be \nPERSON: David $ ORGANIZATION: Amazon $ LOCATION:\ + \ New York $ PERSON: Esther \n\nEnsure the output strictly follows the format: label:\ + \ entity $ label: entity, with each unique entity on a separate label line, avoiding\ + \ grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries like\ + \ none. \n\nText: {{text}} \nReturn only the output" +include: masakhaner +task: masakhaner_lg_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_luo.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_luo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..55b6d82968ed9a09ef43a12bd5cba91ea4ab5c87 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_luo.yaml @@ -0,0 +1,11 @@ +# Generated by utils.py +dataset_name: luo +doc_to_text: "Named entities refers to names of location, organisation and personal\ + \ name. \n For example, 'David is an employee of Amazon and he is visiting New York\ + \ next week to see Esther' will be \nPERSON: David $ ORGANIZATION: Amazon $ LOCATION:\ + \ New York $ PERSON: Esther \n\nEnsure the output strictly follows the format: label:\ + \ entity $ label: entity, with each unique entity on a separate label line, avoiding\ + \ grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries like\ + \ none. 
\n\nText: {{text}} \nReturn only the output" +include: masakhaner +task: masakhaner_luo_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_mos.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_mos.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac5ddf43cd0eefd5fbe85785c7b4687135924938 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_mos.yaml @@ -0,0 +1,11 @@ +# Generated by utils.py +dataset_name: mos +doc_to_text: "Named entities refers to names of location, organisation and personal\ + \ name. \n For example, 'David is an employee of Amazon and he is visiting New York\ + \ next week to see Esther' will be \nPERSON: David $ ORGANIZATION: Amazon $ LOCATION:\ + \ New York $ PERSON: Esther \n\nEnsure the output strictly follows the format: label:\ + \ entity $ label: entity, with each unique entity on a separate label line, avoiding\ + \ grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries like\ + \ none. \n\nText: {{text}} \nReturn only the output" +include: masakhaner +task: masakhaner_mos_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_ny.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_ny.yaml new file mode 100644 index 0000000000000000000000000000000000000000..36d12ad2c00c0101aa2405af5824b6cbf310d132 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_ny.yaml @@ -0,0 +1,11 @@ +# Generated by utils.py +dataset_name: ny +doc_to_text: "Named entities refers to names of location, organisation and personal\ + \ name. 
\n For example, 'David is an employee of Amazon and he is visiting New York\ + \ next week to see Esther' will be \nPERSON: David $ ORGANIZATION: Amazon $ LOCATION:\ + \ New York $ PERSON: Esther \n\nEnsure the output strictly follows the format: label:\ + \ entity $ label: entity, with each unique entity on a separate label line, avoiding\ + \ grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries like\ + \ none. \n\nText: {{text}} \nReturn only the output" +include: masakhaner +task: masakhaner_ny_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_pcm.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_pcm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8c09bf44c682758e53deb9262e4aef393d8bbc8e --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_pcm.yaml @@ -0,0 +1,11 @@ +# Generated by utils.py +dataset_name: pcm +doc_to_text: "Named entities refers to names of location, organisation and personal\ + \ name. \n For example, 'David is an employee of Amazon and he is visiting New York\ + \ next week to see Esther' will be \nPERSON: David $ ORGANIZATION: Amazon $ LOCATION:\ + \ New York $ PERSON: Esther \n\nEnsure the output strictly follows the format: label:\ + \ entity $ label: entity, with each unique entity on a separate label line, avoiding\ + \ grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries like\ + \ none. 
\n\nText: {{text}} \nReturn only the output" +include: masakhaner +task: masakhaner_pcm_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_rw.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_rw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7398e5fbe7b55f837b900158b7fdc4a3b7e2ac92 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_rw.yaml @@ -0,0 +1,11 @@ +# Generated by utils.py +dataset_name: rw +doc_to_text: "Named entities refers to names of location, organisation and personal\ + \ name. \n For example, 'David is an employee of Amazon and he is visiting New York\ + \ next week to see Esther' will be \nPERSON: David $ ORGANIZATION: Amazon $ LOCATION:\ + \ New York $ PERSON: Esther \n\nEnsure the output strictly follows the format: label:\ + \ entity $ label: entity, with each unique entity on a separate label line, avoiding\ + \ grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries like\ + \ none. \n\nText: {{text}} \nReturn only the output" +include: masakhaner +task: masakhaner_rw_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_sn.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_sn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ecdd3260fcb8b0fd147dfba666aa0a42e4687323 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_sn.yaml @@ -0,0 +1,11 @@ +# Generated by utils.py +dataset_name: sn +doc_to_text: "Named entities refers to names of location, organisation and personal\ + \ name. 
\n For example, 'David is an employee of Amazon and he is visiting New York\ + \ next week to see Esther' will be \nPERSON: David $ ORGANIZATION: Amazon $ LOCATION:\ + \ New York $ PERSON: Esther \n\nEnsure the output strictly follows the format: label:\ + \ entity $ label: entity, with each unique entity on a separate label line, avoiding\ + \ grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries like\ + \ none. \n\nText: {{text}} \nReturn only the output" +include: masakhaner +task: masakhaner_sn_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_sw.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_sw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f2bd3379c3b3508067ae500c5cad947ba7006b74 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_sw.yaml @@ -0,0 +1,11 @@ +# Generated by utils.py +dataset_name: sw +doc_to_text: "Named entities refers to names of location, organisation and personal\ + \ name. \n For example, 'David is an employee of Amazon and he is visiting New York\ + \ next week to see Esther' will be \nPERSON: David $ ORGANIZATION: Amazon $ LOCATION:\ + \ New York $ PERSON: Esther \n\nEnsure the output strictly follows the format: label:\ + \ entity $ label: entity, with each unique entity on a separate label line, avoiding\ + \ grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries like\ + \ none. 
\n\nText: {{text}} \nReturn only the output" +include: masakhaner +task: masakhaner_sw_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_tn.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_tn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..50d80dcb79ab8f2bb0f423d42159dd1d56fd262f --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_tn.yaml @@ -0,0 +1,11 @@ +# Generated by utils.py +dataset_name: tn +doc_to_text: "Named entities refers to names of location, organisation and personal\ + \ name. \n For example, 'David is an employee of Amazon and he is visiting New York\ + \ next week to see Esther' will be \nPERSON: David $ ORGANIZATION: Amazon $ LOCATION:\ + \ New York $ PERSON: Esther \n\nEnsure the output strictly follows the format: label:\ + \ entity $ label: entity, with each unique entity on a separate label line, avoiding\ + \ grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries like\ + \ none. \n\nText: {{text}} \nReturn only the output" +include: masakhaner +task: masakhaner_tn_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_tw.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_tw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6c8a8d40575c4867cc0ec33a96343f1eb9c29f7b --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_tw.yaml @@ -0,0 +1,11 @@ +# Generated by utils.py +dataset_name: tw +doc_to_text: "Named entities refers to names of location, organisation and personal\ + \ name. 
\n For example, 'David is an employee of Amazon and he is visiting New York\ + \ next week to see Esther' will be \nPERSON: David $ ORGANIZATION: Amazon $ LOCATION:\ + \ New York $ PERSON: Esther \n\nEnsure the output strictly follows the format: label:\ + \ entity $ label: entity, with each unique entity on a separate label line, avoiding\ + \ grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries like\ + \ none. \n\nText: {{text}} \nReturn only the output" +include: masakhaner +task: masakhaner_tw_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_wo.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_wo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5e5f6eeaecb9ad56ee2cf035cf1390f489d8ba98 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_wo.yaml @@ -0,0 +1,11 @@ +# Generated by utils.py +dataset_name: wo +doc_to_text: "Named entities refers to names of location, organisation and personal\ + \ name. \n For example, 'David is an employee of Amazon and he is visiting New York\ + \ next week to see Esther' will be \nPERSON: David $ ORGANIZATION: Amazon $ LOCATION:\ + \ New York $ PERSON: Esther \n\nEnsure the output strictly follows the format: label:\ + \ entity $ label: entity, with each unique entity on a separate label line, avoiding\ + \ grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries like\ + \ none. 
\n\nText: {{text}} \nReturn only the output" +include: masakhaner +task: masakhaner_wo_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_xh.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_xh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8b27051f5df77d39561c0fcee81033a357a9220d --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_xh.yaml @@ -0,0 +1,11 @@ +# Generated by utils.py +dataset_name: xh +doc_to_text: "Named entities refers to names of location, organisation and personal\ + \ name. \n For example, 'David is an employee of Amazon and he is visiting New York\ + \ next week to see Esther' will be \nPERSON: David $ ORGANIZATION: Amazon $ LOCATION:\ + \ New York $ PERSON: Esther \n\nEnsure the output strictly follows the format: label:\ + \ entity $ label: entity, with each unique entity on a separate label line, avoiding\ + \ grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries like\ + \ none. \n\nText: {{text}} \nReturn only the output" +include: masakhaner +task: masakhaner_xh_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_yo.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_yo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2fdb71aa53d4eacdb5cfcdada220423909f9515d --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_yo.yaml @@ -0,0 +1,11 @@ +# Generated by utils.py +dataset_name: yo +doc_to_text: "Named entities refers to names of location, organisation and personal\ + \ name. 
\n For example, 'David is an employee of Amazon and he is visiting New York\ + \ next week to see Esther' will be \nPERSON: David $ ORGANIZATION: Amazon $ LOCATION:\ + \ New York $ PERSON: Esther \n\nEnsure the output strictly follows the format: label:\ + \ entity $ label: entity, with each unique entity on a separate label line, avoiding\ + \ grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries like\ + \ none. \n\nText: {{text}} \nReturn only the output" +include: masakhaner +task: masakhaner_yo_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_zu.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_zu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..83b9d4b0fd5ef3ab2d9f62639d49a9cea1e3a1a1 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_1/masakhaner_zu.yaml @@ -0,0 +1,11 @@ +# Generated by utils.py +dataset_name: zu +doc_to_text: "Named entities refers to names of location, organisation and personal\ + \ name. \n For example, 'David is an employee of Amazon and he is visiting New York\ + \ next week to see Esther' will be \nPERSON: David $ ORGANIZATION: Amazon $ LOCATION:\ + \ New York $ PERSON: Esther \n\nEnsure the output strictly follows the format: label:\ + \ entity $ label: entity, with each unique entity on a separate label line, avoiding\ + \ grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries like\ + \ none. 
\n\nText: {{text}} \nReturn only the output" +include: masakhaner +task: masakhaner_zu_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_1/utils.py b/lm_eval/tasks/afrobench/masakhaner/prompt_1/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..76909044e7f35948156f8bb506ce2fce563ec689 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_1/utils.py @@ -0,0 +1,146 @@ +import collections +import re + +from lm_eval.utils import weighted_f1_score + + +def doc_to_target(doc): + return transform_text(doc["ner_tags"]) + + +def transform_text(text): + entities = [] + current_entity = "" + current_tag = "" + + for pair in text.split("\n"): + if pair: # Check if the line is not empty + word, tag = pair.strip().split(": ") + tag = tag.upper() + word = word.lower() + word = word.strip(",.").strip() + + if tag.startswith("B-"): + if current_entity: + entities.append(f"{current_tag}: {current_entity}") + current_tag = tag.split("-")[1] + current_entity = word + elif tag.startswith("I-") and tag.split("-")[1] == current_tag: + current_entity += word + else: + if current_entity: + entities.append(f"{current_tag}: {current_entity}") + current_entity = "" + current_tag = "" + if current_entity: + entities.append(f"{current_tag}: {current_entity}") + + # Join all the transformed output lines with $$ as separator + return " $$ ".join(entities) + + +def span_f1_agg(items): + """Computes Span based F1 score. + + This function is copied from + https://github.com/google-research/multilingual-t5/blob/master/multilingual_t5/evaluation/metrics.py + + Args: + targets: list of strings or list of list of strings if multiple references + are present. 
+ predictions: list of strings + + Returns: + span f1 across all targets and predictions (Based on CoNLL script) + """ + unzipped_list = list(zip(*items)) + targets = unzipped_list[0] + predictions = unzipped_list[1] + + true_positives = collections.defaultdict(int) + false_positives = collections.defaultdict(int) + false_negatives = collections.defaultdict(int) + + def normalize_text(strings): + def get_blank_spaces_pattern(): + return re.compile(r"\s{3,}|\t") + + def remove_blank_spaces(text): + text = re.sub(pattern=get_blank_spaces_pattern(), repl="", string=text) + text = re.sub("\s+", " ", text) + return text + + def remove_punctuation(text): + my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@.""-,`' + text = re.sub( + "[" + my_punctuation + "]+", " ", str(text) + ) # strip punctuation + return text + + def remove_articles(text): + regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) + return re.sub(regex, " ", text) + + def lowercase(text): + text = text.lower() + return text + + strings = remove_punctuation(strings) + strings = remove_articles(strings) + strings = remove_blank_spaces(strings) + strings = lowercase(strings) + + return strings + + def tags_to_spans(tag_sequence, delimiter="$$"): + """Extract spans from IOB1 or BIO tags.""" + if isinstance(tag_sequence, list): + tag_sequence = " ".join(i.strip() for i in tag_sequence) + tag_sequence_split = [ + item.strip() + for sub in tag_sequence.strip().split(delimiter) + for item in sub.split("$") + if item + ] + tag_sequence_split = [ + item.strip() + for value in tag_sequence_split + for sub in value.split(". 
") + for item in sub.split(", ") + ] + tags_entities = [] + for tag_entity in tag_sequence_split: + tag_entity_split = tag_entity.split(": ") + if len(tag_entity_split) != 2: + continue + tag = normalize_text(tag_entity_split[0].strip()) + entity = normalize_text(tag_entity_split[1].rstrip().lstrip()) + tags_entities.append((tag, entity)) + return tags_entities + + def compute_f1_metrics(true_positive, false_positive, false_negative): + precision = float(true_positive) / float(true_positive + false_positive + 1e-13) + recall = float(true_positive) / float(true_positive + false_negative + 1e-13) + f1_measures = 2.0 * ((precision * recall) / (precision + recall + 1e-13)) + return precision, recall, f1_measures + + for target, pred in zip(targets, predictions): + gold_spans = tags_to_spans(target) + predicted_spans = tags_to_spans(pred) + + for span in predicted_spans: + if span in gold_spans: + true_positives[span[0]] += 1 + gold_spans.remove(span) + else: + false_positives[span[0]] += 1 + # These spans weren't predicted. 
+ for span in gold_spans: + false_negatives[span[0]] += 1 + + _, _, f1_measure = compute_f1_metrics( + sum(true_positives.values()), + sum(false_positives.values()), + sum(false_negatives.values()), + ) + return f1_measure diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner new file mode 100644 index 0000000000000000000000000000000000000000..2fd5eb829ce60a5a16d970dbf6b0078e42135cf4 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner @@ -0,0 +1,26 @@ +tag: +- masakhaner_tasks +- masakhaner_prompt_2 +dataset_path: masakhane/masakhaner-x +dataset_name: null +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +generation_kwargs: + do_sample: false + until: + - + - <|im_end|> +validation_split: validation +test_split: test +fewshot_split: train +doc_to_target: target +filter_list: + - name: flexible-extract + filter: + - function: format_span +metric_list: + - metric: f1 + aggregation: !function utils.span_f1_agg + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_am.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_am.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bd1bd33551e152f6f29ff9d248d76ec236be75d4 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_am.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: am +doc_to_text: "You are working as a named entity recognition expert and your task is\ + \ to label a given text with named entity labels. Your task is to identify and label\ + \ any named entities present in the text. The named entity labels that you will\ + \ be using are PER (person), LOC (location), ORG (organization) and DATE (date).\ + \ Label multi-word entities as a single named entity. For words which are not part\ + \ of any named entity, do not return any value for it. 
\nEnsure the output strictly\ + \ follows the format: label: entity $$ label: entity, with each unique entity on\ + \ a separate label line, avoiding grouped entities (e.g., avoid LOC: entity, entity)\ + \ or irrelevant entries like none. Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_am_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_bbj.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_bbj.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5d817ecbe3596b40fad80fe4b95af7ace7bd9e35 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_bbj.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: bbj +doc_to_text: "You are working as a named entity recognition expert and your task is\ + \ to label a given text with named entity labels. Your task is to identify and label\ + \ any named entities present in the text. The named entity labels that you will\ + \ be using are PER (person), LOC (location), ORG (organization) and DATE (date).\ + \ Label multi-word entities as a single named entity. For words which are not part\ + \ of any named entity, do not return any value for it. \nEnsure the output strictly\ + \ follows the format: label: entity $$ label: entity, with each unique entity on\ + \ a separate label line, avoiding grouped entities (e.g., avoid LOC: entity, entity)\ + \ or irrelevant entries like none. 
Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_bbj_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_bm.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_bm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f99a03c7486f8f7170d19b1935e223210915c137 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_bm.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: bm +doc_to_text: "You are working as a named entity recognition expert and your task is\ + \ to label a given text with named entity labels. Your task is to identify and label\ + \ any named entities present in the text. The named entity labels that you will\ + \ be using are PER (person), LOC (location), ORG (organization) and DATE (date).\ + \ Label multi-word entities as a single named entity. For words which are not part\ + \ of any named entity, do not return any value for it. \nEnsure the output strictly\ + \ follows the format: label: entity $$ label: entity, with each unique entity on\ + \ a separate label line, avoiding grouped entities (e.g., avoid LOC: entity, entity)\ + \ or irrelevant entries like none. Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_bm_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_ee.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_ee.yaml new file mode 100644 index 0000000000000000000000000000000000000000..da31685e7d9fb0fa462403e0347347600c705d41 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_ee.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: ee +doc_to_text: "You are working as a named entity recognition expert and your task is\ + \ to label a given text with named entity labels. Your task is to identify and label\ + \ any named entities present in the text. 
The named entity labels that you will\ + \ be using are PER (person), LOC (location), ORG (organization) and DATE (date).\ + \ Label multi-word entities as a single named entity. For words which are not part\ + \ of any named entity, do not return any value for it. \nEnsure the output strictly\ + \ follows the format: label: entity $$ label: entity, with each unique entity on\ + \ a separate label line, avoiding grouped entities (e.g., avoid LOC: entity, entity)\ + \ or irrelevant entries like none. Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_ee_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_ha.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_ha.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8075046a92f4c88ce8a75db0416732b4f0e96f45 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_ha.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: ha +doc_to_text: "You are working as a named entity recognition expert and your task is\ + \ to label a given text with named entity labels. Your task is to identify and label\ + \ any named entities present in the text. The named entity labels that you will\ + \ be using are PER (person), LOC (location), ORG (organization) and DATE (date).\ + \ Label multi-word entities as a single named entity. For words which are not part\ + \ of any named entity, do not return any value for it. \nEnsure the output strictly\ + \ follows the format: label: entity $$ label: entity, with each unique entity on\ + \ a separate label line, avoiding grouped entities (e.g., avoid LOC: entity, entity)\ + \ or irrelevant entries like none. 
Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_ha_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_ig.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_ig.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c8771f510a5de2a7ff56acd593b4de3b2309dd07 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_ig.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: ig +doc_to_text: "You are working as a named entity recognition expert and your task is\ + \ to label a given text with named entity labels. Your task is to identify and label\ + \ any named entities present in the text. The named entity labels that you will\ + \ be using are PER (person), LOC (location), ORG (organization) and DATE (date).\ + \ Label multi-word entities as a single named entity. For words which are not part\ + \ of any named entity, do not return any value for it. \nEnsure the output strictly\ + \ follows the format: label: entity $$ label: entity, with each unique entity on\ + \ a separate label line, avoiding grouped entities (e.g., avoid LOC: entity, entity)\ + \ or irrelevant entries like none. Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_ig_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_lg.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_lg.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5c6729e368b3cf53900035c5730d9b729a9e2baa --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_lg.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: lg +doc_to_text: "You are working as a named entity recognition expert and your task is\ + \ to label a given text with named entity labels. Your task is to identify and label\ + \ any named entities present in the text. 
The named entity labels that you will\ + \ be using are PER (person), LOC (location), ORG (organization) and DATE (date).\ + \ Label multi-word entities as a single named entity. For words which are not part\ + \ of any named entity, do not return any value for it. \nEnsure the output strictly\ + \ follows the format: label: entity $$ label: entity, with each unique entity on\ + \ a separate label line, avoiding grouped entities (e.g., avoid LOC: entity, entity)\ + \ or irrelevant entries like none. Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_lg_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_luo.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_luo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a458235f101ae430c77f9da1124a0b8d9fdcda38 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_luo.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: luo +doc_to_text: "You are working as a named entity recognition expert and your task is\ + \ to label a given text with named entity labels. Your task is to identify and label\ + \ any named entities present in the text. The named entity labels that you will\ + \ be using are PER (person), LOC (location), ORG (organization) and DATE (date).\ + \ Label multi-word entities as a single named entity. For words which are not part\ + \ of any named entity, do not return any value for it. \nEnsure the output strictly\ + \ follows the format: label: entity $$ label: entity, with each unique entity on\ + \ a separate label line, avoiding grouped entities (e.g., avoid LOC: entity, entity)\ + \ or irrelevant entries like none. 
Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_luo_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_mos.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_mos.yaml new file mode 100644 index 0000000000000000000000000000000000000000..816b9bdedc4578c6d2ade8cb05258a4ebc7280de --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_mos.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: mos +doc_to_text: "You are working as a named entity recognition expert and your task is\ + \ to label a given text with named entity labels. Your task is to identify and label\ + \ any named entities present in the text. The named entity labels that you will\ + \ be using are PER (person), LOC (location), ORG (organization) and DATE (date).\ + \ Label multi-word entities as a single named entity. For words which are not part\ + \ of any named entity, do not return any value for it. \nEnsure the output strictly\ + \ follows the format: label: entity $$ label: entity, with each unique entity on\ + \ a separate label line, avoiding grouped entities (e.g., avoid LOC: entity, entity)\ + \ or irrelevant entries like none. Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_mos_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_ny.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_ny.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2f8c4c13c89495b4da3b07ad432b3969310037f8 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_ny.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: ny +doc_to_text: "You are working as a named entity recognition expert and your task is\ + \ to label a given text with named entity labels. Your task is to identify and label\ + \ any named entities present in the text. 
The named entity labels that you will\ + \ be using are PER (person), LOC (location), ORG (organization) and DATE (date).\ + \ Label multi-word entities as a single named entity. For words which are not part\ + \ of any named entity, do not return any value for it. \nEnsure the output strictly\ + \ follows the format: label: entity $$ label: entity, with each unique entity on\ + \ a separate label line, avoiding grouped entities (e.g., avoid LOC: entity, entity)\ + \ or irrelevant entries like none. Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_ny_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_pcm.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_pcm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..75dc6ec048cab2dc550d8174dbcc44d966a6ff8e --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_pcm.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: pcm +doc_to_text: "You are working as a named entity recognition expert and your task is\ + \ to label a given text with named entity labels. Your task is to identify and label\ + \ any named entities present in the text. The named entity labels that you will\ + \ be using are PER (person), LOC (location), ORG (organization) and DATE (date).\ + \ Label multi-word entities as a single named entity. For words which are not part\ + \ of any named entity, do not return any value for it. \nEnsure the output strictly\ + \ follows the format: label: entity $$ label: entity, with each unique entity on\ + \ a separate label line, avoiding grouped entities (e.g., avoid LOC: entity, entity)\ + \ or irrelevant entries like none. 
Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_pcm_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_rw.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_rw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fb93e2d4b24c68bc46b0f141d4c43a51b39ea41e --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_rw.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: rw +doc_to_text: "You are working as a named entity recognition expert and your task is\ + \ to label a given text with named entity labels. Your task is to identify and label\ + \ any named entities present in the text. The named entity labels that you will\ + \ be using are PER (person), LOC (location), ORG (organization) and DATE (date).\ + \ Label multi-word entities as a single named entity. For words which are not part\ + \ of any named entity, do not return any value for it. \nEnsure the output strictly\ + \ follows the format: label: entity $$ label: entity, with each unique entity on\ + \ a separate label line, avoiding grouped entities (e.g., avoid LOC: entity, entity)\ + \ or irrelevant entries like none. Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_rw_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_sn.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_sn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..60380a512424253dc84f826922fc1fd2f9e75d72 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_sn.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: sn +doc_to_text: "You are working as a named entity recognition expert and your task is\ + \ to label a given text with named entity labels. Your task is to identify and label\ + \ any named entities present in the text. 
The named entity labels that you will\ + \ be using are PER (person), LOC (location), ORG (organization) and DATE (date).\ + \ Label multi-word entities as a single named entity. For words which are not part\ + \ of any named entity, do not return any value for it. \nEnsure the output strictly\ + \ follows the format: label: entity $$ label: entity, with each unique entity on\ + \ a separate label line, avoiding grouped entities (e.g., avoid LOC: entity, entity)\ + \ or irrelevant entries like none. Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_sn_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_sw.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_sw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..82cf74ae26a0cec7d9d659713c7d0203ab829a1f --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_sw.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: sw +doc_to_text: "You are working as a named entity recognition expert and your task is\ + \ to label a given text with named entity labels. Your task is to identify and label\ + \ any named entities present in the text. The named entity labels that you will\ + \ be using are PER (person), LOC (location), ORG (organization) and DATE (date).\ + \ Label multi-word entities as a single named entity. For words which are not part\ + \ of any named entity, do not return any value for it. \nEnsure the output strictly\ + \ follows the format: label: entity $$ label: entity, with each unique entity on\ + \ a separate label line, avoiding grouped entities (e.g., avoid LOC: entity, entity)\ + \ or irrelevant entries like none. 
Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_sw_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_tn.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_tn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1852ebe9ae79b51e586fed461b88e7b03fc8557c --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_tn.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: tn +doc_to_text: "You are working as a named entity recognition expert and your task is\ + \ to label a given text with named entity labels. Your task is to identify and label\ + \ any named entities present in the text. The named entity labels that you will\ + \ be using are PER (person), LOC (location), ORG (organization) and DATE (date).\ + \ Label multi-word entities as a single named entity. For words which are not part\ + \ of any named entity, do not return any value for it. \nEnsure the output strictly\ + \ follows the format: label: entity $$ label: entity, with each unique entity on\ + \ a separate label line, avoiding grouped entities (e.g., avoid LOC: entity, entity)\ + \ or irrelevant entries like none. Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_tn_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_tw.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_tw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ea354958bcf1590a41fc4e48426d403cae4f9454 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_tw.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: tw +doc_to_text: "You are working as a named entity recognition expert and your task is\ + \ to label a given text with named entity labels. Your task is to identify and label\ + \ any named entities present in the text. 
The named entity labels that you will\ + \ be using are PER (person), LOC (location), ORG (organization) and DATE (date).\ + \ Label multi-word entities as a single named entity. For words which are not part\ + \ of any named entity, do not return any value for it. \nEnsure the output strictly\ + \ follows the format: label: entity $$ label: entity, with each unique entity on\ + \ a separate label line, avoiding grouped entities (e.g., avoid LOC: entity, entity)\ + \ or irrelevant entries like none. Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_tw_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_wo.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_wo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e7cd0d754be2d6fc09054f50d18d29fa07a8551a --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_wo.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: wo +doc_to_text: "You are working as a named entity recognition expert and your task is\ + \ to label a given text with named entity labels. Your task is to identify and label\ + \ any named entities present in the text. The named entity labels that you will\ + \ be using are PER (person), LOC (location), ORG (organization) and DATE (date).\ + \ Label multi-word entities as a single named entity. For words which are not part\ + \ of any named entity, do not return any value for it. \nEnsure the output strictly\ + \ follows the format: label: entity $$ label: entity, with each unique entity on\ + \ a separate label line, avoiding grouped entities (e.g., avoid LOC: entity, entity)\ + \ or irrelevant entries like none. 
Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_wo_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_xh.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_xh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9451f0edd121337f8d4dd316284c05a8ea73ff6b --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_xh.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: xh +doc_to_text: "You are working as a named entity recognition expert and your task is\ + \ to label a given text with named entity labels. Your task is to identify and label\ + \ any named entities present in the text. The named entity labels that you will\ + \ be using are PER (person), LOC (location), ORG (organization) and DATE (date).\ + \ Label multi-word entities as a single named entity. For words which are not part\ + \ of any named entity, do not return any value for it. \nEnsure the output strictly\ + \ follows the format: label: entity $$ label: entity, with each unique entity on\ + \ a separate label line, avoiding grouped entities (e.g., avoid LOC: entity, entity)\ + \ or irrelevant entries like none. Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_xh_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_yo.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_yo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fc0d92c50ed1cf7182b55ed192b2b440d58317e9 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_yo.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: yo +doc_to_text: "You are working as a named entity recognition expert and your task is\ + \ to label a given text with named entity labels. Your task is to identify and label\ + \ any named entities present in the text. 
The named entity labels that you will\ + \ be using are PER (person), LOC (location), ORG (organization) and DATE (date).\ + \ Label multi-word entities as a single named entity. For words which are not part\ + \ of any named entity, do not return any value for it. \nEnsure the output strictly\ + \ follows the format: label: entity $$ label: entity, with each unique entity on\ + \ a separate label line, avoiding grouped entities (e.g., avoid LOC: entity, entity)\ + \ or irrelevant entries like none. Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_yo_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_zu.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_zu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e06bf3cef883c5ae3f9bf12d7abf5c27618d37e8 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_2/masakhaner_zu.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: zu +doc_to_text: "You are working as a named entity recognition expert and your task is\ + \ to label a given text with named entity labels. Your task is to identify and label\ + \ any named entities present in the text. The named entity labels that you will\ + \ be using are PER (person), LOC (location), ORG (organization) and DATE (date).\ + \ Label multi-word entities as a single named entity. For words which are not part\ + \ of any named entity, do not return any value for it. \nEnsure the output strictly\ + \ follows the format: label: entity $$ label: entity, with each unique entity on\ + \ a separate label line, avoiding grouped entities (e.g., avoid LOC: entity, entity)\ + \ or irrelevant entries like none. 
Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_zu_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_2/utils.py b/lm_eval/tasks/afrobench/masakhaner/prompt_2/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..76909044e7f35948156f8bb506ce2fce563ec689 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_2/utils.py @@ -0,0 +1,146 @@ +import collections +import re + +from lm_eval.utils import weighted_f1_score + + +def doc_to_target(doc): + return transform_text(doc["ner_tags"]) + + +def transform_text(text): + entities = [] + current_entity = "" + current_tag = "" + + for pair in text.split("\n"): + if pair: # Check if the line is not empty + word, tag = pair.strip().split(": ") + tag = tag.upper() + word = word.lower() + word = word.strip(",.").strip() + + if tag.startswith("B-"): + if current_entity: + entities.append(f"{current_tag}: {current_entity}") + current_tag = tag.split("-")[1] + current_entity = word + elif tag.startswith("I-") and tag.split("-")[1] == current_tag: + current_entity += word + else: + if current_entity: + entities.append(f"{current_tag}: {current_entity}") + current_entity = "" + current_tag = "" + if current_entity: + entities.append(f"{current_tag}: {current_entity}") + + # Join all the transformed output lines with $$ as separator + return " $$ ".join(entities) + + +def span_f1_agg(items): + """Computes Span based F1 score. + + This function is copied from + https://github.com/google-research/multilingual-t5/blob/master/multilingual_t5/evaluation/metrics.py + + Args: + targets: list of strings or list of list of strings if multiple references + are present. 
+ predictions: list of strings + + Returns: + span f1 across all targets and predictions (Based on CoNLL script) + """ + unzipped_list = list(zip(*items)) + targets = unzipped_list[0] + predictions = unzipped_list[1] + + true_positives = collections.defaultdict(int) + false_positives = collections.defaultdict(int) + false_negatives = collections.defaultdict(int) + + def normalize_text(strings): + def get_blank_spaces_pattern(): + return re.compile(r"\s{3,}|\t") + + def remove_blank_spaces(text): + text = re.sub(pattern=get_blank_spaces_pattern(), repl="", string=text) + text = re.sub("\s+", " ", text) + return text + + def remove_punctuation(text): + my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@.""-,`' + text = re.sub( + "[" + my_punctuation + "]+", " ", str(text) + ) # strip punctuation + return text + + def remove_articles(text): + regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) + return re.sub(regex, " ", text) + + def lowercase(text): + text = text.lower() + return text + + strings = remove_punctuation(strings) + strings = remove_articles(strings) + strings = remove_blank_spaces(strings) + strings = lowercase(strings) + + return strings + + def tags_to_spans(tag_sequence, delimiter="$$"): + """Extract spans from IOB1 or BIO tags.""" + if isinstance(tag_sequence, list): + tag_sequence = " ".join(i.strip() for i in tag_sequence) + tag_sequence_split = [ + item.strip() + for sub in tag_sequence.strip().split(delimiter) + for item in sub.split("$") + if item + ] + tag_sequence_split = [ + item.strip() + for value in tag_sequence_split + for sub in value.split(". 
") + for item in sub.split(", ") + ] + tags_entities = [] + for tag_entity in tag_sequence_split: + tag_entity_split = tag_entity.split(": ") + if len(tag_entity_split) != 2: + continue + tag = normalize_text(tag_entity_split[0].strip()) + entity = normalize_text(tag_entity_split[1].rstrip().lstrip()) + tags_entities.append((tag, entity)) + return tags_entities + + def compute_f1_metrics(true_positive, false_positive, false_negative): + precision = float(true_positive) / float(true_positive + false_positive + 1e-13) + recall = float(true_positive) / float(true_positive + false_negative + 1e-13) + f1_measures = 2.0 * ((precision * recall) / (precision + recall + 1e-13)) + return precision, recall, f1_measures + + for target, pred in zip(targets, predictions): + gold_spans = tags_to_spans(target) + predicted_spans = tags_to_spans(pred) + + for span in predicted_spans: + if span in gold_spans: + true_positives[span[0]] += 1 + gold_spans.remove(span) + else: + false_positives[span[0]] += 1 + # These spans weren't predicted. 
+ for span in gold_spans: + false_negatives[span[0]] += 1 + + _, _, f1_measure = compute_f1_metrics( + sum(true_positives.values()), + sum(false_positives.values()), + sum(false_negatives.values()), + ) + return f1_measure diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner new file mode 100644 index 0000000000000000000000000000000000000000..7f32f86b1e194826a7ffe7d4edb0935eac80c491 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner @@ -0,0 +1,26 @@ +tag: +- masakhaner_tasks +- masakhaner_prompt_3 +dataset_path: masakhane/masakhaner-x +dataset_name: null +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +generation_kwargs: + do_sample: false + until: + - + - <|im_end|> +validation_split: validation +test_split: test +fewshot_split: train +doc_to_target: target +filter_list: + - name: flexible-extract + filter: + - function: format_span +metric_list: + - metric: f1 + aggregation: !function utils.span_f1_agg + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_am.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_am.yaml new file mode 100644 index 0000000000000000000000000000000000000000..54ad8b54111743ecf392d944c6201bfc56e5362c --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_am.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: am +doc_to_text: "You are a Named Entity Recognition expert in Amharic language. \nExtract\ + \ all named entities from the following Amharic text and categorize them into PERSON,\ + \ LOCATION, ORGANIZATION, or DATE. Ensure the output strictly follows the format:\ + \ label: entity $$ label: entity, with each unique entity on a separate label line,\ + \ avoiding grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries\ + \ like none. 
Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_am_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_bbj.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_bbj.yaml new file mode 100644 index 0000000000000000000000000000000000000000..23e724f424a862308d1947b176cc24b5eb040d47 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_bbj.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: bbj +doc_to_text: "You are a Named Entity Recognition expert in Ghomala language. \nExtract\ + \ all named entities from the following Ghomala text and categorize them into PERSON,\ + \ LOCATION, ORGANIZATION, or DATE. Ensure the output strictly follows the format:\ + \ label: entity $$ label: entity, with each unique entity on a separate label line,\ + \ avoiding grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries\ + \ like none. Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_bbj_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_bm.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_bm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..62b5b80e7c26335f304f7ed9673dd9f1c94ef970 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_bm.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: bm +doc_to_text: "You are a Named Entity Recognition expert in Bambara language. \nExtract\ + \ all named entities from the following Bambara text and categorize them into PERSON,\ + \ LOCATION, ORGANIZATION, or DATE. Ensure the output strictly follows the format:\ + \ label: entity $$ label: entity, with each unique entity on a separate label line,\ + \ avoiding grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries\ + \ like none. 
Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_bm_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_ee.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_ee.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2cdadd27559ab964cacfd396be6fb29a3ae392e8 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_ee.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: ee +doc_to_text: "You are a Named Entity Recognition expert in Ewe language. \nExtract\ + \ all named entities from the following Ewe text and categorize them into PERSON,\ + \ LOCATION, ORGANIZATION, or DATE. Ensure the output strictly follows the format:\ + \ label: entity $$ label: entity, with each unique entity on a separate label line,\ + \ avoiding grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries\ + \ like none. Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_ee_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_ha.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_ha.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9d19d26f67447f8916f74b5e26d1bccc0e65bc57 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_ha.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: ha +doc_to_text: "You are a Named Entity Recognition expert in Hausa language. \nExtract\ + \ all named entities from the following Hausa text and categorize them into PERSON,\ + \ LOCATION, ORGANIZATION, or DATE. Ensure the output strictly follows the format:\ + \ label: entity $$ label: entity, with each unique entity on a separate label line,\ + \ avoiding grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries\ + \ like none. 
Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_ha_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_ig.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_ig.yaml new file mode 100644 index 0000000000000000000000000000000000000000..edf6119689b51760de8a77b70c7dd4461f41b99d --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_ig.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: ig +doc_to_text: "You are a Named Entity Recognition expert in Igbo language. \nExtract\ + \ all named entities from the following Igbo text and categorize them into PERSON,\ + \ LOCATION, ORGANIZATION, or DATE. Ensure the output strictly follows the format:\ + \ label: entity $$ label: entity, with each unique entity on a separate label line,\ + \ avoiding grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries\ + \ like none. Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_ig_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_lg.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_lg.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9318a78207c0f3f9e6d2de954e6707100143258e --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_lg.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: lg +doc_to_text: "You are a Named Entity Recognition expert in Luganda language. \nExtract\ + \ all named entities from the following Luganda text and categorize them into PERSON,\ + \ LOCATION, ORGANIZATION, or DATE. Ensure the output strictly follows the format:\ + \ label: entity $$ label: entity, with each unique entity on a separate label line,\ + \ avoiding grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries\ + \ like none. 
Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_lg_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_luo.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_luo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..61254fc358a2bd8f2ed2a5dd6aca3d09f88ae232 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_luo.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: luo +doc_to_text: "You are a Named Entity Recognition expert in Luo language. \nExtract\ + \ all named entities from the following Luo text and categorize them into PERSON,\ + \ LOCATION, ORGANIZATION, or DATE. Ensure the output strictly follows the format:\ + \ label: entity $$ label: entity, with each unique entity on a separate label line,\ + \ avoiding grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries\ + \ like none. Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_luo_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_mos.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_mos.yaml new file mode 100644 index 0000000000000000000000000000000000000000..84ff6b24aaffaf14564de139bde92b1c850bcddd --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_mos.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: mos +doc_to_text: "You are a Named Entity Recognition expert in Mossi language. \nExtract\ + \ all named entities from the following Mossi text and categorize them into PERSON,\ + \ LOCATION, ORGANIZATION, or DATE. Ensure the output strictly follows the format:\ + \ label: entity $$ label: entity, with each unique entity on a separate label line,\ + \ avoiding grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries\ + \ like none. 
Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_mos_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_ny.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_ny.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bd592c5b93e39a488d5c5d909137e1caada54840 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_ny.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: ny +doc_to_text: "You are a Named Entity Recognition expert in Chichewa language. \nExtract\ + \ all named entities from the following Chichewa text and categorize them into PERSON,\ + \ LOCATION, ORGANIZATION, or DATE. Ensure the output strictly follows the format:\ + \ label: entity $$ label: entity, with each unique entity on a separate label line,\ + \ avoiding grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries\ + \ like none. Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_ny_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_pcm.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_pcm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b448b244b8d6580c0ebc53817060de633dd39efb --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_pcm.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: pcm +doc_to_text: "You are a Named Entity Recognition expert in Nigerian Pidgin language.\ + \ \nExtract all named entities from the following Nigerian Pidgin text and categorize\ + \ them into PERSON, LOCATION, ORGANIZATION, or DATE. Ensure the output strictly\ + \ follows the format: label: entity $$ label: entity, with each unique entity on\ + \ a separate label line, avoiding grouped entities (e.g., avoid LOC: entity, entity)\ + \ or irrelevant entries like none. 
Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_pcm_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_rw.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_rw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5356ce8b011c53b77ab18302512e30b52c727e37 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_rw.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: rw +doc_to_text: "You are a Named Entity Recognition expert in Kinyarwanda language. \n\ + Extract all named entities from the following Kinyarwanda text and categorize them\ + \ into PERSON, LOCATION, ORGANIZATION, or DATE. Ensure the output strictly follows\ + \ the format: label: entity $$ label: entity, with each unique entity on a separate\ + \ label line, avoiding grouped entities (e.g., avoid LOC: entity, entity) or irrelevant\ + \ entries like none. Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_rw_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_sn.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_sn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ab356ae80061b8c49348513b8f6fa05b2dea9473 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_sn.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: sn +doc_to_text: "You are a Named Entity Recognition expert in chiShona language. \nExtract\ + \ all named entities from the following chiShona text and categorize them into PERSON,\ + \ LOCATION, ORGANIZATION, or DATE. Ensure the output strictly follows the format:\ + \ label: entity $$ label: entity, with each unique entity on a separate label line,\ + \ avoiding grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries\ + \ like none. 
Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_sn_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_sw.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_sw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bb3d69595796755c03f4f02f201d36e121e4b6bd --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_sw.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: sw +doc_to_text: "You are a Named Entity Recognition expert in Kiswahili language. \n\ + Extract all named entities from the following Kiswahili text and categorize them\ + \ into PERSON, LOCATION, ORGANIZATION, or DATE. Ensure the output strictly follows\ + \ the format: label: entity $$ label: entity, with each unique entity on a separate\ + \ label line, avoiding grouped entities (e.g., avoid LOC: entity, entity) or irrelevant\ + \ entries like none. Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_sw_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_tn.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_tn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d42d164ad81853e92855fd51e73bd0e276d4c761 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_tn.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: tn +doc_to_text: "You are a Named Entity Recognition expert in Setswana language. \nExtract\ + \ all named entities from the following Setswana text and categorize them into PERSON,\ + \ LOCATION, ORGANIZATION, or DATE. Ensure the output strictly follows the format:\ + \ label: entity $$ label: entity, with each unique entity on a separate label line,\ + \ avoiding grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries\ + \ like none. 
Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_tn_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_tw.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_tw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..62b4e2af7c4916e851c834a27c8357a91140d45b --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_tw.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: tw +doc_to_text: "You are a Named Entity Recognition expert in Twi language. \nExtract\ + \ all named entities from the following Twi text and categorize them into PERSON,\ + \ LOCATION, ORGANIZATION, or DATE. Ensure the output strictly follows the format:\ + \ label: entity $$ label: entity, with each unique entity on a separate label line,\ + \ avoiding grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries\ + \ like none. Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_tw_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_wo.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_wo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6db45e2bccb590f2586494d3fb586b3e5966a17b --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_wo.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: wo +doc_to_text: "You are a Named Entity Recognition expert in Wolof language. \nExtract\ + \ all named entities from the following Wolof text and categorize them into PERSON,\ + \ LOCATION, ORGANIZATION, or DATE. Ensure the output strictly follows the format:\ + \ label: entity $$ label: entity, with each unique entity on a separate label line,\ + \ avoiding grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries\ + \ like none. 
Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_wo_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_xh.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_xh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6a697b274e71c54293edab7d6dbefabd719843ea --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_xh.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: xh +doc_to_text: "You are a Named Entity Recognition expert in isiXhosa language. \nExtract\ + \ all named entities from the following isiXhosa text and categorize them into PERSON,\ + \ LOCATION, ORGANIZATION, or DATE. Ensure the output strictly follows the format:\ + \ label: entity $$ label: entity, with each unique entity on a separate label line,\ + \ avoiding grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries\ + \ like none. Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_xh_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_yo.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_yo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..589cd5b35a6b27c8a9075ee22b9f8fd98a11860d --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_yo.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: yo +doc_to_text: "You are a Named Entity Recognition expert in Yoruba language. \nExtract\ + \ all named entities from the following Yoruba text and categorize them into PERSON,\ + \ LOCATION, ORGANIZATION, or DATE. Ensure the output strictly follows the format:\ + \ label: entity $$ label: entity, with each unique entity on a separate label line,\ + \ avoiding grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries\ + \ like none. 
Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_yo_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_zu.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_zu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c25d5a0c89da79ea982a35f8d032c3c489a16a80 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_3/masakhaner_zu.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: zu +doc_to_text: "You are a Named Entity Recognition expert in isiZulu language. \nExtract\ + \ all named entities from the following isiZulu text and categorize them into PERSON,\ + \ LOCATION, ORGANIZATION, or DATE. Ensure the output strictly follows the format:\ + \ label: entity $$ label: entity, with each unique entity on a separate label line,\ + \ avoiding grouped entities (e.g., avoid LOC: entity, entity) or irrelevant entries\ + \ like none. Return only the output \n\nText: {{text}}" +include: masakhaner +task: masakhaner_zu_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_3/utils.py b/lm_eval/tasks/afrobench/masakhaner/prompt_3/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..76909044e7f35948156f8bb506ce2fce563ec689 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_3/utils.py @@ -0,0 +1,146 @@ +import collections +import re + +from lm_eval.utils import weighted_f1_score + + +def doc_to_target(doc): + return transform_text(doc["ner_tags"]) + + +def transform_text(text): + entities = [] + current_entity = "" + current_tag = "" + + for pair in text.split("\n"): + if pair: # Check if the line is not empty + word, tag = pair.strip().split(": ") + tag = tag.upper() + word = word.lower() + word = word.strip(",.").strip() + + if tag.startswith("B-"): + if current_entity: + entities.append(f"{current_tag}: {current_entity}") + current_tag = tag.split("-")[1] + current_entity = word + elif tag.startswith("I-") and tag.split("-")[1] == 
def span_f1_agg(items):
    """Compute span-based F1 score (based on the CoNLL script).

    Adapted from
    https://github.com/google-research/multilingual-t5/blob/master/multilingual_t5/evaluation/metrics.py

    Args:
        items: iterable of (target, prediction) pairs, each a string listing
            spans as "TAG: entity $$ TAG: entity".

    Returns:
        Micro-averaged span F1 across all targets and predictions.
    """
    unzipped_list = list(zip(*items))
    targets = unzipped_list[0]
    predictions = unzipped_list[1]

    true_positives = collections.defaultdict(int)
    false_positives = collections.defaultdict(int)
    false_negatives = collections.defaultdict(int)

    def normalize_text(strings):
        # Characters stripped from tags/entities before comparison.  The set
        # is inserted with re.escape so "-" and the curly quotes are treated
        # literally; the previous unescaped class could raise
        # "re.error: bad character range" (descending range U+201D to ",").
        my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@.“”-,`'
        strings = re.sub("[" + re.escape(my_punctuation) + "]+", " ", str(strings))
        # Drop English articles; they carry no signal for span matching.
        strings = re.sub(r"\b(a|an|the)\b", " ", strings, flags=re.UNICODE)
        # Collapse whitespace (raw strings avoid invalid-escape warnings
        # for "\s" on Python 3.12+).
        strings = re.sub(r"\s{3,}|\t", "", strings)
        strings = re.sub(r"\s+", " ", strings)
        return strings.lower()

    def tags_to_spans(tag_sequence, delimiter="$$"):
        """Extract (tag, entity) pairs from a "TAG: entity $$ ..." string."""
        if isinstance(tag_sequence, list):
            tag_sequence = " ".join(i.strip() for i in tag_sequence)
        tag_sequence_split = [
            item.strip()
            for sub in tag_sequence.strip().split(delimiter)
            for item in sub.split("$")
            if item
        ]
        # Also split outputs that used ". " or ", " as span separators.
        tag_sequence_split = [
            item.strip()
            for value in tag_sequence_split
            for sub in value.split(". ")
            for item in sub.split(", ")
        ]
        tags_entities = []
        for tag_entity in tag_sequence_split:
            tag_entity_split = tag_entity.split(": ")
            if len(tag_entity_split) != 2:
                continue  # not a well-formed "TAG: entity" chunk
            tag = normalize_text(tag_entity_split[0].strip())
            entity = normalize_text(tag_entity_split[1].strip())
            tags_entities.append((tag, entity))
        return tags_entities

    def compute_f1_metrics(true_positive, false_positive, false_negative):
        # The 1e-13 terms guard against division by zero on empty counts.
        precision = float(true_positive) / float(true_positive + false_positive + 1e-13)
        recall = float(true_positive) / float(true_positive + false_negative + 1e-13)
        f1_measures = 2.0 * ((precision * recall) / (precision + recall + 1e-13))
        return precision, recall, f1_measures

    for target, pred in zip(targets, predictions):
        gold_spans = tags_to_spans(target)
        predicted_spans = tags_to_spans(pred)

        for span in predicted_spans:
            if span in gold_spans:
                true_positives[span[0]] += 1
                gold_spans.remove(span)
            else:
                false_positives[span[0]] += 1
        # Remaining gold spans were not predicted at all.
        for span in gold_spans:
            false_negatives[span[0]] += 1

    _, _, f1_measure = compute_f1_metrics(
        sum(true_positives.values()),
        sum(false_positives.values()),
        sum(false_negatives.values()),
    )
    return f1_measure
Return only the output.\ + \ \n\nText: {{text}}" +include: masakhaner +task: masakhaner_am_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_bbj.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_bbj.yaml new file mode 100644 index 0000000000000000000000000000000000000000..03ed5210a03e4ce5bc9afde45166519152a153c7 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_bbj.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: bbj +doc_to_text: "As a Ghomala linguist, label all named entities in the Ghomala text\ + \ below with the categories: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the\ + \ output strictly follows the format: label: entity $$ label: entity, with each\ + \ unique entity on a separate label line, avoiding grouped entities (e.g., avoid\ + \ LOC: entity, entity) or irrelevant entries like none. Return only the output.\ + \ \n\nText: {{text}}" +include: masakhaner +task: masakhaner_bbj_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_bm.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_bm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e719db9ac9f7ffb2011e7af1f12bf6319cb1b9cc --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_bm.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: bm +doc_to_text: "As a Bambara linguist, label all named entities in the Bambara text\ + \ below with the categories: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the\ + \ output strictly follows the format: label: entity $$ label: entity, with each\ + \ unique entity on a separate label line, avoiding grouped entities (e.g., avoid\ + \ LOC: entity, entity) or irrelevant entries like none. 
Return only the output.\ + \ \n\nText: {{text}}" +include: masakhaner +task: masakhaner_bm_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_ee.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_ee.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fe5fc75d28eef016bac579673cf1db875ee0a9f3 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_ee.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: ee +doc_to_text: "As a Ewe linguist, label all named entities in the Ewe text below with\ + \ the categories: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output strictly\ + \ follows the format: label: entity $$ label: entity, with each unique entity on\ + \ a separate label line, avoiding grouped entities (e.g., avoid LOC: entity, entity)\ + \ or irrelevant entries like none. Return only the output. \n\nText: {{text}}" +include: masakhaner +task: masakhaner_ee_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_ha.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_ha.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9f88b9d19d4545c1131573a2303c6014421ed54b --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_ha.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: ha +doc_to_text: "As a Hausa linguist, label all named entities in the Hausa text below\ + \ with the categories: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output\ + \ strictly follows the format: label: entity $$ label: entity, with each unique\ + \ entity on a separate label line, avoiding grouped entities (e.g., avoid LOC: entity,\ + \ entity) or irrelevant entries like none. Return only the output. 
\n\nText: {{text}}" +include: masakhaner +task: masakhaner_ha_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_ig.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_ig.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d4712d7e8edd36a87c59c9f0bc759f7b884ec830 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_ig.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: ig +doc_to_text: "As a Igbo linguist, label all named entities in the Igbo text below\ + \ with the categories: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output\ + \ strictly follows the format: label: entity $$ label: entity, with each unique\ + \ entity on a separate label line, avoiding grouped entities (e.g., avoid LOC: entity,\ + \ entity) or irrelevant entries like none. Return only the output. \n\nText: {{text}}" +include: masakhaner +task: masakhaner_ig_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_lg.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_lg.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cd7bde4a6f098bc8e946d83b556a00e38de43a5b --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_lg.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: lg +doc_to_text: "As a Luganda linguist, label all named entities in the Luganda text\ + \ below with the categories: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the\ + \ output strictly follows the format: label: entity $$ label: entity, with each\ + \ unique entity on a separate label line, avoiding grouped entities (e.g., avoid\ + \ LOC: entity, entity) or irrelevant entries like none. 
Return only the output.\ + \ \n\nText: {{text}}" +include: masakhaner +task: masakhaner_lg_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_luo.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_luo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..92c0ddfa2c58c90a9da84e3dd3e002f9eb8c1098 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_luo.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: luo +doc_to_text: "As a Luo linguist, label all named entities in the Luo text below with\ + \ the categories: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output strictly\ + \ follows the format: label: entity $$ label: entity, with each unique entity on\ + \ a separate label line, avoiding grouped entities (e.g., avoid LOC: entity, entity)\ + \ or irrelevant entries like none. Return only the output. \n\nText: {{text}}" +include: masakhaner +task: masakhaner_luo_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_mos.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_mos.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2eb75d8e554dba222cf9f92fc6c8f013e7d232b2 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_mos.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: mos +doc_to_text: "As a Mossi linguist, label all named entities in the Mossi text below\ + \ with the categories: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output\ + \ strictly follows the format: label: entity $$ label: entity, with each unique\ + \ entity on a separate label line, avoiding grouped entities (e.g., avoid LOC: entity,\ + \ entity) or irrelevant entries like none. Return only the output. 
\n\nText: {{text}}" +include: masakhaner +task: masakhaner_mos_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_ny.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_ny.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e8cb8218aff23a34f48455b2cbb897112a407f06 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_ny.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: ny +doc_to_text: "As a Chichewa linguist, label all named entities in the Chichewa text\ + \ below with the categories: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the\ + \ output strictly follows the format: label: entity $$ label: entity, with each\ + \ unique entity on a separate label line, avoiding grouped entities (e.g., avoid\ + \ LOC: entity, entity) or irrelevant entries like none. Return only the output.\ + \ \n\nText: {{text}}" +include: masakhaner +task: masakhaner_ny_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_pcm.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_pcm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..93f8ae3adb27f74df18c13a0ed886d176b62eead --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_pcm.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: pcm +doc_to_text: "As a Nigerian Pidgin linguist, label all named entities in the Nigerian\ + \ Pidgin text below with the categories: PERSON, LOCATION, ORGANIZATION, and DATE.\ + \ Ensure the output strictly follows the format: label: entity $$ label: entity,\ + \ with each unique entity on a separate label line, avoiding grouped entities (e.g.,\ + \ avoid LOC: entity, entity) or irrelevant entries like none. 
Return only the output.\ + \ \n\nText: {{text}}" +include: masakhaner +task: masakhaner_pcm_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_rw.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_rw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d64d49925bf1668e0f2b5ebdb6516f4ba0668f88 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_rw.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: rw +doc_to_text: "As a Kinyarwanda linguist, label all named entities in the Kinyarwanda\ + \ text below with the categories: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure\ + \ the output strictly follows the format: label: entity $$ label: entity, with each\ + \ unique entity on a separate label line, avoiding grouped entities (e.g., avoid\ + \ LOC: entity, entity) or irrelevant entries like none. Return only the output.\ + \ \n\nText: {{text}}" +include: masakhaner +task: masakhaner_rw_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_sn.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_sn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..40230fb1cebb4f1aa9d9001360389ef1cdfda64e --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_sn.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: sn +doc_to_text: "As a chiShona linguist, label all named entities in the chiShona text\ + \ below with the categories: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the\ + \ output strictly follows the format: label: entity $$ label: entity, with each\ + \ unique entity on a separate label line, avoiding grouped entities (e.g., avoid\ + \ LOC: entity, entity) or irrelevant entries like none. 
Return only the output.\ + \ \n\nText: {{text}}" +include: masakhaner +task: masakhaner_sn_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_sw.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_sw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7b27554ddd1faa7a796b5249c340b4204fdfaa5a --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_sw.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: sw +doc_to_text: "As a Kiswahili linguist, label all named entities in the Kiswahili text\ + \ below with the categories: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the\ + \ output strictly follows the format: label: entity $$ label: entity, with each\ + \ unique entity on a separate label line, avoiding grouped entities (e.g., avoid\ + \ LOC: entity, entity) or irrelevant entries like none. Return only the output.\ + \ \n\nText: {{text}}" +include: masakhaner +task: masakhaner_sw_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_tn.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_tn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..88080456ba554e20a63f3c68638908b31d5294cd --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_tn.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: tn +doc_to_text: "As a Setswana linguist, label all named entities in the Setswana text\ + \ below with the categories: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the\ + \ output strictly follows the format: label: entity $$ label: entity, with each\ + \ unique entity on a separate label line, avoiding grouped entities (e.g., avoid\ + \ LOC: entity, entity) or irrelevant entries like none. 
Return only the output.\ + \ \n\nText: {{text}}" +include: masakhaner +task: masakhaner_tn_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_tw.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_tw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8d2eec6befd2442e314abba2e2655dc8aa0baf4a --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_tw.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: tw +doc_to_text: "As a Twi linguist, label all named entities in the Twi text below with\ + \ the categories: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output strictly\ + \ follows the format: label: entity $$ label: entity, with each unique entity on\ + \ a separate label line, avoiding grouped entities (e.g., avoid LOC: entity, entity)\ + \ or irrelevant entries like none. Return only the output. \n\nText: {{text}}" +include: masakhaner +task: masakhaner_tw_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_wo.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_wo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..41501cb385fe50b38d5d30afe592da4705585881 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_wo.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: wo +doc_to_text: "As a Wolof linguist, label all named entities in the Wolof text below\ + \ with the categories: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output\ + \ strictly follows the format: label: entity $$ label: entity, with each unique\ + \ entity on a separate label line, avoiding grouped entities (e.g., avoid LOC: entity,\ + \ entity) or irrelevant entries like none. Return only the output. 
\n\nText: {{text}}" +include: masakhaner +task: masakhaner_wo_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_xh.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_xh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4b29fda3f444dd526ee7cc94f6579e74b6f63b97 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_xh.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: xh +doc_to_text: "As a isiXhosa linguist, label all named entities in the isiXhosa text\ + \ below with the categories: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the\ + \ output strictly follows the format: label: entity $$ label: entity, with each\ + \ unique entity on a separate label line, avoiding grouped entities (e.g., avoid\ + \ LOC: entity, entity) or irrelevant entries like none. Return only the output.\ + \ \n\nText: {{text}}" +include: masakhaner +task: masakhaner_xh_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_yo.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_yo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a0c327bd53c90651e9c7e6b699d3fb9fb52748a1 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_4/masakhaner_yo.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: yo +doc_to_text: "As a Yoruba linguist, label all named entities in the Yoruba text below\ + \ with the categories: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output\ + \ strictly follows the format: label: entity $$ label: entity, with each unique\ + \ entity on a separate label line, avoiding grouped entities (e.g., avoid LOC: entity,\ + \ entity) or irrelevant entries like none. Return only the output. 
import collections
import re


def doc_to_target(doc):
    """Build the gold target string for a doc from its "word: tag" NER lines."""
    return transform_text(doc["ner_tags"])


def transform_text(text):
    """Convert newline-separated "word: BIO-tag" pairs into span format.

    Input lines look like "Lagos: B-LOC"; a B- token followed by matching
    I- tokens is merged into one entity.  The result joins the spans as
    "TAG: entity $$ TAG: entity", matching the prompt's requested format.

    Args:
        text: newline-separated "word: tag" string.

    Returns:
        The " $$ "-joined "TAG: entity" spans (empty string if none).
    """
    entities = []
    current_entity = ""
    current_tag = ""

    for pair in text.split("\n"):
        line = pair.strip()
        # Skip blank or malformed lines instead of raising ValueError.
        if not line or ": " not in line:
            continue
        # rsplit keeps any ": " inside the token with the word, not the tag.
        word, tag = line.rsplit(": ", 1)
        tag = tag.upper()
        word = word.lower().strip(",.").strip()

        if tag.startswith("B-"):
            # A new entity begins; flush any entity under construction.
            if current_entity:
                entities.append(f"{current_tag}: {current_entity}")
            current_tag = tag.split("-")[1]
            current_entity = word
        elif tag.startswith("I-") and tag.split("-")[1] == current_tag:
            # Continuation token: join with a space so multi-word entities
            # read "john doe", not "johndoe" (bug fix — the concatenation
            # previously dropped the separator and broke span matching).
            current_entity += " " + word
        else:
            # O tag (or mismatched I- tag) terminates the current entity.
            if current_entity:
                entities.append(f"{current_tag}: {current_entity}")
            current_entity = ""
            current_tag = ""
    if current_entity:
        entities.append(f"{current_tag}: {current_entity}")

    # Join all the transformed output lines with $$ as separator.
    return " $$ ".join(entities)


def span_f1_agg(items):
    """Compute span-based F1 score (based on the CoNLL script).

    Adapted from
    https://github.com/google-research/multilingual-t5/blob/master/multilingual_t5/evaluation/metrics.py

    Args:
        items: iterable of (target, prediction) pairs, each a string listing
            spans as "TAG: entity $$ TAG: entity".

    Returns:
        Micro-averaged span F1 across all targets and predictions.
    """
    unzipped_list = list(zip(*items))
    targets = unzipped_list[0]
    predictions = unzipped_list[1]

    true_positives = collections.defaultdict(int)
    false_positives = collections.defaultdict(int)
    false_negatives = collections.defaultdict(int)

    def normalize_text(strings):
        # Characters stripped from tags/entities before comparison.  The set
        # is inserted with re.escape so "-" and the curly quotes are treated
        # literally; the previous unescaped class could raise
        # "re.error: bad character range" (descending range U+201D to ",").
        my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@.“”-,`'
        strings = re.sub("[" + re.escape(my_punctuation) + "]+", " ", str(strings))
        # Drop English articles; they carry no signal for span matching.
        strings = re.sub(r"\b(a|an|the)\b", " ", strings, flags=re.UNICODE)
        # Collapse whitespace (raw strings avoid invalid-escape warnings
        # for "\s" on Python 3.12+).
        strings = re.sub(r"\s{3,}|\t", "", strings)
        strings = re.sub(r"\s+", " ", strings)
        return strings.lower()

    def tags_to_spans(tag_sequence, delimiter="$$"):
        """Extract (tag, entity) pairs from a "TAG: entity $$ ..." string."""
        if isinstance(tag_sequence, list):
            tag_sequence = " ".join(i.strip() for i in tag_sequence)
        tag_sequence_split = [
            item.strip()
            for sub in tag_sequence.strip().split(delimiter)
            for item in sub.split("$")
            if item
        ]
        # Also split outputs that used ". " or ", " as span separators.
        tag_sequence_split = [
            item.strip()
            for value in tag_sequence_split
            for sub in value.split(". ")
            for item in sub.split(", ")
        ]
        tags_entities = []
        for tag_entity in tag_sequence_split:
            tag_entity_split = tag_entity.split(": ")
            if len(tag_entity_split) != 2:
                continue  # not a well-formed "TAG: entity" chunk
            tag = normalize_text(tag_entity_split[0].strip())
            entity = normalize_text(tag_entity_split[1].strip())
            tags_entities.append((tag, entity))
        return tags_entities

    def compute_f1_metrics(true_positive, false_positive, false_negative):
        # The 1e-13 terms guard against division by zero on empty counts.
        precision = float(true_positive) / float(true_positive + false_positive + 1e-13)
        recall = float(true_positive) / float(true_positive + false_negative + 1e-13)
        f1_measures = 2.0 * ((precision * recall) / (precision + recall + 1e-13))
        return precision, recall, f1_measures

    for target, pred in zip(targets, predictions):
        gold_spans = tags_to_spans(target)
        predicted_spans = tags_to_spans(pred)

        for span in predicted_spans:
            if span in gold_spans:
                true_positives[span[0]] += 1
                gold_spans.remove(span)
            else:
                false_positives[span[0]] += 1
        # Remaining gold spans were not predicted at all.
        for span in gold_spans:
            false_negatives[span[0]] += 1

    _, _, f1_measure = compute_f1_metrics(
        sum(true_positives.values()),
        sum(false_positives.values()),
        sum(false_negatives.values()),
    )
    return f1_measure
\n\nText: {{text}}" +include: masakhaner +task: masakhaner_am_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_bbj.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_bbj.yaml new file mode 100644 index 0000000000000000000000000000000000000000..74726694ef67fd5f572c1a58b0b637cf410a9997 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_bbj.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: bbj +doc_to_text: "Provide a concise list of named entities in the text below. Use the\ + \ following labels: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output\ + \ strictly follows the format: label: entity $$ label: entity, with each unique\ + \ entity on a separate label line, avoiding grouped entities (e.g., avoid LOC: entity,\ + \ entity) or irrelevant entries like none. Return only the output. \n\nText: {{text}}" +include: masakhaner +task: masakhaner_bbj_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_bm.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_bm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c97e0c22a609ceb20a40b5185a58baad850e4e81 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_bm.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: bm +doc_to_text: "Provide a concise list of named entities in the text below. Use the\ + \ following labels: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output\ + \ strictly follows the format: label: entity $$ label: entity, with each unique\ + \ entity on a separate label line, avoiding grouped entities (e.g., avoid LOC: entity,\ + \ entity) or irrelevant entries like none. Return only the output. 
\n\nText: {{text}}" +include: masakhaner +task: masakhaner_bm_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_ee.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_ee.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6371649d2db361bfda7fabe8ed353ca529324375 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_ee.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: ee +doc_to_text: "Provide a concise list of named entities in the text below. Use the\ + \ following labels: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output\ + \ strictly follows the format: label: entity $$ label: entity, with each unique\ + \ entity on a separate label line, avoiding grouped entities (e.g., avoid LOC: entity,\ + \ entity) or irrelevant entries like none. Return only the output. \n\nText: {{text}}" +include: masakhaner +task: masakhaner_ee_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_ha.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_ha.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1d68c7eed339d51ba9d72863624a2ee9842be64e --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_ha.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: ha +doc_to_text: "Provide a concise list of named entities in the text below. Use the\ + \ following labels: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output\ + \ strictly follows the format: label: entity $$ label: entity, with each unique\ + \ entity on a separate label line, avoiding grouped entities (e.g., avoid LOC: entity,\ + \ entity) or irrelevant entries like none. Return only the output. 
\n\nText: {{text}}" +include: masakhaner +task: masakhaner_ha_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_ig.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_ig.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3b8a429593210e2db50535011174cdbff26ad9c2 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_ig.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: ig +doc_to_text: "Provide a concise list of named entities in the text below. Use the\ + \ following labels: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output\ + \ strictly follows the format: label: entity $$ label: entity, with each unique\ + \ entity on a separate label line, avoiding grouped entities (e.g., avoid LOC: entity,\ + \ entity) or irrelevant entries like none. Return only the output. \n\nText: {{text}}" +include: masakhaner +task: masakhaner_ig_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_lg.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_lg.yaml new file mode 100644 index 0000000000000000000000000000000000000000..84bdc8b9af8e76d09da546db020390531450ed85 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_lg.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: lg +doc_to_text: "Provide a concise list of named entities in the text below. Use the\ + \ following labels: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output\ + \ strictly follows the format: label: entity $$ label: entity, with each unique\ + \ entity on a separate label line, avoiding grouped entities (e.g., avoid LOC: entity,\ + \ entity) or irrelevant entries like none. Return only the output. 
\n\nText: {{text}}" +include: masakhaner +task: masakhaner_lg_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_luo.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_luo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..55a0b5744cdaa4cf894ac882642978f66c7dbe5f --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_luo.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: luo +doc_to_text: "Provide a concise list of named entities in the text below. Use the\ + \ following labels: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output\ + \ strictly follows the format: label: entity $$ label: entity, with each unique\ + \ entity on a separate label line, avoiding grouped entities (e.g., avoid LOC: entity,\ + \ entity) or irrelevant entries like none. Return only the output. \n\nText: {{text}}" +include: masakhaner +task: masakhaner_luo_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_mos.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_mos.yaml new file mode 100644 index 0000000000000000000000000000000000000000..06bcc4467d43d8559b8e9b2cd4b0a89fa6b3fa40 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_mos.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: mos +doc_to_text: "Provide a concise list of named entities in the text below. Use the\ + \ following labels: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output\ + \ strictly follows the format: label: entity $$ label: entity, with each unique\ + \ entity on a separate label line, avoiding grouped entities (e.g., avoid LOC: entity,\ + \ entity) or irrelevant entries like none. Return only the output. 
\n\nText: {{text}}" +include: masakhaner +task: masakhaner_mos_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_ny.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_ny.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e400f10e186f0e4ae6119fa622065a23da73c680 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_ny.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: ny +doc_to_text: "Provide a concise list of named entities in the text below. Use the\ + \ following labels: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output\ + \ strictly follows the format: label: entity $$ label: entity, with each unique\ + \ entity on a separate label line, avoiding grouped entities (e.g., avoid LOC: entity,\ + \ entity) or irrelevant entries like none. Return only the output. \n\nText: {{text}}" +include: masakhaner +task: masakhaner_ny_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_pcm.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_pcm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b9d897bc01a217c01d9fbffdf496d060ec54b434 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_pcm.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: pcm +doc_to_text: "Provide a concise list of named entities in the text below. Use the\ + \ following labels: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output\ + \ strictly follows the format: label: entity $$ label: entity, with each unique\ + \ entity on a separate label line, avoiding grouped entities (e.g., avoid LOC: entity,\ + \ entity) or irrelevant entries like none. Return only the output. 
\n\nText: {{text}}" +include: masakhaner +task: masakhaner_pcm_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_rw.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_rw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0742bc4dd567005a51861bab9b6208d64c66166f --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_rw.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: rw +doc_to_text: "Provide a concise list of named entities in the text below. Use the\ + \ following labels: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output\ + \ strictly follows the format: label: entity $$ label: entity, with each unique\ + \ entity on a separate label line, avoiding grouped entities (e.g., avoid LOC: entity,\ + \ entity) or irrelevant entries like none. Return only the output. \n\nText: {{text}}" +include: masakhaner +task: masakhaner_rw_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_sn.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_sn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..56711335c8c11879338fbc3b245bc49d0702733e --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_sn.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: sn +doc_to_text: "Provide a concise list of named entities in the text below. Use the\ + \ following labels: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output\ + \ strictly follows the format: label: entity $$ label: entity, with each unique\ + \ entity on a separate label line, avoiding grouped entities (e.g., avoid LOC: entity,\ + \ entity) or irrelevant entries like none. Return only the output. 
\n\nText: {{text}}" +include: masakhaner +task: masakhaner_sn_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_sw.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_sw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c418beb45612e31ff1738909d5d1c181bfbab079 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_sw.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: sw +doc_to_text: "Provide a concise list of named entities in the text below. Use the\ + \ following labels: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output\ + \ strictly follows the format: label: entity $$ label: entity, with each unique\ + \ entity on a separate label line, avoiding grouped entities (e.g., avoid LOC: entity,\ + \ entity) or irrelevant entries like none. Return only the output. \n\nText: {{text}}" +include: masakhaner +task: masakhaner_sw_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_tn.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_tn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bf94a1081352827a7fe09eb913bfabd9e5f0c576 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_tn.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: tn +doc_to_text: "Provide a concise list of named entities in the text below. Use the\ + \ following labels: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output\ + \ strictly follows the format: label: entity $$ label: entity, with each unique\ + \ entity on a separate label line, avoiding grouped entities (e.g., avoid LOC: entity,\ + \ entity) or irrelevant entries like none. Return only the output. 
\n\nText: {{text}}" +include: masakhaner +task: masakhaner_tn_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_tw.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_tw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cad2e2e3e64819dc6ab3151929a6ec76bb868821 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_tw.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: tw +doc_to_text: "Provide a concise list of named entities in the text below. Use the\ + \ following labels: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output\ + \ strictly follows the format: label: entity $$ label: entity, with each unique\ + \ entity on a separate label line, avoiding grouped entities (e.g., avoid LOC: entity,\ + \ entity) or irrelevant entries like none. Return only the output. \n\nText: {{text}}" +include: masakhaner +task: masakhaner_tw_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_wo.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_wo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ec7af039234047cda3500be81b361bae294bace2 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_wo.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: wo +doc_to_text: "Provide a concise list of named entities in the text below. Use the\ + \ following labels: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output\ + \ strictly follows the format: label: entity $$ label: entity, with each unique\ + \ entity on a separate label line, avoiding grouped entities (e.g., avoid LOC: entity,\ + \ entity) or irrelevant entries like none. Return only the output. 
\n\nText: {{text}}" +include: masakhaner +task: masakhaner_wo_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_xh.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_xh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..debb164aef161a178ca53046e9b674a677d5fc08 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_xh.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: xh +doc_to_text: "Provide a concise list of named entities in the text below. Use the\ + \ following labels: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output\ + \ strictly follows the format: label: entity $$ label: entity, with each unique\ + \ entity on a separate label line, avoiding grouped entities (e.g., avoid LOC: entity,\ + \ entity) or irrelevant entries like none. Return only the output. \n\nText: {{text}}" +include: masakhaner +task: masakhaner_xh_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_yo.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_yo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9abe1acbcb473c1bd091277c8a1913c792fdd0a3 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_yo.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: yo +doc_to_text: "Provide a concise list of named entities in the text below. Use the\ + \ following labels: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output\ + \ strictly follows the format: label: entity $$ label: entity, with each unique\ + \ entity on a separate label line, avoiding grouped entities (e.g., avoid LOC: entity,\ + \ entity) or irrelevant entries like none. Return only the output. 
\n\nText: {{text}}" +include: masakhaner +task: masakhaner_yo_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_zu.yaml b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_zu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c5af591aa464ef88ff6a4d4b62e55c504cb777c4 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_5/masakhaner_zu.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: zu +doc_to_text: "Provide a concise list of named entities in the text below. Use the\ + \ following labels: PERSON, LOCATION, ORGANIZATION, and DATE. Ensure the output\ + \ strictly follows the format: label: entity $$ label: entity, with each unique\ + \ entity on a separate label line, avoiding grouped entities (e.g., avoid LOC: entity,\ + \ entity) or irrelevant entries like none. Return only the output. \n\nText: {{text}}" +include: masakhaner +task: masakhaner_zu_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhaner/prompt_5/utils.py b/lm_eval/tasks/afrobench/masakhaner/prompt_5/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..76909044e7f35948156f8bb506ce2fce563ec689 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhaner/prompt_5/utils.py @@ -0,0 +1,146 @@ +import collections +import re + +from lm_eval.utils import weighted_f1_score + + +def doc_to_target(doc): + return transform_text(doc["ner_tags"]) + + +def transform_text(text): + entities = [] + current_entity = "" + current_tag = "" + + for pair in text.split("\n"): + if pair: # Check if the line is not empty + word, tag = pair.strip().split(": ") + tag = tag.upper() + word = word.lower() + word = word.strip(",.").strip() + + if tag.startswith("B-"): + if current_entity: + entities.append(f"{current_tag}: {current_entity}") + current_tag = tag.split("-")[1] + current_entity = word + elif tag.startswith("I-") and tag.split("-")[1] == current_tag: + current_entity += word + else: + if current_entity: + 
entities.append(f"{current_tag}: {current_entity}") + current_entity = "" + current_tag = "" + if current_entity: + entities.append(f"{current_tag}: {current_entity}") + + # Join all the transformed output lines with $$ as separator + return " $$ ".join(entities) + + +def span_f1_agg(items): + """Computes Span based F1 score. + + This function is copied from + https://github.com/google-research/multilingual-t5/blob/master/multilingual_t5/evaluation/metrics.py + + Args: + targets: list of strings or list of list of strings if multiple references + are present. + predictions: list of strings + + Returns: + span f1 across all targets and predictions (Based on CoNLL script) + """ + unzipped_list = list(zip(*items)) + targets = unzipped_list[0] + predictions = unzipped_list[1] + + true_positives = collections.defaultdict(int) + false_positives = collections.defaultdict(int) + false_negatives = collections.defaultdict(int) + + def normalize_text(strings): + def get_blank_spaces_pattern(): + return re.compile(r"\s{3,}|\t") + + def remove_blank_spaces(text): + text = re.sub(pattern=get_blank_spaces_pattern(), repl="", string=text) + text = re.sub("\s+", " ", text) + return text + + def remove_punctuation(text): + my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@.""-,`' + text = re.sub( + "[" + my_punctuation + "]+", " ", str(text) + ) # strip punctuation + return text + + def remove_articles(text): + regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) + return re.sub(regex, " ", text) + + def lowercase(text): + text = text.lower() + return text + + strings = remove_punctuation(strings) + strings = remove_articles(strings) + strings = remove_blank_spaces(strings) + strings = lowercase(strings) + + return strings + + def tags_to_spans(tag_sequence, delimiter="$$"): + """Extract spans from IOB1 or BIO tags.""" + if isinstance(tag_sequence, list): + tag_sequence = " ".join(i.strip() for i in tag_sequence) + tag_sequence_split = [ + item.strip() + for sub in 
tag_sequence.strip().split(delimiter) + for item in sub.split("$") + if item + ] + tag_sequence_split = [ + item.strip() + for value in tag_sequence_split + for sub in value.split(". ") + for item in sub.split(", ") + ] + tags_entities = [] + for tag_entity in tag_sequence_split: + tag_entity_split = tag_entity.split(": ") + if len(tag_entity_split) != 2: + continue + tag = normalize_text(tag_entity_split[0].strip()) + entity = normalize_text(tag_entity_split[1].rstrip().lstrip()) + tags_entities.append((tag, entity)) + return tags_entities + + def compute_f1_metrics(true_positive, false_positive, false_negative): + precision = float(true_positive) / float(true_positive + false_positive + 1e-13) + recall = float(true_positive) / float(true_positive + false_negative + 1e-13) + f1_measures = 2.0 * ((precision * recall) / (precision + recall + 1e-13)) + return precision, recall, f1_measures + + for target, pred in zip(targets, predictions): + gold_spans = tags_to_spans(target) + predicted_spans = tags_to_spans(pred) + + for span in predicted_spans: + if span in gold_spans: + true_positives[span[0]] += 1 + gold_spans.remove(span) + else: + false_positives[span[0]] += 1 + # These spans weren't predicted. 
+ for span in gold_spans: + false_negatives[span[0]] += 1 + + _, _, f1_measure = compute_f1_metrics( + sum(true_positives.values()), + sum(false_positives.values()), + sum(false_negatives.values()), + ) + return f1_measure diff --git a/lm_eval/tasks/afrobench/masakhanews/README.md b/lm_eval/tasks/afrobench/masakhanews/README.md new file mode 100644 index 0000000000000000000000000000000000000000..16df2df1d62f2d83d6d34e22373d6680a246eaa8 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/README.md @@ -0,0 +1,99 @@ +# + +## Paper +Title: `MasakhaNEWS: News Topic Classification for African languages` + +Paper Link: https://aclanthology.org/2023.ijcnlp-main.10/ + +## Abstract +>African languages are severely under-represented in NLP research due to lack of datasets covering several NLP tasks. While there are individual language specific datasets that are being expanded to different tasks, only a handful of NLP tasks (e.g. named entity recognition and machine translation) have standardized benchmark datasets covering several geographical and typologically-diverse African languages. In this paper, we develop MasakhaNEWS -- a new benchmark dataset for news topic classification covering 16 languages widely spoken in Africa. We provide an evaluation of baseline models by training classical machine learning models and fine-tuning several language models. Furthermore, we explore several alternatives to full fine-tuning of language models that are better suited for zero-shot and few-shot learning such as cross-lingual parameter-efficient fine-tuning (like MAD-X), pattern exploiting training (PET), prompting language models (like ChatGPT), and prompt-free sentence transformer fine-tuning (SetFit and Cohere Embedding API). Our evaluation in zero-shot setting shows the potential of prompting ChatGPT for news topic classification in low-resource African languages, achieving an average performance of 70 F1 points without leveraging additional supervision like MAD-X. 
In few-shot setting, we show that with as little as 10 examples per label, we achieved more than 90% (i.e. 86.0 F1 points) of the performance of full supervised training (92.6 F1 points) leveraging the PET approach. + +HomePage: https://github.com/masakhane-io/masakhane-news + +### Citation + +``` +@inproceedings{adelani-etal-2023-masakhanews, + title = "{M}asakha{NEWS}: News Topic Classification for {A}frican languages", + author = "Adelani, David Ifeoluwa and + Masiak, Marek and + Azime, Israel Abebe and + Alabi, Jesujoba and + Tonja, Atnafu Lambebo and + Mwase, Christine and + Ogundepo, Odunayo and + Dossou, Bonaventure F. P. and + Oladipo, Akintunde and + Nixdorf, Doreen and + Emezue, Chris Chinenye and + Al-azzawi, Sana and + Sibanda, Blessing and + David, Davis and + Ndolela, Lolwethu and + Mukiibi, Jonathan and + Ajayi, Tunde and + Moteu, Tatiana and + Odhiambo, Brian and + Owodunni, Abraham and + Obiefuna, Nnaemeka and + Mohamed, Muhidin and + Muhammad, Shamsuddeen Hassan and + Ababu, Teshome Mulugeta and + Salahudeen, Saheed Abdullahi and + Yigezu, Mesay Gemeda and + Gwadabe, Tajuddeen and + Abdulmumin, Idris and + Taye, Mahlet and + Awoyomi, Oluwabusayo and + Shode, Iyanuoluwa and + Adelani, Tolulope and + Abdulganiyu, Habiba and + Omotayo, Abdul-Hakeem and + Adeeko, Adetola and + Afolabi, Abeeb and + Aremu, Anuoluwapo and + Samuel, Olanrewaju and + Siro, Clemencia and + Kimotho, Wangari and + Ogbu, Onyekachi and + Mbonu, Chinedu and + Chukwuneke, Chiamaka and + Fanijo, Samuel and + Ojo, Jessica and + Awosan, Oyinkansola and + Kebede, Tadesse and + Sakayo, Toadoum Sari and + Nyatsine, Pamela and + Sidume, Freedmore and + Yousuf, Oreen and + Oduwole, Mardiyyah and + Tshinu, Kanda and + Kimanuka, Ussen and + Diko, Thina and + Nxakama, Siyanda and + Nigusse, Sinodos and + Johar, Abdulmejid and + Mohamed, Shafie and + Hassan, Fuad Mire and + Mehamed, Moges Ahmed and + Ngabire, Evrard and + Jules, Jules and + Ssenkungu, Ivan and + Stenetorp, Pontus", + editor 
= "Park, Jong C. and + Arase, Yuki and + Hu, Baotian and + Lu, Wei and + Wijaya, Derry and + Purwarianti, Ayu and + Krisnadhi, Adila Alfa", + booktitle = "Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)", + month = nov, + year = "2023", + address = "Nusa Dua, Bali", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2023.ijcnlp-main.10/", + doi = "10.18653/v1/2023.ijcnlp-main.10", + pages = "144--159" +} +``` diff --git a/lm_eval/tasks/afrobench/masakhanews/masakhanews.yaml b/lm_eval/tasks/afrobench/masakhanews/masakhanews.yaml new file mode 100644 index 0000000000000000000000000000000000000000..93b6f29d8cdc05cf0904e4a8fe9afdd35d111c88 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/masakhanews.yaml @@ -0,0 +1,13 @@ +group: masakhanews +task: + - masakhanews_prompt_1 + - masakhanews_prompt_2 + - masakhanews_prompt_3 + - masakhanews_prompt_4 + - masakhanews_prompt_5 +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 1 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews new file mode 100644 index 0000000000000000000000000000000000000000..282a38422e526b9f8ce8731f950ae04e2a6cbf08 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews @@ -0,0 +1,43 @@ +tag: +- masakhanews_tasks +- masakhanews_prompt_1 +- afrobench_TC_tasks +dataset_path: masakhane/masakhanews +dataset_name: null +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_target: label +doc_to_choice: + - "business" + - "entertainment" + - "health" + - "politics" + - "religion" + - "sports" + - "technology" +should_decontaminate: true +doc_to_decontamination_query: headline_text +metric_list: + 
- metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_amh.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d45b784facda2e60335d1980b9a1038c5cb91ec0 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_amh.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: "Given the categories technology, business, politics, sports, health,\ + \ entertainment, or religion; what category does the text: '{{headline_text}}' belong\ + \ to: \n\n" +include: masakhanews +task: masakhanews_amh_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_eng.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..40685c17d8616dafb3797ab84153f77242e6a364 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_eng.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng +doc_to_text: "Given the categories technology, business, politics, sports, health,\ + \ entertainment, or religion; what category does the text: '{{headline_text}}' belong\ + \ to: \n\n" +include: masakhanews +task: masakhanews_eng_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_fra.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2371172156b0cfb8532d93366518f2faf8793aed --- /dev/null +++ 
b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_fra.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: fra +doc_to_text: "Given the categories technology, business, politics, sports, health,\ + \ entertainment, or religion; what category does the text: '{{headline_text}}' belong\ + \ to: \n\n" +include: masakhanews +task: masakhanews_fra_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_hau.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c7288982d35eaaad09407c59b6b6d02af3fb637a --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_hau.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: "Given the categories technology, business, politics, sports, health,\ + \ entertainment, or religion; what category does the text: '{{headline_text}}' belong\ + \ to: \n\n" +include: masakhanews +task: masakhanews_hau_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_ibo.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4bf65cca861c670be75aa9b63a914590d4d996c4 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_ibo.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: "Given the categories technology, business, politics, sports, health,\ + \ entertainment, or religion; what category does the text: '{{headline_text}}' belong\ + \ to: \n\n" +include: masakhanews +task: masakhanews_ibo_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_lin.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c6cdbe8de5ef1593bd2f98d75ec4ecca3fc33084 --- /dev/null +++ 
b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_lin.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lin +doc_to_text: "Given the categories technology, business, politics, sports, health,\ + \ entertainment, or religion; what category does the text: '{{headline_text}}' belong\ + \ to: \n\n" +include: masakhanews +task: masakhanews_lin_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_lug.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e2f0ec1bada04bf7c72649621330e828bd449951 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_lug.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: "Given the categories technology, business, politics, sports, health,\ + \ entertainment, or religion; what category does the text: '{{headline_text}}' belong\ + \ to: \n\n" +include: masakhanews +task: masakhanews_lug_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_orm.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a9bff1ac5113f06c5d035e46c0e139fbeb0a8d28 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_orm.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: "Given the categories technology, business, politics, sports, health,\ + \ entertainment, or religion; what category does the text: '{{headline_text}}' belong\ + \ to: \n\n" +include: masakhanews +task: masakhanews_orm_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_pcm.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_pcm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..119b01bb158d15d307949226dc71a055d169e4fa --- /dev/null +++ 
b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_pcm.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: pcm +doc_to_text: "Given the categories technology, business, politics, sports, health,\ + \ entertainment, or religion; what category does the text: '{{headline_text}}' belong\ + \ to: \n\n" +include: masakhanews +task: masakhanews_pcm_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_run.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_run.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d8bc2923fa069f71d5bb00d4f272c235d6e7f0c6 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_run.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: run +doc_to_text: "Given the categories technology, business, politics, sports, health,\ + \ entertainment, or religion; what category does the text: '{{headline_text}}' belong\ + \ to: \n\n" +include: masakhanews +task: masakhanews_run_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_sna.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ee4fabdc96a680e0482040f00daca794eaf87dfe --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_sna.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: "Given the categories technology, business, politics, sports, health,\ + \ entertainment, or religion; what category does the text: '{{headline_text}}' belong\ + \ to: \n\n" +include: masakhanews +task: masakhanews_sna_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_som.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_som.yaml new file mode 100644 index 0000000000000000000000000000000000000000..88d7774c1b4c1c08367d8ebc79bc1ca7214cab9f --- /dev/null +++ 
b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_som.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: som +doc_to_text: "Given the categories technology, business, politics, sports, health,\ + \ entertainment, or religion; what category does the text: '{{headline_text}}' belong\ + \ to: \n\n" +include: masakhanews +task: masakhanews_som_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_swa.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c4e02aae282bdba08b8f057c71ab580a7ee9c031 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_swa.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: "Given the categories technology, business, politics, sports, health,\ + \ entertainment, or religion; what category does the text: '{{headline_text}}' belong\ + \ to: \n\n" +include: masakhanews +task: masakhanews_swa_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_tir.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_tir.yaml new file mode 100644 index 0000000000000000000000000000000000000000..72fa30ae7012379011b04a1fb3255fcad2a9a4e7 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_tir.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tir +doc_to_text: "Given the categories technology, business, politics, sports, health,\ + \ entertainment, or religion; what category does the text: '{{headline_text}}' belong\ + \ to: \n\n" +include: masakhanews +task: masakhanews_tir_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_xho.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1d98b3b681de265b99207db0d3d2347d952d05a0 --- /dev/null +++ 
b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_xho.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: "Given the categories technology, business, politics, sports, health,\ + \ entertainment, or religion; what category does the text: '{{headline_text}}' belong\ + \ to: \n\n" +include: masakhanews +task: masakhanews_xho_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_yor.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3ef4eec0e46425b9b02771ee6a360f3498cc0a1a --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_1/masakhanews_yor.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: "Given the categories technology, business, politics, sports, health,\ + \ entertainment, or religion; what category does the text: '{{headline_text}}' belong\ + \ to: \n\n" +include: masakhanews +task: masakhanews_yor_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_1/utils.py b/lm_eval/tasks/afrobench/masakhanews/prompt_1/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_1/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews new file mode 100644 index 0000000000000000000000000000000000000000..c174d2c7ff991b749881260b1ccb93d63a5e9f26 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews @@ -0,0 +1,43 @@ +tag: +- masakhanews_tasks +- masakhanews_prompt_2 +- afrobench_TC_tasks +dataset_path: masakhane/masakhanews +dataset_name: null +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_target: label +doc_to_choice: + - 
"business" + - "entertainment" + - "health" + - "politics" + - "religion" + - "sports" + - "technology" +should_decontaminate: true +doc_to_decontamination_query: headline_text +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_amh.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cee7619cfb4c215a9e05551b668d2ea4e9e517ca --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_amh.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: 'Does this Amharic topic; ''{{headline_text}}'' belong to one of the + following categories: technology, business, politics, sports, health, entertainment, + or religion? category only + + + ' +include: masakhanews +task: masakhanews_amh_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_eng.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d3d6dd16c461eb7df5b730aa3283f6695f130503 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_eng.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: eng +doc_to_text: 'Does this English topic; ''{{headline_text}}'' belong to one of the + following categories: technology, business, politics, sports, health, entertainment, + or religion? 
category only + + + ' +include: masakhanews +task: masakhanews_eng_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_fra.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c35a6a1d34753893185c2d0fcd5dd82e73853e35 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_fra.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: fra +doc_to_text: 'Does this French topic; ''{{headline_text}}'' belong to one of the following + categories: technology, business, politics, sports, health, entertainment, or religion? + category only + + + ' +include: masakhanews +task: masakhanews_fra_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_hau.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..93e9cc5a7f3e53fd8fc97ee1ecf8bfbb85911939 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_hau.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: 'Does this Hausa topic; ''{{headline_text}}'' belong to one of the following + categories: technology, business, politics, sports, health, entertainment, or religion? + category only + + + ' +include: masakhanews +task: masakhanews_hau_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_ibo.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1638e435c76f6fdd671a6bf165d81debc4be3b3a --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_ibo.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: 'Does this Igbo topic; ''{{headline_text}}'' belong to one of the following + categories: technology, business, politics, sports, health, entertainment, or religion? 
+ category only + + + ' +include: masakhanews +task: masakhanews_ibo_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_lin.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0010d0e1ad36356f8c5b9bccccffc62f46730d93 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_lin.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: lin +doc_to_text: 'Does this Lingala topic; ''{{headline_text}}'' belong to one of the + following categories: technology, business, politics, sports, health, entertainment, + or religion? category only + + + ' +include: masakhanews +task: masakhanews_lin_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_lug.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d526067289d1145ed494418408b0de55d51a74ca --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_lug.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: 'Does this Luganda topic; ''{{headline_text}}'' belong to one of the + following categories: technology, business, politics, sports, health, entertainment, + or religion? 
category only + + + ' +include: masakhanews +task: masakhanews_lug_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_orm.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cd04c845d4eb1d46dd48052f0a582d7557e610a8 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_orm.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: 'Does this Afaan Oromoo topic; ''{{headline_text}}'' belong to one of + the following categories: technology, business, politics, sports, health, entertainment, + or religion? category only + + + ' +include: masakhanews +task: masakhanews_orm_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_pcm.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_pcm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..de685e3ac8a62a2e5ca5b3d0b16119a77caa1994 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_pcm.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: pcm +doc_to_text: 'Does this Nigerian Pidgin topic; ''{{headline_text}}'' belong to one + of the following categories: technology, business, politics, sports, health, entertainment, + or religion? 
category only + + + ' +include: masakhanews +task: masakhanews_pcm_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_run.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_run.yaml new file mode 100644 index 0000000000000000000000000000000000000000..62236d590bfc009043dd8dca6ab3c43edcaff995 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_run.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: run +doc_to_text: 'Does this Kirundi topic; ''{{headline_text}}'' belong to one of the + following categories: technology, business, politics, sports, health, entertainment, + or religion? category only + + + ' +include: masakhanews +task: masakhanews_run_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_sna.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a97e176b865335804af5efe9e911894acbbcc78 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_sna.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: 'Does this Shona topic; ''{{headline_text}}'' belong to one of the following + categories: technology, business, politics, sports, health, entertainment, or religion? + category only + + + ' +include: masakhanews +task: masakhanews_sna_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_som.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_som.yaml new file mode 100644 index 0000000000000000000000000000000000000000..318b9b87beabeef3837c826cbf128b3b8d1b4e8d --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_som.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: som +doc_to_text: 'Does this Somali topic; ''{{headline_text}}'' belong to one of the following + categories: technology, business, politics, sports, health, entertainment, or religion? 
+ category only + + + ' +include: masakhanews +task: masakhanews_som_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_swa.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..75b9345f3229c6e105f4cd99e65a970c1061b7d4 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_swa.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: 'Does this Swahili topic; ''{{headline_text}}'' belong to one of the + following categories: technology, business, politics, sports, health, entertainment, + or religion? category only + + + ' +include: masakhanews +task: masakhanews_swa_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_tir.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_tir.yaml new file mode 100644 index 0000000000000000000000000000000000000000..258a2bd3d7431083088e09f74044ce518cbaa7b1 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_tir.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: tir +doc_to_text: 'Does this Tigrinya topic; ''{{headline_text}}'' belong to one of the + following categories: technology, business, politics, sports, health, entertainment, + or religion? category only + + + ' +include: masakhanews +task: masakhanews_tir_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_xho.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..30c4c3ac3abdb3b239406a2d36efc9331b1597d7 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_xho.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: 'Does this Xhosa topic; ''{{headline_text}}'' belong to one of the following + categories: technology, business, politics, sports, health, entertainment, or religion? 
+ category only + + + ' +include: masakhanews +task: masakhanews_xho_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_yor.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..067cf10632de430b650053d5537a733052e34b09 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_2/masakhanews_yor.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: 'Does this Yoruba topic; ''{{headline_text}}'' belong to one of the following + categories: technology, business, politics, sports, health, entertainment, or religion? + category only + + + ' +include: masakhanews +task: masakhanews_yor_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_2/utils.py b/lm_eval/tasks/afrobench/masakhanews/prompt_2/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_2/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews new file mode 100644 index 0000000000000000000000000000000000000000..ecc2108967078bb24a1efd15acdd8387d47e173c --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews @@ -0,0 +1,43 @@ +tag: +- masakhanews_tasks +- masakhanews_prompt_3 +- afrobench_TC_tasks +dataset_path: masakhane/masakhanews +dataset_name: null +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_target: label +doc_to_choice: + - "business" + - "entertainment" + - "health" + - "politics" + - "religion" + - "sports" + - "technology" +should_decontaminate: true +doc_to_decontamination_query: headline_text +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + 
average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_amh.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dec10d2963dd00fce7e8dbd7f24f8a61a178e0a7 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_amh.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories technology, religion, politics, sports, health, entertainment, or business;\ + \ what is the topic of the Amharic statement below? Return only the category. \n\ + \ntext: {{headline_text}} \\category:\n\n" +include: masakhanews +task: masakhanews_amh_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_eng.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a8b7159e215dd6bc5a766d51b06f77289e4ce1a7 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_eng.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: eng +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories technology, religion, politics, sports, health, entertainment, or business;\ + \ what is the topic of the English statement below? Return only the category. 
\n\ + \ntext: {{headline_text}} \\category:\n\n" +include: masakhanews +task: masakhanews_eng_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_fra.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..328316a8361d29a4db6ab882b46944fc65b2ff9b --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_fra.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: fra +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories technology, religion, politics, sports, health, entertainment, or business;\ + \ what is the topic of the French statement below? Return only the category. \n\n\ + text: {{headline_text}} \\category:\n\n" +include: masakhanews +task: masakhanews_fra_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_hau.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c18ff2779cc9a9d149afe1eb7c438e3d18e8af2c --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_hau.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories technology, religion, politics, sports, health, entertainment, or business;\ + \ what is the topic of the Hausa statement below? Return only the category. 
\n\n\ + text: {{headline_text}} \\category:\n\n" +include: masakhanews +task: masakhanews_hau_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_ibo.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3a91db840f2b72f041cffb827ea87520e28434cb --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_ibo.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories technology, religion, politics, sports, health, entertainment, or business;\ + \ what is the topic of the Igbo statement below? Return only the category. \n\n\ + text: {{headline_text}} \\category:\n\n" +include: masakhanews +task: masakhanews_ibo_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_lin.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..19c4cca2e5f65851b6c44a1baa6dd2842ce3bd5f --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_lin.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: lin +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories technology, religion, politics, sports, health, entertainment, or business;\ + \ what is the topic of the Lingala statement below? Return only the category. 
\n\ + \ntext: {{headline_text}} \\category:\n\n" +include: masakhanews +task: masakhanews_lin_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_lug.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9e3d4319fc82762f63292da9f916166132e53a42 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_lug.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories technology, religion, politics, sports, health, entertainment, or business;\ + \ what is the topic of the Luganda statement below? Return only the category. \n\ + \ntext: {{headline_text}} \\category:\n\n" +include: masakhanews +task: masakhanews_lug_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_orm.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9bacf0420b81df236351a3698e37cf3eca8983e7 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_orm.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories technology, religion, politics, sports, health, entertainment, or business;\ + \ what is the topic of the Afaan Oromoo statement below? 
Return only the category.\ + \ \n\ntext: {{headline_text}} \\category:\n\n" +include: masakhanews +task: masakhanews_orm_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_pcm.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_pcm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e873becd56c21cd0d92841994a4fb6bed5119b51 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_pcm.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: pcm +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories technology, religion, politics, sports, health, entertainment, or business;\ + \ what is the topic of the Nigerian Pidgin statement below? Return only the category.\ + \ \n\ntext: {{headline_text}} \\category:\n\n" +include: masakhanews +task: masakhanews_pcm_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_run.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_run.yaml new file mode 100644 index 0000000000000000000000000000000000000000..307e13710dde72132a0df4011500aca4ccfd9e22 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_run.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: run +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories technology, religion, politics, sports, health, entertainment, or business;\ + \ what is the topic of the Kirundi statement below? Return only the category. 
\n\ + \ntext: {{headline_text}} \\category:\n\n" +include: masakhanews +task: masakhanews_run_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_sna.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ee69be3de11efe89bac3dd355f4a9ffe99206c37 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_sna.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories technology, religion, politics, sports, health, entertainment, or business;\ + \ what is the topic of the Shona statement below? Return only the category. \n\n\ + text: {{headline_text}} \\category:\n\n" +include: masakhanews +task: masakhanews_sna_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_som.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_som.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8c181fddb819a9f42f18a31141bc71f837f759cd --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_som.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: som +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories technology, religion, politics, sports, health, entertainment, or business;\ + \ what is the topic of the Somali statement below? Return only the category. 
\n\n\ + text: {{headline_text}} \\category:\n\n" +include: masakhanews +task: masakhanews_som_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_swa.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fbe1c4200f9953d603eae6262067facfc09fe694 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_swa.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories technology, religion, politics, sports, health, entertainment, or business;\ + \ what is the topic of the Swahili statement below? Return only the category. \n\ + \ntext: {{headline_text}} \\category:\n\n" +include: masakhanews +task: masakhanews_swa_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_tir.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_tir.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b6055da2859d8932b3d6c40130d895846069f285 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_tir.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: tir +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories technology, religion, politics, sports, health, entertainment, or business;\ + \ what is the topic of the Tigrinya statement below? Return only the category. 
\n\ + \ntext: {{headline_text}} \\category:\n\n" +include: masakhanews +task: masakhanews_tir_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_xho.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..110fc08778130c4880b904a682211df4800a2cd5 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_xho.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories technology, religion, politics, sports, health, entertainment, or business;\ + \ what is the topic of the Xhosa statement below? Return only the category. \n\n\ + text: {{headline_text}} \\category:\n\n" +include: masakhanews +task: masakhanews_xho_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_yor.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d31e9b23fdfbdc5bbc85be20457c80c9b90c4a31 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_3/masakhanews_yor.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories technology, religion, politics, sports, health, entertainment, or business;\ + \ what is the topic of the Yoruba statement below? Return only the category. 
\n\n\ + text: {{headline_text}} \\category:\n\n" +include: masakhanews +task: masakhanews_yor_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_3/utils.py b/lm_eval/tasks/afrobench/masakhanews/prompt_3/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_3/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews new file mode 100644 index 0000000000000000000000000000000000000000..a1801f4e00b1d90a885ed9d73a14c2745cd73f01 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews @@ -0,0 +1,43 @@ +tag: +- masakhanews_tasks +- masakhanews_prompt_4 +- afrobench_TC_tasks +dataset_path: masakhane/masakhanews +dataset_name: null +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_target: label +doc_to_choice: + - "business" + - "entertainment" + - "health" + - "politics" + - "religion" + - "sports" + - "technology" +should_decontaminate: true +doc_to_decontamination_query: headline_text +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_amh.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a76305859c1e27a1b00b3be6492a76b207d313da --- /dev/null +++ 
b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_amh.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: "Label the following text as technology, religion, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_amh_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_eng.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8567113756fde2e8e98c5f6f2f68073a5d14550b --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_eng.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng +doc_to_text: "Label the following text as technology, religion, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_eng_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_fra.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3f86635f6d5edf28590264ae45a0f3546d868feb --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_fra.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: fra +doc_to_text: "Label the following text as technology, religion, politics, sports,\ + \ health, entertainment, or geography. 
Provide only the category as your response.\ + \ \n\ntext: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_fra_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_hau.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c1b7ce562b51486f41ce75c6716eda24d56caf1f --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_hau.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: "Label the following text as technology, religion, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_hau_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_ibo.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d76a905d62d2e93f09608684592dff02c60f131c --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_ibo.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: "Label the following text as technology, religion, politics, sports,\ + \ health, entertainment, or geography. 
Provide only the category as your response.\ + \ \n\ntext: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_ibo_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_lin.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e0247529b5c4496cce3f52651f6969e894484bb1 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_lin.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lin +doc_to_text: "Label the following text as technology, religion, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_lin_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_lug.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ca02c0a5fcbd248c82e945646d932614c8e515e8 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_lug.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: "Label the following text as technology, religion, politics, sports,\ + \ health, entertainment, or geography. 
Provide only the category as your response.\ + \ \n\ntext: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_lug_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_orm.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..781eb4cc977bd0ed74698737c56c9f97190b9623 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_orm.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: "Label the following text as technology, religion, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_orm_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_pcm.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_pcm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..93ad9f482b3539efd0b4e7ab89b64ab75ce91147 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_pcm.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: pcm +doc_to_text: "Label the following text as technology, religion, politics, sports,\ + \ health, entertainment, or geography. 
Provide only the category as your response.\ + \ \n\ntext: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_pcm_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_run.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_run.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f5d985481f1b4443cadd4c6d1ef12424825e02cb --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_run.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: run +doc_to_text: "Label the following text as technology, religion, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_run_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_sna.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2676db850ff3b7ec98dfdd2b596e8d3011b30915 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_sna.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: "Label the following text as technology, religion, politics, sports,\ + \ health, entertainment, or geography. 
Provide only the category as your response.\ + \ \n\ntext: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_sna_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_som.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_som.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6562da417b3a50c0d712038db88bd4f205c13df8 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_som.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: som +doc_to_text: "Label the following text as technology, religion, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_som_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_swa.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3bb9764ad0dc0c69ba98fc85fcb1a51cda37c3b8 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_swa.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: "Label the following text as technology, religion, politics, sports,\ + \ health, entertainment, or geography. 
Provide only the category as your response.\ + \ \n\ntext: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_swa_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_tir.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_tir.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3dfb1d4e7de9510175386192bcdf7f4524181308 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_tir.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tir +doc_to_text: "Label the following text as technology, religion, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_tir_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_xho.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9c1b51c20386a2c1d5196d4da47223603b5637ab --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_xho.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: "Label the following text as technology, religion, politics, sports,\ + \ health, entertainment, or geography. 
Provide only the category as your response.\ + \ \n\ntext: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_xho_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_yor.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9d22d1c7f59e8e2103d605b3e5c9c4dd08811bfb --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_4/masakhanews_yor.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: "Label the following text as technology, religion, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_yor_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_4/utils.py b/lm_eval/tasks/afrobench/masakhanews/prompt_4/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_4/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews new file mode 100644 index 0000000000000000000000000000000000000000..8d76af03ab044d68314853c0a0005a05141c1dca --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews @@ -0,0 +1,43 @@ +tag: +- masakhanews_tasks +- masakhanews_prompt_5 +- afrobench_TC_tasks +dataset_path: masakhane/masakhanews +dataset_name: null +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_target: label +doc_to_choice: + - "business" + - "entertainment" + - "health" + - "politics" + - "religion" + - "sports" + - "technology" +should_decontaminate: true +doc_to_decontamination_query: headline_text +metric_list: + - 
metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_amh.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..759ce913fe968c78eed1f302719b61dd0d62aa2a --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_amh.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: amh +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Amharic text. For each input, classify the topic as technology, business, politics,\ + \ sports, health, entertainment, or religion. Use the following guidelines: \n\n\ + \ technology: The text discusses scientific discoveries, technological advancements,\ + \ or related topics. \npolitics: The text covers political events, policies, or\ + \ related topics. \nsports: The text talks about sports events, athletes, or related\ + \ topics. \nhealth: The text addresses health issues, medical advancements, or related\ + \ topics. \nentertainment: The text pertains to movies, music, celebrities, or related\ + \ topics. \nreligion: The text talks about relgions, religious institutions and\ + \ beliefs or related topics. \n\nbusiness: The text covers economy, business, or\ + \ related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_amh_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_eng.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2c03032b48ecf521e9563277d2b149c703171321 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_eng.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: eng +doc_to_text: "You are tasked with performing topic classification on the following\ + \ English text. For each input, classify the topic as technology, business, politics,\ + \ sports, health, entertainment, or religion. Use the following guidelines: \n\n\ + \ technology: The text discusses scientific discoveries, technological advancements,\ + \ or related topics. \npolitics: The text covers political events, policies, or\ + \ related topics. \nsports: The text talks about sports events, athletes, or related\ + \ topics. \nhealth: The text addresses health issues, medical advancements, or related\ + \ topics. \nentertainment: The text pertains to movies, music, celebrities, or related\ + \ topics. \nreligion: The text talks about relgions, religious institutions and\ + \ beliefs or related topics. \n\nbusiness: The text covers economy, business, or\ + \ related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_eng_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_fra.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..603d149d733336355b4874a5bbffe61786a9edd7 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_fra.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: fra +doc_to_text: "You are tasked with performing topic classification on the following\ + \ French text. For each input, classify the topic as technology, business, politics,\ + \ sports, health, entertainment, or religion. Use the following guidelines: \n\n\ + \ technology: The text discusses scientific discoveries, technological advancements,\ + \ or related topics. \npolitics: The text covers political events, policies, or\ + \ related topics. \nsports: The text talks about sports events, athletes, or related\ + \ topics. \nhealth: The text addresses health issues, medical advancements, or related\ + \ topics. \nentertainment: The text pertains to movies, music, celebrities, or related\ + \ topics. \nreligion: The text talks about relgions, religious institutions and\ + \ beliefs or related topics. \n\nbusiness: The text covers economy, business, or\ + \ related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_fra_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_hau.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..04a478cf6a5b63269c1ef2ef061d50fd08f95c11 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_hau.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Hausa text. For each input, classify the topic as technology, business, politics,\ + \ sports, health, entertainment, or religion. Use the following guidelines: \n\n\ + \ technology: The text discusses scientific discoveries, technological advancements,\ + \ or related topics. \npolitics: The text covers political events, policies, or\ + \ related topics. \nsports: The text talks about sports events, athletes, or related\ + \ topics. \nhealth: The text addresses health issues, medical advancements, or related\ + \ topics. \nentertainment: The text pertains to movies, music, celebrities, or related\ + \ topics. \nreligion: The text talks about relgions, religious institutions and\ + \ beliefs or related topics. \n\nbusiness: The text covers economy, business, or\ + \ related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_hau_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_ibo.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ce3cc15b942e3cdbf02fa8b885aa2b796b409544 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_ibo.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Igbo text. For each input, classify the topic as technology, business, politics,\ + \ sports, health, entertainment, or religion. Use the following guidelines: \n\n\ + \ technology: The text discusses scientific discoveries, technological advancements,\ + \ or related topics. \npolitics: The text covers political events, policies, or\ + \ related topics. \nsports: The text talks about sports events, athletes, or related\ + \ topics. \nhealth: The text addresses health issues, medical advancements, or related\ + \ topics. \nentertainment: The text pertains to movies, music, celebrities, or related\ + \ topics. \nreligion: The text talks about relgions, religious institutions and\ + \ beliefs or related topics. \n\nbusiness: The text covers economy, business, or\ + \ related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_ibo_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_lin.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e22303fe79bca34cef9784b7ecea4fe1d1a39ab7 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_lin.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: lin +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Lingala text. For each input, classify the topic as technology, business, politics,\ + \ sports, health, entertainment, or religion. Use the following guidelines: \n\n\ + \ technology: The text discusses scientific discoveries, technological advancements,\ + \ or related topics. \npolitics: The text covers political events, policies, or\ + \ related topics. \nsports: The text talks about sports events, athletes, or related\ + \ topics. \nhealth: The text addresses health issues, medical advancements, or related\ + \ topics. \nentertainment: The text pertains to movies, music, celebrities, or related\ + \ topics. \nreligion: The text talks about relgions, religious institutions and\ + \ beliefs or related topics. \n\nbusiness: The text covers economy, business, or\ + \ related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_lin_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_lug.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fe949b6f3c7c60a48d72ecf58d47aac7d36cb130 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_lug.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Luganda text. For each input, classify the topic as technology, business, politics,\ + \ sports, health, entertainment, or religion. Use the following guidelines: \n\n\ + \ technology: The text discusses scientific discoveries, technological advancements,\ + \ or related topics. \npolitics: The text covers political events, policies, or\ + \ related topics. \nsports: The text talks about sports events, athletes, or related\ + \ topics. \nhealth: The text addresses health issues, medical advancements, or related\ + \ topics. \nentertainment: The text pertains to movies, music, celebrities, or related\ + \ topics. \nreligion: The text talks about relgions, religious institutions and\ + \ beliefs or related topics. \n\nbusiness: The text covers economy, business, or\ + \ related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_lug_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_orm.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_orm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..413e88125dc1e69cb4aac285ed8f8fc59b887bfe --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_orm.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: orm +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Afaan Oromoo text. For each input, classify the topic as technology, business,\ + \ politics, sports, health, entertainment, or religion. Use the following guidelines:\ + \ \n\n technology: The text discusses scientific discoveries, technological advancements,\ + \ or related topics. \npolitics: The text covers political events, policies, or\ + \ related topics. \nsports: The text talks about sports events, athletes, or related\ + \ topics. \nhealth: The text addresses health issues, medical advancements, or related\ + \ topics. \nentertainment: The text pertains to movies, music, celebrities, or related\ + \ topics. \nreligion: The text talks about relgions, religious institutions and\ + \ beliefs or related topics. \n\nbusiness: The text covers economy, business, or\ + \ related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_orm_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_pcm.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_pcm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b9322857eaaa71c77ed2399e23470b8085ebd4f5 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_pcm.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: pcm +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Nigerian Pidgin text. For each input, classify the topic as technology, business,\ + \ politics, sports, health, entertainment, or religion. Use the following guidelines:\ + \ \n\n technology: The text discusses scientific discoveries, technological advancements,\ + \ or related topics. \npolitics: The text covers political events, policies, or\ + \ related topics. \nsports: The text talks about sports events, athletes, or related\ + \ topics. \nhealth: The text addresses health issues, medical advancements, or related\ + \ topics. \nentertainment: The text pertains to movies, music, celebrities, or related\ + \ topics. \nreligion: The text talks about relgions, religious institutions and\ + \ beliefs or related topics. \n\nbusiness: The text covers economy, business, or\ + \ related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_pcm_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_run.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_run.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6f207fb703debacf6655975485fe188b13f4313d --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_run.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: run +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Kirundi text. For each input, classify the topic as technology, business, politics,\ + \ sports, health, entertainment, or religion. Use the following guidelines: \n\n\ + \ technology: The text discusses scientific discoveries, technological advancements,\ + \ or related topics. \npolitics: The text covers political events, policies, or\ + \ related topics. \nsports: The text talks about sports events, athletes, or related\ + \ topics. \nhealth: The text addresses health issues, medical advancements, or related\ + \ topics. \nentertainment: The text pertains to movies, music, celebrities, or related\ + \ topics. \nreligion: The text talks about relgions, religious institutions and\ + \ beliefs or related topics. \n\nbusiness: The text covers economy, business, or\ + \ related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_run_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_sna.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..737d335e6df283c7bf9f81c186c6e90f0cd81991 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_sna.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Shona text. For each input, classify the topic as technology, business, politics,\ + \ sports, health, entertainment, or religion. Use the following guidelines: \n\n\ + \ technology: The text discusses scientific discoveries, technological advancements,\ + \ or related topics. \npolitics: The text covers political events, policies, or\ + \ related topics. \nsports: The text talks about sports events, athletes, or related\ + \ topics. \nhealth: The text addresses health issues, medical advancements, or related\ + \ topics. \nentertainment: The text pertains to movies, music, celebrities, or related\ + \ topics. \nreligion: The text talks about relgions, religious institutions and\ + \ beliefs or related topics. \n\nbusiness: The text covers economy, business, or\ + \ related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_sna_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_som.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_som.yaml new file mode 100644 index 0000000000000000000000000000000000000000..39bb80c47bd7f16e34ec4ebefbdea0a08e6a6bef --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_som.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: som +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Somali text. For each input, classify the topic as technology, business, politics,\ + \ sports, health, entertainment, or religion. Use the following guidelines: \n\n\ + \ technology: The text discusses scientific discoveries, technological advancements,\ + \ or related topics. \npolitics: The text covers political events, policies, or\ + \ related topics. \nsports: The text talks about sports events, athletes, or related\ + \ topics. \nhealth: The text addresses health issues, medical advancements, or related\ + \ topics. \nentertainment: The text pertains to movies, music, celebrities, or related\ + \ topics. \nreligion: The text talks about relgions, religious institutions and\ + \ beliefs or related topics. \n\nbusiness: The text covers economy, business, or\ + \ related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_som_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_swa.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c59e359c21af54c0f9e78950925fb16e2e0e5b29 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_swa.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Swahili text. For each input, classify the topic as technology, business, politics,\ + \ sports, health, entertainment, or religion. Use the following guidelines: \n\n\ + \ technology: The text discusses scientific discoveries, technological advancements,\ + \ or related topics. \npolitics: The text covers political events, policies, or\ + \ related topics. \nsports: The text talks about sports events, athletes, or related\ + \ topics. \nhealth: The text addresses health issues, medical advancements, or related\ + \ topics. \nentertainment: The text pertains to movies, music, celebrities, or related\ + \ topics. \nreligion: The text talks about relgions, religious institutions and\ + \ beliefs or related topics. \n\nbusiness: The text covers economy, business, or\ + \ related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_swa_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_tir.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_tir.yaml new file mode 100644 index 0000000000000000000000000000000000000000..959de7a803f556ecde803744c6dc1451ca4493d0 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_tir.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: tir +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Tigrinya text. For each input, classify the topic as technology, business, politics,\ + \ sports, health, entertainment, or religion. Use the following guidelines: \n\n\ + \ technology: The text discusses scientific discoveries, technological advancements,\ + \ or related topics. \npolitics: The text covers political events, policies, or\ + \ related topics. \nsports: The text talks about sports events, athletes, or related\ + \ topics. \nhealth: The text addresses health issues, medical advancements, or related\ + \ topics. \nentertainment: The text pertains to movies, music, celebrities, or related\ + \ topics. \nreligion: The text talks about relgions, religious institutions and\ + \ beliefs or related topics. \n\nbusiness: The text covers economy, business, or\ + \ related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_tir_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_xho.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..35cad7295a830661882c30930153700936817082 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_xho.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Xhosa text. For each input, classify the topic as technology, business, politics,\ + \ sports, health, entertainment, or religion. Use the following guidelines: \n\n\ + \ technology: The text discusses scientific discoveries, technological advancements,\ + \ or related topics. \npolitics: The text covers political events, policies, or\ + \ related topics. \nsports: The text talks about sports events, athletes, or related\ + \ topics. \nhealth: The text addresses health issues, medical advancements, or related\ + \ topics. \nentertainment: The text pertains to movies, music, celebrities, or related\ + \ topics. \nreligion: The text talks about relgions, religious institutions and\ + \ beliefs or related topics. \n\nbusiness: The text covers economy, business, or\ + \ related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_xho_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_yor.yaml b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e83c70454d5cd330383f49a7fa3bbaf0f1226790 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_5/masakhanews_yor.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Yoruba text. For each input, classify the topic as technology, business, politics,\ + \ sports, health, entertainment, or religion. Use the following guidelines: \n\n\ + \ technology: The text discusses scientific discoveries, technological advancements,\ + \ or related topics. \npolitics: The text covers political events, policies, or\ + \ related topics. \nsports: The text talks about sports events, athletes, or related\ + \ topics. \nhealth: The text addresses health issues, medical advancements, or related\ + \ topics. \nentertainment: The text pertains to movies, music, celebrities, or related\ + \ topics. \nreligion: The text talks about relgions, religious institutions and\ + \ beliefs or related topics. \n\nbusiness: The text covers economy, business, or\ + \ related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{headline_text}} \\category: \n\n" +include: masakhanews +task: masakhanews_yor_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhanews/prompt_5/utils.py b/lm_eval/tasks/afrobench/masakhanews/prompt_5/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/prompt_5/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/masakhanews/utils.py b/lm_eval/tasks/afrobench/masakhanews/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..310a7aeb5af2b998d57c6a793f27b00c8ab04029 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhanews/utils.py @@ -0,0 +1,127 @@ +import argparse +import os + +import yaml + + +def prompt_func(mode, lang): + prompt_map = { + "prompt_1": "Given the categories technology, business, politics, sports, health, entertainment, or religion; what category does the text: '{{headline}}' belong to: \n\n", + "prompt_2": f"Does this {lang} topic; " + "'{{headline}}' belong to one of the following categories: technology, business, politics, sports, health, entertainment, or religion? category only\n\n", + "prompt_3": f"You are an assistant able to classify topics in texts. \n\n" + f"Given the categories technology, religion, politics, sports, health, entertainment, or business; what is " + f"the topic of the {lang} statement below? Return only the category. " + "\n\ntext: {{headline}} \category:\n\n", + "prompt_4": "Label the following text as technology, religion, politics, sports, health, entertainment, or geography. Provide only the category as your " + "response. \n\ntext: {{headline}} \category: \n\n", + "prompt_5": f"You are tasked with performing topic classification on the following {lang} text. 
" + f"For each input, classify the topic as technology, business, politics, sports, health, entertainment, or religion. " + f"Use the following guidelines: \n\n " + f"technology: The text discusses scientific discoveries, technological advancements, or related topics. \n" + f"politics: The text covers political events, policies, or related topics. \n" + f"sports: The text talks about sports events, athletes, or related topics. \n" + f"health: The text addresses health issues, medical advancements, or related topics. \n" + f"entertainment: The text pertains to movies, music, celebrities, or related topics. \n" + f"religion: The text talks about relgions, religious institutions and beliefs or related topics. \n\n" + f"business: The text covers economy, business, or related topics. \n\n" + f"If the text contains multiple topics, choose the dominant topic. " + f"For ambiguous or unclear topics, select the category that best reflects the overall content. " + "Please provide a single classification for each input.\n\ntext: {{headline}} \category: \n\n", + } + return prompt_map[mode] + + +def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: + """ + Generate a yaml file for each language. + + :param output_dir: The directory to output the files to. + :param overwrite: Whether to overwrite files if they already exist. 
+ """ + err = [] + languages = { + "amh": "Amharic", + "eng": "English", + "fra": "French", + "hau": "Hausa", + "ibo": "Igbo", + "lin": "Lingala", + "lug": "Luganda", + "orm": "Afaan Oromoo", + "pcm": "Nigerian Pidgin", + "run": "Kirundi", + "sna": "Shona", + "som": "Somali", + "swa": "Swahili", + "tir": "Tigrinya", + "xho": "Xhosa", + "yor": "Yoruba", + } + + for lang in languages.keys(): + try: + file_name = f"masakhanews_{lang}.yaml" + task_name = f"masakhanews_{lang}_{mode}" + yaml_template = "masakhanews" + yaml_details = { + "include": yaml_template, + "task": task_name, + "dataset_name": lang, + "doc_to_text": prompt_func(mode, languages[lang]), + } + file_path = os.path.join(output_dir, mode) + os.makedirs(file_path, exist_ok=True) + + with open( + f"{output_dir}/{mode}/{file_name}", + "w" if overwrite else "x", + encoding="utf8", + ) as f: + f.write("# Generated by utils.py\n") + yaml.dump( + yaml_details, + f, + allow_unicode=True, + ) + except FileExistsError: + err.append(file_name) + + if len(err) > 0: + raise FileExistsError( + "Files were not created because they already exist (use --overwrite flag):" + f" {', '.join(err)}" + ) + + +def main() -> None: + """Parse CLI args and generate language-specific yaml files.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--overwrite", + default=True, + action="store_true", + help="Overwrite files if they already exist", + ) + parser.add_argument( + "--output-dir", + default="./", + help="Directory to write yaml files to", + ) + + PROMPT_CHOICES = ["prompt_1", "prompt_2", "prompt_3", "prompt_4", "prompt_5"] + parser.add_argument( + "--mode", + nargs="*", + default=PROMPT_CHOICES, + choices=PROMPT_CHOICES, + help="Prompt number(s)", + ) + args = parser.parse_args() + + for mode in args.mode: + gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite, mode=mode) + + +if __name__ == "__main__": + main() diff --git a/lm_eval/tasks/afrobench/masakhapos/README.md 
b/lm_eval/tasks/afrobench/masakhapos/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1fcf11c780e88864fef93b46ef536cc11f33e60b --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/README.md @@ -0,0 +1,75 @@ +# + +## Paper +Title: `MasakhaPOS: Part-of-Speech Tagging for Typologically Diverse African languages` + +Paper Link: https://aclanthology.org/2023.acl-long.609/ + +## Abstract +>In this paper, we present AfricaPOS, the largest part-of-speech (POS) dataset for 20 typologically diverse African languages. We discuss the challenges in annotating POS for these languages using the universal dependencies (UD) guidelines. We conducted extensive POS baseline experiments using both conditional random field and several multilingual pre-trained language models. We applied various cross-lingual transfer models trained with data available in the UD. Evaluating on the AfricaPOS dataset, we show that choosing the best transfer language(s) in both single-source and multi-source setups greatly improves the POS tagging performance of the target languages, in particular when combined with parameter-fine-tuning methods. Crucially, transferring knowledge from a language that matches the language family and morphosyntactic properties seems to be more effective for POS tagging in unseen languages. + +HomePage: https://github.com/masakhane-io/masakhane-pos + +### Citation + +``` +@inproceedings{dione-etal-2023-masakhapos, + title = "{M}asakha{POS}: Part-of-Speech Tagging for Typologically Diverse {A}frican languages", + author = "Dione, Cheikh M. Bamba and + Adelani, David Ifeoluwa and + Nabende, Peter and + Alabi, Jesujoba and + Sindane, Thapelo and + Buzaaba, Happy and + Muhammad, Shamsuddeen Hassan and + Emezue, Chris Chinenye and + Ogayo, Perez and + Aremu, Anuoluwapo and + Gitau, Catherine and + Mbaye, Derguene and + Mukiibi, Jonathan and + Sibanda, Blessing and + Dossou, Bonaventure F. P. 
and + Bukula, Andiswa and + Mabuya, Rooweither and + Tapo, Allahsera Auguste and + Munkoh-Buabeng, Edwin and + Memdjokam Koagne, Victoire and + Ouoba Kabore, Fatoumata and + Taylor, Amelia and + Kalipe, Godson and + Macucwa, Tebogo and + Marivate, Vukosi and + Gwadabe, Tajuddeen and + Elvis, Mboning Tchiaze and + Onyenwe, Ikechukwu and + Atindogbe, Gratien and + Adelani, Tolulope and + Akinade, Idris and + Samuel, Olanrewaju and + Nahimana, Marien and + Musabeyezu, Th{\'e}og{\`e}ne and + Niyomutabazi, Emile and + Chimhenga, Ester and + Gotosa, Kudzai and + Mizha, Patrick and + Agbolo, Apelete and + Traore, Seydou and + Uchechukwu, Chinedu and + Yusuf, Aliyu and + Abdullahi, Muhammad and + Klakow, Dietrich", + editor = "Rogers, Anna and + Boyd-Graber, Jordan and + Okazaki, Naoaki", + booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)", + month = jul, + year = "2023", + address = "Toronto, Canada", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2023.acl-long.609/", + doi = "10.18653/v1/2023.acl-long.609", + pages = "10883--10900", + abstract = "In this paper, we present AfricaPOS, the largest part-of-speech (POS) dataset for 20 typologically diverse African languages. We discuss the challenges in annotating POS for these languages using the universal dependencies (UD) guidelines. We conducted extensive POS baseline experiments using both conditional random field and several multilingual pre-trained language models. We applied various cross-lingual transfer models trained with data available in the UD. Evaluating on the AfricaPOS dataset, we show that choosing the best transfer language(s) in both single-source and multi-source setups greatly improves the POS tagging performance of the target languages, in particular when combined with parameter-fine-tuning methods. 
Crucially, transferring knowledge from a language that matches the language family and morphosyntactic properties seems to be more effective for POS tagging in unseen languages." +} +``` diff --git a/lm_eval/tasks/afrobench/masakhapos/gen_utils.py b/lm_eval/tasks/afrobench/masakhapos/gen_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..52b9dafb435cf5f24664d7fb9c8ba73a687a7d4c --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/gen_utils.py @@ -0,0 +1,151 @@ +import argparse +import os + +import yaml + + +class FunctionTag: + def __init__(self, value): + self.value = value + + +def prompt_func(mode, lang): + prompt_map = { + "prompt_1": "Please provide the POS tags for each word in the input sentence. The input will be a list of " + "words in the sentence. The output format should be a list of tuples, where each tuple consists of " + "a word from the input text and its corresponding POS tag label from the tag label set: ['ADJ', " + "'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', " + "'SCONJ', 'SYM', 'VERB', 'X']. \nYour response should include only a list of tuples, in the order " + "that the words appear in the input sentence, including punctuations, with each tuple containing the corresponding POS tag " + "label for a word. \n\nSentence: {{tokens}} \nOutput: ", + "prompt_2": f"You are an expert in tagging words and sentences in {lang} with the right POS tag. " + f"\n\nPlease provide the POS tags for each word in the {lang} sentence. The input is a list of words in" + " the sentence. POS tag label set: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', " + "'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']. 
The output format should " + "be a list of tuples, where each tuple consists of a word from the input text and its corresponding" + " POS tag label from the POS tag label set provided\nYour response should include only a list of " + "tuples, in the order that the words appear in the input sentence, including punctuations, with each tuple containing the " + "corresponding POS tag label for a word. \n\nSentence: {{tokens}} \nOutput: ", + "prompt_3": f"Acting as a {lang} linguist and without making any corrections or changes to the text, perform a part of " + "speech (POS) analysis of the sentences using the following POS tag label annotation ['ADJ', " + "'ADP', 'ADV', 'AUX', 'CCONJ, 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', " + "'SCONJ', 'SYM', 'VERB', 'X']. The input will be a list of words in the sentence. The output format should " + "be a list of tuples, where each tuple consists of a word from the input text and its corresponding" + " POS tag label from the POS tag label set provided\nYour response should include only a list of " + "tuples, in the order that the words appear in the input sentence, including punctuations, with each tuple containing the " + "corresponding POS tag label for a word. \n\nSentence: {{tokens}} \nOutput: ", + "prompt_4": "Annotate each word in the provided sentence with the appropriate POS tag. The annotation " + "list is given as: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', " + "'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']. The input sentence will be a list of words" + " in the sentence. 
The output format should " + "be a list of tuples, where each tuple consists of a word from the input text and its corresponding" + " POS tag label from the POS tag label set provided\nYour response should include only a list of " + "tuples, in the order that the words appear in the input sentence, including punctuations, with each tuple containing the " + "corresponding POS tag label for a word. \n\nSentence: {{tokens}} \nOutput: ", + "prompt_5": "Given the following sentence, identify the part of speech (POS) for each word. Use the following " + "POS tag set: \nNOUN: Noun (person, place, thing), \nVERB: Verb (action, state), " + "\nADJ: Adjective (describes a noun), \nADV: Adverb (modifies a verb, adjective, or adverb), " + "\nPRON: Pronoun (replaces a noun), \nDET: Determiner (introduces a noun), " + "\nADP: Adposition (preposition or postposition), \nCCONJ: Conjunction (connects words, phrases, clauses)" + "\nPUNCT: Punctuation, \nPROPN: Proper Noun, \nAUX: Auxiliary verb (helper verb), " + "\nSCONJ: Subordinating conjunction \nPART: Particle, \nSYM: Symbol, \nINTJ: Interjection, " + "\nNUM: Numeral, \nX: others. The output format should " + "be a list of tuples, where each tuple consists of a word from the input text and its corresponding" + " POS tag label key only from the POS tag set provided\nYour response should include only a list of " + "tuples, in the order that the words appear in the input sentence, including punctuations, with each tuple containing the " + "corresponding POS tag label for a word. \n\nSentence: {{tokens}} \nOutput: ", + } + return prompt_map[mode] + + +def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: + """ + Generate a yaml file for each language. + + :param output_dir: The directory to output the files to. + :param overwrite: Whether to overwrite files if they already exist. 
+ """ + err = [] + languages = { + "bam": "Bambara", + "bbj": "Ghomala", + "ewe": "Ewe", + "fon": "Fon", + "hau": "Hausa", + "ibo": "Igbo", + "kin": "Kinyarwanda", + "lug": "Luganda", + "luo": "Dholuo", + "mos": "Mossi", + "nya": "Chichewa", + "pcm": "Nigerian Pidgin", + "sna": "chiShona", + "swa": "Kiswahili", + "tsn": "Setswana", + "twi": "Twi", + "wol": "Wolof", + "xho": "isiXhosa", + "yor": "Yoruba", + "zul": "isiZulu", + } + + for lang in languages.keys(): + try: + file_name = f"masakhapos_{lang}.yaml" + task_name = f"masakhapos_{lang}_{mode}" + yaml_template = "masakhapos_yaml" + yaml_details = { + "include": yaml_template, + "task": task_name, + "dataset_name": lang, + "doc_to_text": prompt_func(mode, languages[lang]), + } + os.makedirs(f"{output_dir}/{mode}", exist_ok=True) + with open( + f"{output_dir}/{mode}/{file_name}", + "w" if overwrite else "x", + encoding="utf8", + ) as f: + f.write("# Generated by utils.py\n") + yaml.dump( + yaml_details, + f, + allow_unicode=True, + ) + except FileExistsError: + err.append(file_name) + + if len(err) > 0: + raise FileExistsError( + "Files were not created because they already exist (use --overwrite flag):" + f" {', '.join(err)}" + ) + + +def main() -> None: + """Parse CLI args and generate language-specific yaml files.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--overwrite", + default=True, + action="store_true", + help="Overwrite files if they already exist", + ) + parser.add_argument( + "--output-dir", + default="./", + help="Directory to write yaml files to", + ) + parser.add_argument( + "--mode", + default="prompt_1", + choices=["prompt_1", "prompt_2", "prompt_3", "prompt_4", "prompt_5"], + help="Prompt number", + ) + args = parser.parse_args() + + gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite, mode=args.mode) + + +if __name__ == "__main__": + main() diff --git a/lm_eval/tasks/afrobench/masakhapos/masakhapos.yaml b/lm_eval/tasks/afrobench/masakhapos/masakhapos.yaml 
new file mode 100644 index 0000000000000000000000000000000000000000..3fb1574eb32a0203198a4d210c788765cf476f34 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/masakhapos.yaml @@ -0,0 +1,13 @@ +group: masakhapos +task: + - masakhapos_prompt_1 + - masakhapos_prompt_2 + - masakhapos_prompt_3 + - masakhapos_prompt_4 + - masakhapos_prompt_5 +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 1 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_bam.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_bam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b1c64e387ae638c83e30b1172f458c3976d20728 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_bam.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: bam +doc_to_text: "Please provide the POS tags for each word in the input sentence. The\ + \ input will be a list of words in the sentence. The output format should be a list\ + \ of tuples, where each tuple consists of a word from the input text and its corresponding\ + \ POS tag label from the tag label set: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. \nYour response should include only a list of tuples, in the order that\ + \ the words appear in the input sentence, including punctuations, with each tuple\ + \ containing the corresponding POS tag label for a word. 
\n\nSentence: {{tokens}}\ + \ \nOutput: " +include: masakhapos_yaml +task: masakhapos_bam_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_bbj.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_bbj.yaml new file mode 100644 index 0000000000000000000000000000000000000000..418c8e0ca6c411620056f280d51696e730107c2c --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_bbj.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: bbj +doc_to_text: "Please provide the POS tags for each word in the input sentence. The\ + \ input will be a list of words in the sentence. The output format should be a list\ + \ of tuples, where each tuple consists of a word from the input text and its corresponding\ + \ POS tag label from the tag label set: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. \nYour response should include only a list of tuples, in the order that\ + \ the words appear in the input sentence, including punctuations, with each tuple\ + \ containing the corresponding POS tag label for a word. \n\nSentence: {{tokens}}\ + \ \nOutput: " +include: masakhapos_yaml +task: masakhapos_bbj_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_ewe.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1eeb249744fc6f75bb7a08896fa0caaacdc1e84d --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_ewe.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_text: "Please provide the POS tags for each word in the input sentence. The\ + \ input will be a list of words in the sentence. 
The output format should be a list\ + \ of tuples, where each tuple consists of a word from the input text and its corresponding\ + \ POS tag label from the tag label set: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. \nYour response should include only a list of tuples, in the order that\ + \ the words appear in the input sentence, including punctuations, with each tuple\ + \ containing the corresponding POS tag label for a word. \n\nSentence: {{tokens}}\ + \ \nOutput: " +include: masakhapos_yaml +task: masakhapos_ewe_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_fon.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_fon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..da7eb7aee4ad2a0712cd49cf96546f69e26d8dc8 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_fon.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: fon +doc_to_text: "Please provide the POS tags for each word in the input sentence. The\ + \ input will be a list of words in the sentence. The output format should be a list\ + \ of tuples, where each tuple consists of a word from the input text and its corresponding\ + \ POS tag label from the tag label set: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. \nYour response should include only a list of tuples, in the order that\ + \ the words appear in the input sentence, including punctuations, with each tuple\ + \ containing the corresponding POS tag label for a word. 
\n\nSentence: {{tokens}}\ + \ \nOutput: " +include: masakhapos_yaml +task: masakhapos_fon_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_hau.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..431ed8f1656568111d4206a7a33c954b51cfa743 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_hau.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: "Please provide the POS tags for each word in the input sentence. The\ + \ input will be a list of words in the sentence. The output format should be a list\ + \ of tuples, where each tuple consists of a word from the input text and its corresponding\ + \ POS tag label from the tag label set: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. \nYour response should include only a list of tuples, in the order that\ + \ the words appear in the input sentence, including punctuations, with each tuple\ + \ containing the corresponding POS tag label for a word. \n\nSentence: {{tokens}}\ + \ \nOutput: " +include: masakhapos_yaml +task: masakhapos_hau_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_ibo.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0cb171fe3c93c8be6d6bee9b41e6596d769b5deb --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_ibo.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: "Please provide the POS tags for each word in the input sentence. The\ + \ input will be a list of words in the sentence. 
The output format should be a list\ + \ of tuples, where each tuple consists of a word from the input text and its corresponding\ + \ POS tag label from the tag label set: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. \nYour response should include only a list of tuples, in the order that\ + \ the words appear in the input sentence, including punctuations, with each tuple\ + \ containing the corresponding POS tag label for a word. \n\nSentence: {{tokens}}\ + \ \nOutput: " +include: masakhapos_yaml +task: masakhapos_ibo_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_kin.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dced04f22e7424c3d0c4f3a39f4cf58c331f759b --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_kin.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: "Please provide the POS tags for each word in the input sentence. The\ + \ input will be a list of words in the sentence. The output format should be a list\ + \ of tuples, where each tuple consists of a word from the input text and its corresponding\ + \ POS tag label from the tag label set: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. \nYour response should include only a list of tuples, in the order that\ + \ the words appear in the input sentence, including punctuations, with each tuple\ + \ containing the corresponding POS tag label for a word. 
\n\nSentence: {{tokens}}\ + \ \nOutput: " +include: masakhapos_yaml +task: masakhapos_kin_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_lug.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e773f6430c0f38d842c63ff8752a78d8a54dd87d --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_lug.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: "Please provide the POS tags for each word in the input sentence. The\ + \ input will be a list of words in the sentence. The output format should be a list\ + \ of tuples, where each tuple consists of a word from the input text and its corresponding\ + \ POS tag label from the tag label set: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. \nYour response should include only a list of tuples, in the order that\ + \ the words appear in the input sentence, including punctuations, with each tuple\ + \ containing the corresponding POS tag label for a word. \n\nSentence: {{tokens}}\ + \ \nOutput: " +include: masakhapos_yaml +task: masakhapos_lug_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_luo.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_luo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4544e2b1bce03c8f8fc8d0e82c1c6fbeab6f3570 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_luo.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: luo +doc_to_text: "Please provide the POS tags for each word in the input sentence. The\ + \ input will be a list of words in the sentence. 
The output format should be a list\ + \ of tuples, where each tuple consists of a word from the input text and its corresponding\ + \ POS tag label from the tag label set: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. \nYour response should include only a list of tuples, in the order that\ + \ the words appear in the input sentence, including punctuations, with each tuple\ + \ containing the corresponding POS tag label for a word. \n\nSentence: {{tokens}}\ + \ \nOutput: " +include: masakhapos_yaml +task: masakhapos_luo_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_mos.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_mos.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b0c7d3f6a3cd272812926812588744bd737dbb51 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_mos.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: mos +doc_to_text: "Please provide the POS tags for each word in the input sentence. The\ + \ input will be a list of words in the sentence. The output format should be a list\ + \ of tuples, where each tuple consists of a word from the input text and its corresponding\ + \ POS tag label from the tag label set: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. \nYour response should include only a list of tuples, in the order that\ + \ the words appear in the input sentence, including punctuations, with each tuple\ + \ containing the corresponding POS tag label for a word. 
\n\nSentence: {{tokens}}\ + \ \nOutput: " +include: masakhapos_yaml +task: masakhapos_mos_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_nya.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_nya.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f8d4fcbf23feecfaa7ef927dc0a2d9c090370469 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_nya.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: nya +doc_to_text: "Please provide the POS tags for each word in the input sentence. The\ + \ input will be a list of words in the sentence. The output format should be a list\ + \ of tuples, where each tuple consists of a word from the input text and its corresponding\ + \ POS tag label from the tag label set: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. \nYour response should include only a list of tuples, in the order that\ + \ the words appear in the input sentence, including punctuations, with each tuple\ + \ containing the corresponding POS tag label for a word. \n\nSentence: {{tokens}}\ + \ \nOutput: " +include: masakhapos_yaml +task: masakhapos_nya_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_pcm.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_pcm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2d05924ee2ba0c702fbb17de84cce0ed03e536bb --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_pcm.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: pcm +doc_to_text: "Please provide the POS tags for each word in the input sentence. The\ + \ input will be a list of words in the sentence. 
The output format should be a list\ + \ of tuples, where each tuple consists of a word from the input text and its corresponding\ + \ POS tag label from the tag label set: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. \nYour response should include only a list of tuples, in the order that\ + \ the words appear in the input sentence, including punctuations, with each tuple\ + \ containing the corresponding POS tag label for a word. \n\nSentence: {{tokens}}\ + \ \nOutput: " +include: masakhapos_yaml +task: masakhapos_pcm_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_sna.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7afa02f4f8b72801d5d782165a68694ef41cdc5a --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_sna.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: "Please provide the POS tags for each word in the input sentence. The\ + \ input will be a list of words in the sentence. The output format should be a list\ + \ of tuples, where each tuple consists of a word from the input text and its corresponding\ + \ POS tag label from the tag label set: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. \nYour response should include only a list of tuples, in the order that\ + \ the words appear in the input sentence, including punctuations, with each tuple\ + \ containing the corresponding POS tag label for a word. 
\n\nSentence: {{tokens}}\ + \ \nOutput: " +include: masakhapos_yaml +task: masakhapos_sna_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_swa.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ab2f123e1a42759c4f600bb90c4f8450cbc84edf --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_swa.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: "Please provide the POS tags for each word in the input sentence. The\ + \ input will be a list of words in the sentence. The output format should be a list\ + \ of tuples, where each tuple consists of a word from the input text and its corresponding\ + \ POS tag label from the tag label set: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. \nYour response should include only a list of tuples, in the order that\ + \ the words appear in the input sentence, including punctuations, with each tuple\ + \ containing the corresponding POS tag label for a word. \n\nSentence: {{tokens}}\ + \ \nOutput: " +include: masakhapos_yaml +task: masakhapos_swa_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_tsn.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_tsn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ca02f064a837e69871250046a44d5ed63253ec1f --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_tsn.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: tsn +doc_to_text: "Please provide the POS tags for each word in the input sentence. The\ + \ input will be a list of words in the sentence. 
The output format should be a list\ + \ of tuples, where each tuple consists of a word from the input text and its corresponding\ + \ POS tag label from the tag label set: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. \nYour response should include only a list of tuples, in the order that\ + \ the words appear in the input sentence, including punctuations, with each tuple\ + \ containing the corresponding POS tag label for a word. \n\nSentence: {{tokens}}\ + \ \nOutput: " +include: masakhapos_yaml +task: masakhapos_tsn_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_twi.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f22c093639eefc5747c288506d2cb28f90cd6ca6 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_twi.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: "Please provide the POS tags for each word in the input sentence. The\ + \ input will be a list of words in the sentence. The output format should be a list\ + \ of tuples, where each tuple consists of a word from the input text and its corresponding\ + \ POS tag label from the tag label set: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. \nYour response should include only a list of tuples, in the order that\ + \ the words appear in the input sentence, including punctuations, with each tuple\ + \ containing the corresponding POS tag label for a word. 
\n\nSentence: {{tokens}}\ + \ \nOutput: " +include: masakhapos_yaml +task: masakhapos_twi_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_wol.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e0bdd23a8a2203243fa657388b7eae8a2be1a28b --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_wol.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: "Please provide the POS tags for each word in the input sentence. The\ + \ input will be a list of words in the sentence. The output format should be a list\ + \ of tuples, where each tuple consists of a word from the input text and its corresponding\ + \ POS tag label from the tag label set: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. \nYour response should include only a list of tuples, in the order that\ + \ the words appear in the input sentence, including punctuations, with each tuple\ + \ containing the corresponding POS tag label for a word. \n\nSentence: {{tokens}}\ + \ \nOutput: " +include: masakhapos_yaml +task: masakhapos_wol_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_xho.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f712a874546298594bff74f47a85aa67bc5ae23b --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_xho.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: "Please provide the POS tags for each word in the input sentence. The\ + \ input will be a list of words in the sentence. 
The output format should be a list\ + \ of tuples, where each tuple consists of a word from the input text and its corresponding\ + \ POS tag label from the tag label set: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. \nYour response should include only a list of tuples, in the order that\ + \ the words appear in the input sentence, including punctuations, with each tuple\ + \ containing the corresponding POS tag label for a word. \n\nSentence: {{tokens}}\ + \ \nOutput: " +include: masakhapos_yaml +task: masakhapos_xho_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_yaml new file mode 100644 index 0000000000000000000000000000000000000000..bdca7a85d905f3e177b496b139ed9705f1a3e620 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_yaml @@ -0,0 +1,32 @@ +tag: +- masakhapos_tasks +- masakhapos_prompt_1 +dataset_path: masakhane/masakhapos +dataset_name: null +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +generation_kwargs: + do_sample: false + until: + - + - <|im_end|> +validation_split: validation +test_split: test +fewshot_split: train +doc_to_target: !function utils.doc_to_target +should_decontaminate: true +doc_to_decontamination_query: "Sentence: {{token}}\nOutput:" +filter_list: + - filter: + - function: regex_pos + name: flexible-extract +metric_list: + - metric: acc + aggregation: !function utils.acc_score + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_yor.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..efa8750a6200a2be388806f4f8da57f52f781b3c --- /dev/null +++ 
b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_yor.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: "Please provide the POS tags for each word in the input sentence. The\ + \ input will be a list of words in the sentence. The output format should be a list\ + \ of tuples, where each tuple consists of a word from the input text and its corresponding\ + \ POS tag label from the tag label set: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. \nYour response should include only a list of tuples, in the order that\ + \ the words appear in the input sentence, including punctuations, with each tuple\ + \ containing the corresponding POS tag label for a word. \n\nSentence: {{tokens}}\ + \ \nOutput: " +include: masakhapos_yaml +task: masakhapos_yor_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_zul.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..362c9934b856664dc1ca336d8420b170c5532813 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_1/masakhapos_zul.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: "Please provide the POS tags for each word in the input sentence. The\ + \ input will be a list of words in the sentence. The output format should be a list\ + \ of tuples, where each tuple consists of a word from the input text and its corresponding\ + \ POS tag label from the tag label set: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. \nYour response should include only a list of tuples, in the order that\ + \ the words appear in the input sentence, including punctuations, with each tuple\ + \ containing the corresponding POS tag label for a word. 
\n\nSentence: {{tokens}}\ + \ \nOutput: " +include: masakhapos_yaml +task: masakhapos_zul_prompt_1 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_1/utils.py b/lm_eval/tasks/afrobench/masakhapos/prompt_1/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4ccc66d9cce30c1459494f0d5c21a71d1d3f58d4 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_1/utils.py @@ -0,0 +1,55 @@ +from itertools import chain + +from sklearn.metrics import accuracy_score + +from lm_eval.utils import weighted_f1_score + + +def doc_to_target(doc): + pos_tag_map = { + 0: "NOUN", + 1: "PUNCT", + 2: "ADP", + 3: "NUM", + 4: "SYM", + 5: "SCONJ", + 6: "ADJ", + 7: "PART", + 8: "DET", + 9: "CCONJ", + 10: "PROPN", + 11: "PRON", + 12: "X", + 13: "_", + 14: "ADV", + 15: "INTJ", + 16: "VERB", + 17: "AUX", + } + return [pos_tag_map[tag] for tag in doc["upos"]] + + +def acc_score(items): + unzipped_list = list(zip(*items)) + + golds, preds = unzipped_list[0], unzipped_list[1] + + # Flatten preds' inner lists + flattened_preds = [list(chain.from_iterable(p)) for p in preds] + + # Calculate the accuracy for each gold-pred pair + accuracy_scores = [] + for gold, pred in zip(golds, flattened_preds): + # Ensure both lists are of the same length, otherwise truncate to match + min_length = min(len(gold), len(pred)) + gold = gold[:min_length] + pred = pred[:min_length] + + # Calculate accuracy for the current pair and add to the list + accuracy = accuracy_score(gold, pred) + accuracy_scores.append(accuracy) + + mean_accuracy = ( + sum(accuracy_scores) / len(accuracy_scores) if accuracy_scores else 0 + ) + return mean_accuracy diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_bam.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_bam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bde25d7e5c36fa84add36210bf728999f9dafcb2 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_bam.yaml @@ -0,0 +1,14 
@@ +# Generated by utils.py +dataset_name: bam +doc_to_text: "You are an expert in tagging words and sentences in Bambara with the\ + \ right POS tag. \n\nPlease provide the POS tags for each word in the Bambara sentence.\ + \ The input is a list of words in the sentence. POS tag label set: ['ADJ', 'ADP',\ + \ 'ADV', 'AUX', 'CCONJ, 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT',\ + \ 'SCONJ', 'SYM', 'VERB', 'X']. The output format should be a list of tuples, where\ + \ each tuple consists of a word from the input text and its corresponding POS tag\ + \ label from the POS tag label set provided\nYour response should include only a\ + \ list of tuples, in the order that the words appear in the input sentence, including\ + \ punctuations, with each tuple containing the corresponding POS tag label for a\ + \ word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_bam_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_bbj.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_bbj.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8439e6b03f209094e973cf0f9faddfd1a32495b0 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_bbj.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: bbj +doc_to_text: "You are an expert in tagging words and sentences in Ghomala with the\ + \ right POS tag. \n\nPlease provide the POS tags for each word in the Ghomala sentence.\ + \ The input is a list of words in the sentence. POS tag label set: ['ADJ', 'ADP',\ + \ 'ADV', 'AUX', 'CCONJ, 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT',\ + \ 'SCONJ', 'SYM', 'VERB', 'X']. 
The output format should be a list of tuples, where\ + \ each tuple consists of a word from the input text and its corresponding POS tag\ + \ label from the POS tag label set provided\nYour response should include only a\ + \ list of tuples, in the order that the words appear in the input sentence, including\ + \ punctuations, with each tuple containing the corresponding POS tag label for a\ + \ word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_bbj_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_ewe.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5ffa2ba95963fbe4cac38e5a419df3e98b140750 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_ewe.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_text: "You are an expert in tagging words and sentences in Ewe with the right\ + \ POS tag. \n\nPlease provide the POS tags for each word in the Ewe sentence. The\ + \ input is a list of words in the sentence. POS tag label set: ['ADJ', 'ADP', 'ADV',\ + \ 'AUX', 'CCONJ, 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT',\ + \ 'SCONJ', 'SYM', 'VERB', 'X']. The output format should be a list of tuples, where\ + \ each tuple consists of a word from the input text and its corresponding POS tag\ + \ label from the POS tag label set provided\nYour response should include only a\ + \ list of tuples, in the order that the words appear in the input sentence, including\ + \ punctuations, with each tuple containing the corresponding POS tag label for a\ + \ word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_ewe_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_fon.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_fon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..548f2de48255080669b96408c1975eff7958770b --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_fon.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: fon +doc_to_text: "You are an expert in tagging words and sentences in Fon with the right\ + \ POS tag. \n\nPlease provide the POS tags for each word in the Fon sentence. The\ + \ input is a list of words in the sentence. POS tag label set: ['ADJ', 'ADP', 'ADV',\ + \ 'AUX', 'CCONJ, 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT',\ + \ 'SCONJ', 'SYM', 'VERB', 'X']. The output format should be a list of tuples, where\ + \ each tuple consists of a word from the input text and its corresponding POS tag\ + \ label from the POS tag label set provided\nYour response should include only a\ + \ list of tuples, in the order that the words appear in the input sentence, including\ + \ punctuations, with each tuple containing the corresponding POS tag label for a\ + \ word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_fon_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_hau.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4bc034571803b9fee3f6af8db6e567d64f2a2e61 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_hau.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: "You are an expert in tagging words and sentences in Hausa with the right\ + \ POS tag. \n\nPlease provide the POS tags for each word in the Hausa sentence.\ + \ The input is a list of words in the sentence. 
POS tag label set: ['ADJ', 'ADP',\ + \ 'ADV', 'AUX', 'CCONJ, 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT',\ + \ 'SCONJ', 'SYM', 'VERB', 'X']. The output format should be a list of tuples, where\ + \ each tuple consists of a word from the input text and its corresponding POS tag\ + \ label from the POS tag label set provided\nYour response should include only a\ + \ list of tuples, in the order that the words appear in the input sentence, including\ + \ punctuations, with each tuple containing the corresponding POS tag label for a\ + \ word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_hau_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_ibo.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d0f5d357eabfeab7ccd993634be3f2baedfeab84 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_ibo.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: "You are an expert in tagging words and sentences in Igbo with the right\ + \ POS tag. \n\nPlease provide the POS tags for each word in the Igbo sentence. The\ + \ input is a list of words in the sentence. POS tag label set: ['ADJ', 'ADP', 'ADV',\ + \ 'AUX', 'CCONJ, 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT',\ + \ 'SCONJ', 'SYM', 'VERB', 'X']. The output format should be a list of tuples, where\ + \ each tuple consists of a word from the input text and its corresponding POS tag\ + \ label from the POS tag label set provided\nYour response should include only a\ + \ list of tuples, in the order that the words appear in the input sentence, including\ + \ punctuations, with each tuple containing the corresponding POS tag label for a\ + \ word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_ibo_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_kin.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..95fd232a615dffbd964e0225bd01505bbbd2c396 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_kin.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: "You are an expert in tagging words and sentences in Kinyarwanda with\ + \ the right POS tag. \n\nPlease provide the POS tags for each word in the Kinyarwanda\ + \ sentence. The input is a list of words in the sentence. POS tag label set: ['ADJ',\ + \ 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN',\ + \ 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']. The output format should be a list of tuples,\ + \ where each tuple consists of a word from the input text and its corresponding\ + \ POS tag label from the POS tag label set provided\nYour response should include\ + \ only a list of tuples, in the order that the words appear in the input sentence,\ + \ including punctuations, with each tuple containing the corresponding POS tag label\ + \ for a word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_kin_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_lug.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..21b02b10864503d1437208dc0a56f4ad6bb4e9d7 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_lug.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: "You are an expert in tagging words and sentences in Luganda with the\ + \ right POS tag. 
\n\nPlease provide the POS tags for each word in the Luganda sentence.\ + \ The input is a list of words in the sentence. POS tag label set: ['ADJ', 'ADP',\ + \ 'ADV', 'AUX', 'CCONJ, 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT',\ + \ 'SCONJ', 'SYM', 'VERB', 'X']. The output format should be a list of tuples, where\ + \ each tuple consists of a word from the input text and its corresponding POS tag\ + \ label from the POS tag label set provided\nYour response should include only a\ + \ list of tuples, in the order that the words appear in the input sentence, including\ + \ punctuations, with each tuple containing the corresponding POS tag label for a\ + \ word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_lug_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_luo.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_luo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..42ccb34fec23488a68562f06ebe2e05811f4e057 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_luo.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: luo +doc_to_text: "You are an expert in tagging words and sentences in Dholuo with the\ + \ right POS tag. \n\nPlease provide the POS tags for each word in the Dholuo sentence.\ + \ The input is a list of words in the sentence. POS tag label set: ['ADJ', 'ADP',\ + \ 'ADV', 'AUX', 'CCONJ, 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT',\ + \ 'SCONJ', 'SYM', 'VERB', 'X']. The output format should be a list of tuples, where\ + \ each tuple consists of a word from the input text and its corresponding POS tag\ + \ label from the POS tag label set provided\nYour response should include only a\ + \ list of tuples, in the order that the words appear in the input sentence, including\ + \ punctuations, with each tuple containing the corresponding POS tag label for a\ + \ word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_luo_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_mos.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_mos.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cfa74aefef204c134d692d17913371137a696a1b --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_mos.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: mos +doc_to_text: "You are an expert in tagging words and sentences in Mossi with the right\ + \ POS tag. \n\nPlease provide the POS tags for each word in the Mossi sentence.\ + \ The input is a list of words in the sentence. POS tag label set: ['ADJ', 'ADP',\ + \ 'ADV', 'AUX', 'CCONJ, 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT',\ + \ 'SCONJ', 'SYM', 'VERB', 'X']. The output format should be a list of tuples, where\ + \ each tuple consists of a word from the input text and its corresponding POS tag\ + \ label from the POS tag label set provided\nYour response should include only a\ + \ list of tuples, in the order that the words appear in the input sentence, including\ + \ punctuations, with each tuple containing the corresponding POS tag label for a\ + \ word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_mos_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_nya.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_nya.yaml new file mode 100644 index 0000000000000000000000000000000000000000..27de8386357d493a950920afb895edd9eb689adf --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_nya.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: nya +doc_to_text: "You are an expert in tagging words and sentences in Chichewa with the\ + \ right POS tag. \n\nPlease provide the POS tags for each word in the Chichewa sentence.\ + \ The input is a list of words in the sentence. 
POS tag label set: ['ADJ', 'ADP',\ + \ 'ADV', 'AUX', 'CCONJ, 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT',\ + \ 'SCONJ', 'SYM', 'VERB', 'X']. The output format should be a list of tuples, where\ + \ each tuple consists of a word from the input text and its corresponding POS tag\ + \ label from the POS tag label set provided\nYour response should include only a\ + \ list of tuples, in the order that the words appear in the input sentence, including\ + \ punctuations, with each tuple containing the corresponding POS tag label for a\ + \ word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_nya_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_pcm.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_pcm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0c532569d338696c50b8746c4b1ac9ded2b20d22 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_pcm.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: pcm +doc_to_text: "You are an expert in tagging words and sentences in Nigerian Pidgin\ + \ with the right POS tag. \n\nPlease provide the POS tags for each word in the Nigerian\ + \ Pidgin sentence. The input is a list of words in the sentence. POS tag label set:\ + \ ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON',\ + \ 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']. The output format should be a\ + \ list of tuples, where each tuple consists of a word from the input text and its\ + \ corresponding POS tag label from the POS tag label set provided\nYour response\ + \ should include only a list of tuples, in the order that the words appear in the\ + \ input sentence, including punctuations, with each tuple containing the corresponding\ + \ POS tag label for a word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_pcm_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_sna.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c6c6467d81bfd873ed361f1bccab89710ccfd370 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_sna.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: "You are an expert in tagging words and sentences in chiShona with the\ + \ right POS tag. \n\nPlease provide the POS tags for each word in the chiShona sentence.\ + \ The input is a list of words in the sentence. POS tag label set: ['ADJ', 'ADP',\ + \ 'ADV', 'AUX', 'CCONJ, 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT',\ + \ 'SCONJ', 'SYM', 'VERB', 'X']. The output format should be a list of tuples, where\ + \ each tuple consists of a word from the input text and its corresponding POS tag\ + \ label from the POS tag label set provided\nYour response should include only a\ + \ list of tuples, in the order that the words appear in the input sentence, including\ + \ punctuations, with each tuple containing the corresponding POS tag label for a\ + \ word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_sna_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_swa.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b1ca8780834ded2c13edc50203f610c1b8147693 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_swa.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: "You are an expert in tagging words and sentences in Kiswahili with the\ + \ right POS tag. \n\nPlease provide the POS tags for each word in the Kiswahili\ + \ sentence. 
The input is a list of words in the sentence. POS tag label set: ['ADJ',\ + \ 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN',\ + \ 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']. The output format should be a list of tuples,\ + \ where each tuple consists of a word from the input text and its corresponding\ + \ POS tag label from the POS tag label set provided\nYour response should include\ + \ only a list of tuples, in the order that the words appear in the input sentence,\ + \ including punctuations, with each tuple containing the corresponding POS tag label\ + \ for a word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_swa_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_tsn.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_tsn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a69886646284706e2b4cb11bab61a572efa726b5 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_tsn.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: tsn +doc_to_text: "You are an expert in tagging words and sentences in Setswana with the\ + \ right POS tag. \n\nPlease provide the POS tags for each word in the Setswana sentence.\ + \ The input is a list of words in the sentence. POS tag label set: ['ADJ', 'ADP',\ + \ 'ADV', 'AUX', 'CCONJ, 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT',\ + \ 'SCONJ', 'SYM', 'VERB', 'X']. The output format should be a list of tuples, where\ + \ each tuple consists of a word from the input text and its corresponding POS tag\ + \ label from the POS tag label set provided\nYour response should include only a\ + \ list of tuples, in the order that the words appear in the input sentence, including\ + \ punctuations, with each tuple containing the corresponding POS tag label for a\ + \ word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_tsn_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_twi.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..22a6f414cdbd3485cb822a95f8b2a41012174907 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_twi.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: "You are an expert in tagging words and sentences in Twi with the right\ + \ POS tag. \n\nPlease provide the POS tags for each word in the Twi sentence. The\ + \ input is a list of words in the sentence. POS tag label set: ['ADJ', 'ADP', 'ADV',\ + \ 'AUX', 'CCONJ, 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT',\ + \ 'SCONJ', 'SYM', 'VERB', 'X']. The output format should be a list of tuples, where\ + \ each tuple consists of a word from the input text and its corresponding POS tag\ + \ label from the POS tag label set provided\nYour response should include only a\ + \ list of tuples, in the order that the words appear in the input sentence, including\ + \ punctuations, with each tuple containing the corresponding POS tag label for a\ + \ word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_twi_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_wol.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e64fcc3dadaf548ecc4f936122dc9f042094fa6e --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_wol.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: "You are an expert in tagging words and sentences in Wolof with the right\ + \ POS tag. \n\nPlease provide the POS tags for each word in the Wolof sentence.\ + \ The input is a list of words in the sentence. 
POS tag label set: ['ADJ', 'ADP',\ + \ 'ADV', 'AUX', 'CCONJ, 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT',\ + \ 'SCONJ', 'SYM', 'VERB', 'X']. The output format should be a list of tuples, where\ + \ each tuple consists of a word from the input text and its corresponding POS tag\ + \ label from the POS tag label set provided\nYour response should include only a\ + \ list of tuples, in the order that the words appear in the input sentence, including\ + \ punctuations, with each tuple containing the corresponding POS tag label for a\ + \ word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_wol_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_xho.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b0d8d8deda904adfe211b7a1b138742ba90c57a6 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_xho.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: "You are an expert in tagging words and sentences in isiXhosa with the\ + \ right POS tag. \n\nPlease provide the POS tags for each word in the isiXhosa sentence.\ + \ The input is a list of words in the sentence. POS tag label set: ['ADJ', 'ADP',\ + \ 'ADV', 'AUX', 'CCONJ, 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT',\ + \ 'SCONJ', 'SYM', 'VERB', 'X']. The output format should be a list of tuples, where\ + \ each tuple consists of a word from the input text and its corresponding POS tag\ + \ label from the POS tag label set provided\nYour response should include only a\ + \ list of tuples, in the order that the words appear in the input sentence, including\ + \ punctuations, with each tuple containing the corresponding POS tag label for a\ + \ word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_xho_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_yaml new file mode 100644 index 0000000000000000000000000000000000000000..044fffdb895a8c2b05ddd96602dc8879b8579b4f --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_yaml @@ -0,0 +1,32 @@ +tag: +- masakhapos_tasks +- masakhapos_prompt_2 +dataset_path: masakhane/masakhapos +dataset_name: null +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +generation_kwargs: + do_sample: false + until: + - + - <|im_end|> +validation_split: validation +test_split: test +fewshot_split: train +doc_to_target: !function utils.doc_to_target +should_decontaminate: true +doc_to_decontamination_query: "Sentence: {{token}}\nOutput:" +filter_list: + - filter: + - function: regex_pos + name: flexible-extract +metric_list: + - metric: acc + aggregation: !function utils.acc_score + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_yor.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1a9d1b78326ba004acfd95ba7f1c1682f240cb6e --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_yor.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: "You are an expert in tagging words and sentences in Yoruba with the\ + \ right POS tag. \n\nPlease provide the POS tags for each word in the Yoruba sentence.\ + \ The input is a list of words in the sentence. POS tag label set: ['ADJ', 'ADP',\ + \ 'ADV', 'AUX', 'CCONJ, 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT',\ + \ 'SCONJ', 'SYM', 'VERB', 'X']. 
The output format should be a list of tuples, where\ + \ each tuple consists of a word from the input text and its corresponding POS tag\ + \ label from the POS tag label set provided\nYour response should include only a\ + \ list of tuples, in the order that the words appear in the input sentence, including\ + \ punctuations, with each tuple containing the corresponding POS tag label for a\ + \ word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_yor_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_zul.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1aa1ca4c72ad780deb98fcd2a7d76ba4d6221f1f --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_2/masakhapos_zul.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: "You are an expert in tagging words and sentences in isiZulu with the\ + \ right POS tag. \n\nPlease provide the POS tags for each word in the isiZulu sentence.\ + \ The input is a list of words in the sentence. POS tag label set: ['ADJ', 'ADP',\ + \ 'ADV', 'AUX', 'CCONJ, 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT',\ + \ 'SCONJ', 'SYM', 'VERB', 'X']. The output format should be a list of tuples, where\ + \ each tuple consists of a word from the input text and its corresponding POS tag\ + \ label from the POS tag label set provided\nYour response should include only a\ + \ list of tuples, in the order that the words appear in the input sentence, including\ + \ punctuations, with each tuple containing the corresponding POS tag label for a\ + \ word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_zul_prompt_2 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_2/utils.py b/lm_eval/tasks/afrobench/masakhapos/prompt_2/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4ccc66d9cce30c1459494f0d5c21a71d1d3f58d4 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_2/utils.py @@ -0,0 +1,55 @@ +from itertools import chain + +from sklearn.metrics import accuracy_score + +from lm_eval.utils import weighted_f1_score + + +def doc_to_target(doc): + pos_tag_map = { + 0: "NOUN", + 1: "PUNCT", + 2: "ADP", + 3: "NUM", + 4: "SYM", + 5: "SCONJ", + 6: "ADJ", + 7: "PART", + 8: "DET", + 9: "CCONJ", + 10: "PROPN", + 11: "PRON", + 12: "X", + 13: "_", + 14: "ADV", + 15: "INTJ", + 16: "VERB", + 17: "AUX", + } + return [pos_tag_map[tag] for tag in doc["upos"]] + + +def acc_score(items): + unzipped_list = list(zip(*items)) + + golds, preds = unzipped_list[0], unzipped_list[1] + + # Flatten preds' inner lists + flattened_preds = [list(chain.from_iterable(p)) for p in preds] + + # Calculate the accuracy for each gold-pred pair + accuracy_scores = [] + for gold, pred in zip(golds, flattened_preds): + # Ensure both lists are of the same length, otherwise truncate to match + min_length = min(len(gold), len(pred)) + gold = gold[:min_length] + pred = pred[:min_length] + + # Calculate accuracy for the current pair and add to the list + accuracy = accuracy_score(gold, pred) + accuracy_scores.append(accuracy) + + mean_accuracy = ( + sum(accuracy_scores) / len(accuracy_scores) if accuracy_scores else 0 + ) + return mean_accuracy diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_bam.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_bam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..64bf664f58c9c3ebf4a5192c9f84909cfd7e97c1 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_bam.yaml @@ -0,0 +1,14 @@ 
+# Generated by utils.py +dataset_name: bam +doc_to_text: "Acting as a Bambara linguist and without making any corrections or changes\ + \ to the text, perform a part of speech (POS) analysis of the sentences using the\ + \ following POS tag label annotation ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input will be a list of words in the sentence. The output format should\ + \ be a list of tuples, where each tuple consists of a word from the input text and\ + \ its corresponding POS tag label from the POS tag label set provided\nYour response\ + \ should include only a list of tuples, in the order that the words appear in the\ + \ input sentence, including punctuations, with each tuple containing the corresponding\ + \ POS tag label for a word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_bam_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_bbj.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_bbj.yaml new file mode 100644 index 0000000000000000000000000000000000000000..50d00b6dd66e1e7f00205a455c6de3f7cc48bc43 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_bbj.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: bbj +doc_to_text: "Acting as a Ghomala linguist and without making any corrections or changes\ + \ to the text, perform a part of speech (POS) analysis of the sentences using the\ + \ following POS tag label annotation ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input will be a list of words in the sentence. 
The output format should\ + \ be a list of tuples, where each tuple consists of a word from the input text and\ + \ its corresponding POS tag label from the POS tag label set provided\nYour response\ + \ should include only a list of tuples, in the order that the words appear in the\ + \ input sentence, including punctuations, with each tuple containing the corresponding\ + \ POS tag label for a word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_bbj_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_ewe.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c83ad4bad7d7f209c4541c067b8f3254e0869007 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_ewe.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_text: "Acting as a Ewe linguist and without making any corrections or changes\ + \ to the text, perform a part of speech (POS) analysis of the sentences using the\ + \ following POS tag label annotation ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input will be a list of words in the sentence. The output format should\ + \ be a list of tuples, where each tuple consists of a word from the input text and\ + \ its corresponding POS tag label from the POS tag label set provided\nYour response\ + \ should include only a list of tuples, in the order that the words appear in the\ + \ input sentence, including punctuations, with each tuple containing the corresponding\ + \ POS tag label for a word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_ewe_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_fon.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_fon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b12efe16d71a494b3f71178a64347167ee315a3c --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_fon.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: fon +doc_to_text: "Acting as a Fon linguist and without making any corrections or changes\ + \ to the text, perform a part of speech (POS) analysis of the sentences using the\ + \ following POS tag label annotation ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input will be a list of words in the sentence. The output format should\ + \ be a list of tuples, where each tuple consists of a word from the input text and\ + \ its corresponding POS tag label from the POS tag label set provided\nYour response\ + \ should include only a list of tuples, in the order that the words appear in the\ + \ input sentence, including punctuations, with each tuple containing the corresponding\ + \ POS tag label for a word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_fon_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_hau.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..613384cf036ccb0232274c55521c30e27ee039b6 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_hau.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: "Acting as a Hausa linguist and without making any corrections or changes\ + \ to the text, perform a part of speech (POS) analysis of the sentences using the\ + \ following POS tag label annotation ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input will be a list of words in the sentence. The output format should\ + \ be a list of tuples, where each tuple consists of a word from the input text and\ + \ its corresponding POS tag label from the POS tag label set provided\nYour response\ + \ should include only a list of tuples, in the order that the words appear in the\ + \ input sentence, including punctuations, with each tuple containing the corresponding\ + \ POS tag label for a word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_hau_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_ibo.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d7af7e36e150d1f80abdaee1aea1fb5bf5b093b9 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_ibo.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: "Acting as a Igbo linguist and without making any corrections or changes\ + \ to the text, perform a part of speech (POS) analysis of the sentences using the\ + \ following POS tag label annotation ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input will be a list of words in the sentence. The output format should\ + \ be a list of tuples, where each tuple consists of a word from the input text and\ + \ its corresponding POS tag label from the POS tag label set provided\nYour response\ + \ should include only a list of tuples, in the order that the words appear in the\ + \ input sentence, including punctuations, with each tuple containing the corresponding\ + \ POS tag label for a word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_ibo_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_kin.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1255d99f002b1aa19209a89db5aefbff5ea69cc5 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_kin.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: "Acting as a Kinyarwanda linguist and without making any corrections\ + \ or changes to the text, perform a part of speech (POS) analysis of the sentences\ + \ using the following POS tag label annotation ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ,\ + \ 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',\ + \ 'VERB', 'X']. The input will be a list of words in the sentence. The output format\ + \ should be a list of tuples, where each tuple consists of a word from the input\ + \ text and its corresponding POS tag label from the POS tag label set provided\n\ + Your response should include only a list of tuples, in the order that the words\ + \ appear in the input sentence, including punctuations, with each tuple containing\ + \ the corresponding POS tag label for a word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_kin_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_lug.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0eb3fad69db8160e8d43b2803bbe418eda8462b9 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_lug.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: "Acting as a Luganda linguist and without making any corrections or changes\ + \ to the text, perform a part of speech (POS) analysis of the sentences using the\ + \ following POS tag label annotation ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input will be a list of words in the sentence. The output format should\ + \ be a list of tuples, where each tuple consists of a word from the input text and\ + \ its corresponding POS tag label from the POS tag label set provided\nYour response\ + \ should include only a list of tuples, in the order that the words appear in the\ + \ input sentence, including punctuations, with each tuple containing the corresponding\ + \ POS tag label for a word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_lug_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_luo.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_luo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6d9ceb84fa771e19e6232a62bfc3b2c092251b55 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_luo.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: luo +doc_to_text: "Acting as a Dholuo linguist and without making any corrections or changes\ + \ to the text, perform a part of speech (POS) analysis of the sentences using the\ + \ following POS tag label annotation ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input will be a list of words in the sentence. The output format should\ + \ be a list of tuples, where each tuple consists of a word from the input text and\ + \ its corresponding POS tag label from the POS tag label set provided\nYour response\ + \ should include only a list of tuples, in the order that the words appear in the\ + \ input sentence, including punctuations, with each tuple containing the corresponding\ + \ POS tag label for a word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_luo_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_mos.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_mos.yaml new file mode 100644 index 0000000000000000000000000000000000000000..705e4d512e917aa9e532bebf8781f13b89d44017 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_mos.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: mos +doc_to_text: "Acting as a Mossi linguist and without making any corrections or changes\ + \ to the text, perform a part of speech (POS) analysis of the sentences using the\ + \ following POS tag label annotation ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input will be a list of words in the sentence. The output format should\ + \ be a list of tuples, where each tuple consists of a word from the input text and\ + \ its corresponding POS tag label from the POS tag label set provided\nYour response\ + \ should include only a list of tuples, in the order that the words appear in the\ + \ input sentence, including punctuations, with each tuple containing the corresponding\ + \ POS tag label for a word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_mos_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_nya.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_nya.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fecb644d99aa1621c9ac5a6f34bcc87de7f0d377 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_nya.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: nya +doc_to_text: "Acting as a Chichewa linguist and without making any corrections or\ + \ changes to the text, perform a part of speech (POS) analysis of the sentences\ + \ using the following POS tag label annotation ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ,\ + \ 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',\ + \ 'VERB', 'X']. The input will be a list of words in the sentence. The output format\ + \ should be a list of tuples, where each tuple consists of a word from the input\ + \ text and its corresponding POS tag label from the POS tag label set provided\n\ + Your response should include only a list of tuples, in the order that the words\ + \ appear in the input sentence, including punctuations, with each tuple containing\ + \ the corresponding POS tag label for a word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_nya_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_pcm.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_pcm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9cfc76c52afc0407559e5c4141d57d586a814676 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_pcm.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: pcm +doc_to_text: "Acting as a Nigerian Pidgin linguist and without making any corrections\ + \ or changes to the text, perform a part of speech (POS) analysis of the sentences\ + \ using the following POS tag label annotation ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ,\ + \ 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',\ + \ 'VERB', 'X']. The input will be a list of words in the sentence. The output format\ + \ should be a list of tuples, where each tuple consists of a word from the input\ + \ text and its corresponding POS tag label from the POS tag label set provided\n\ + Your response should include only a list of tuples, in the order that the words\ + \ appear in the input sentence, including punctuations, with each tuple containing\ + \ the corresponding POS tag label for a word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_pcm_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_sna.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..947b68fe075c2a24000e0448df429bd12f69f159 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_sna.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: "Acting as a chiShona linguist and without making any corrections or\ + \ changes to the text, perform a part of speech (POS) analysis of the sentences\ + \ using the following POS tag label annotation ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ,\ + \ 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',\ + \ 'VERB', 'X']. The input will be a list of words in the sentence. The output format\ + \ should be a list of tuples, where each tuple consists of a word from the input\ + \ text and its corresponding POS tag label from the POS tag label set provided\n\ + Your response should include only a list of tuples, in the order that the words\ + \ appear in the input sentence, including punctuations, with each tuple containing\ + \ the corresponding POS tag label for a word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_sna_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_swa.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0cc2e6ef31096505c422692b1262d675580de849 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_swa.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: "Acting as a Kiswahili linguist and without making any corrections or\ + \ changes to the text, perform a part of speech (POS) analysis of the sentences\ + \ using the following POS tag label annotation ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ,\ + \ 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',\ + \ 'VERB', 'X']. The input will be a list of words in the sentence. The output format\ + \ should be a list of tuples, where each tuple consists of a word from the input\ + \ text and its corresponding POS tag label from the POS tag label set provided\n\ + Your response should include only a list of tuples, in the order that the words\ + \ appear in the input sentence, including punctuations, with each tuple containing\ + \ the corresponding POS tag label for a word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_swa_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_tsn.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_tsn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a37aa2e611c87e94ca1e4444b7e583244c4598b3 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_tsn.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: tsn +doc_to_text: "Acting as a Setswana linguist and without making any corrections or\ + \ changes to the text, perform a part of speech (POS) analysis of the sentences\ + \ using the following POS tag label annotation ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ,\ + \ 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',\ + \ 'VERB', 'X']. The input will be a list of words in the sentence. The output format\ + \ should be a list of tuples, where each tuple consists of a word from the input\ + \ text and its corresponding POS tag label from the POS tag label set provided\n\ + Your response should include only a list of tuples, in the order that the words\ + \ appear in the input sentence, including punctuations, with each tuple containing\ + \ the corresponding POS tag label for a word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_tsn_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_twi.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..40bf3c1700a025cfe56a1394d3f4c9dfa4f741be --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_twi.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: "Acting as a Twi linguist and without making any corrections or changes\ + \ to the text, perform a part of speech (POS) analysis of the sentences using the\ + \ following POS tag label annotation ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input will be a list of words in the sentence. The output format should\ + \ be a list of tuples, where each tuple consists of a word from the input text and\ + \ its corresponding POS tag label from the POS tag label set provided\nYour response\ + \ should include only a list of tuples, in the order that the words appear in the\ + \ input sentence, including punctuations, with each tuple containing the corresponding\ + \ POS tag label for a word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_twi_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_wol.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..97e98aa71dc4a13e913a13717c9749c218eabb3f --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_wol.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: "Acting as a Wolof linguist and without making any corrections or changes\ + \ to the text, perform a part of speech (POS) analysis of the sentences using the\ + \ following POS tag label annotation ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input will be a list of words in the sentence. The output format should\ + \ be a list of tuples, where each tuple consists of a word from the input text and\ + \ its corresponding POS tag label from the POS tag label set provided\nYour response\ + \ should include only a list of tuples, in the order that the words appear in the\ + \ input sentence, including punctuations, with each tuple containing the corresponding\ + \ POS tag label for a word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_wol_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_xho.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..72dafcfabbc51210bcc1678c27d88a656cd97416 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_xho.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: "Acting as a isiXhosa linguist and without making any corrections or\ + \ changes to the text, perform a part of speech (POS) analysis of the sentences\ + \ using the following POS tag label annotation ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ,\ + \ 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',\ + \ 'VERB', 'X']. The input will be a list of words in the sentence. The output format\ + \ should be a list of tuples, where each tuple consists of a word from the input\ + \ text and its corresponding POS tag label from the POS tag label set provided\n\ + Your response should include only a list of tuples, in the order that the words\ + \ appear in the input sentence, including punctuations, with each tuple containing\ + \ the corresponding POS tag label for a word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_xho_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_yaml new file mode 100644 index 0000000000000000000000000000000000000000..681b621601ed000230f869f1b8dfcd9a3c5db32a --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_yaml @@ -0,0 +1,32 @@ +tag: +- masakhapos_tasks +- masakhapos_prompt_3 +dataset_path: masakhane/masakhapos +dataset_name: null +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +generation_kwargs: + do_sample: false + until: + - + - <|im_end|> +validation_split: validation +test_split: test +fewshot_split: train +doc_to_target: !function utils.doc_to_target +should_decontaminate: true +doc_to_decontamination_query: "Sentence: {{token}}\nOutput:" +filter_list: + - filter: + - function: regex_pos + name: flexible-extract +metric_list: + - metric: acc + aggregation: !function utils.acc_score + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_yor.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c11f48aa60f481bb966bbdb2ddba3da5d4c976f6 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_yor.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: "Acting as a Yoruba linguist and without making any corrections or changes\ + \ to the text, perform a part of speech (POS) analysis of the sentences using the\ + \ following POS tag label annotation ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input will be a list of words in the sentence. 
The output format should\ + \ be a list of tuples, where each tuple consists of a word from the input text and\ + \ its corresponding POS tag label from the POS tag label set provided\nYour response\ + \ should include only a list of tuples, in the order that the words appear in the\ + \ input sentence, including punctuations, with each tuple containing the corresponding\ + \ POS tag label for a word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_yor_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_zul.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d89dcf412e4fb99f1f3d788cbdacdb08fe516806 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_3/masakhapos_zul.yaml @@ -0,0 +1,14 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: "Acting as a isiZulu linguist and without making any corrections or changes\ + \ to the text, perform a part of speech (POS) analysis of the sentences using the\ + \ following POS tag label annotation ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input will be a list of words in the sentence. The output format should\ + \ be a list of tuples, where each tuple consists of a word from the input text and\ + \ its corresponding POS tag label from the POS tag label set provided\nYour response\ + \ should include only a list of tuples, in the order that the words appear in the\ + \ input sentence, including punctuations, with each tuple containing the corresponding\ + \ POS tag label for a word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_zul_prompt_3 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_3/utils.py b/lm_eval/tasks/afrobench/masakhapos/prompt_3/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4ccc66d9cce30c1459494f0d5c21a71d1d3f58d4 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_3/utils.py @@ -0,0 +1,55 @@ +from itertools import chain + +from sklearn.metrics import accuracy_score + +from lm_eval.utils import weighted_f1_score + + +def doc_to_target(doc): + pos_tag_map = { + 0: "NOUN", + 1: "PUNCT", + 2: "ADP", + 3: "NUM", + 4: "SYM", + 5: "SCONJ", + 6: "ADJ", + 7: "PART", + 8: "DET", + 9: "CCONJ", + 10: "PROPN", + 11: "PRON", + 12: "X", + 13: "_", + 14: "ADV", + 15: "INTJ", + 16: "VERB", + 17: "AUX", + } + return [pos_tag_map[tag] for tag in doc["upos"]] + + +def acc_score(items): + unzipped_list = list(zip(*items)) + + golds, preds = unzipped_list[0], unzipped_list[1] + + # Flatten preds' inner lists + flattened_preds = [list(chain.from_iterable(p)) for p in preds] + + # Calculate the accuracy for each gold-pred pair + accuracy_scores = [] + for gold, pred in zip(golds, flattened_preds): + # Ensure both lists are of the same length, otherwise truncate to match + min_length = min(len(gold), len(pred)) + gold = gold[:min_length] + pred = pred[:min_length] + + # Calculate accuracy for the current pair and add to the list + accuracy = accuracy_score(gold, pred) + accuracy_scores.append(accuracy) + + mean_accuracy = ( + sum(accuracy_scores) / len(accuracy_scores) if accuracy_scores else 0 + ) + return mean_accuracy diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_bam.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_bam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..318a15074ff7a2624a388347a9b8304032631632 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_bam.yaml @@ -0,0 +1,13 @@ 
+# Generated by utils.py +dataset_name: bam +doc_to_text: "Annotate each word in the provided sentence with the appropriate POS\ + \ tag. The annotation list is given as: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input sentence will be a list of words in the sentence. The output format\ + \ should be a list of tuples, where each tuple consists of a word from the input\ + \ text and its corresponding POS tag label from the POS tag label set provided\n\ + Your response should include only a list of tuples, in the order that the words\ + \ appear in the input sentence, including punctuations, with each tuple containing\ + \ the corresponding POS tag label for a word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_bam_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_bbj.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_bbj.yaml new file mode 100644 index 0000000000000000000000000000000000000000..24680e2dbfb841086a49469a56b25d32e8efa1ef --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_bbj.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: bbj +doc_to_text: "Annotate each word in the provided sentence with the appropriate POS\ + \ tag. The annotation list is given as: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input sentence will be a list of words in the sentence. The output format\ + \ should be a list of tuples, where each tuple consists of a word from the input\ + \ text and its corresponding POS tag label from the POS tag label set provided\n\ + Your response should include only a list of tuples, in the order that the words\ + \ appear in the input sentence, including punctuations, with each tuple containing\ + \ the corresponding POS tag label for a word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_bbj_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_ewe.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..748232217a473bbf3e977a8d63722c59bbbfc405 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_ewe.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_text: "Annotate each word in the provided sentence with the appropriate POS\ + \ tag. The annotation list is given as: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input sentence will be a list of words in the sentence. The output format\ + \ should be a list of tuples, where each tuple consists of a word from the input\ + \ text and its corresponding POS tag label from the POS tag label set provided\n\ + Your response should include only a list of tuples, in the order that the words\ + \ appear in the input sentence, including punctuations, with each tuple containing\ + \ the corresponding POS tag label for a word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_ewe_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_fon.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_fon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2deca67ef9ffbe8af1afdbb783dc130bff2d8c49 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_fon.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: fon +doc_to_text: "Annotate each word in the provided sentence with the appropriate POS\ + \ tag. The annotation list is given as: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. 
The input sentence will be a list of words in the sentence. The output format\ + \ should be a list of tuples, where each tuple consists of a word from the input\ + \ text and its corresponding POS tag label from the POS tag label set provided\n\ + Your response should include only a list of tuples, in the order that the words\ + \ appear in the input sentence, including punctuations, with each tuple containing\ + \ the corresponding POS tag label for a word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_fon_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_hau.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8a1f5b77a23e3452a8e865234fe49216cc44984e --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_hau.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: "Annotate each word in the provided sentence with the appropriate POS\ + \ tag. The annotation list is given as: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input sentence will be a list of words in the sentence. The output format\ + \ should be a list of tuples, where each tuple consists of a word from the input\ + \ text and its corresponding POS tag label from the POS tag label set provided\n\ + Your response should include only a list of tuples, in the order that the words\ + \ appear in the input sentence, including punctuations, with each tuple containing\ + \ the corresponding POS tag label for a word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_hau_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_ibo.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..789b0897fe29df8f65c6ee73e5620ef247352da7 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_ibo.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: "Annotate each word in the provided sentence with the appropriate POS\ + \ tag. The annotation list is given as: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input sentence will be a list of words in the sentence. The output format\ + \ should be a list of tuples, where each tuple consists of a word from the input\ + \ text and its corresponding POS tag label from the POS tag label set provided\n\ + Your response should include only a list of tuples, in the order that the words\ + \ appear in the input sentence, including punctuations, with each tuple containing\ + \ the corresponding POS tag label for a word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_ibo_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_kin.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1486b4fa916864eac76fa5908399696e783fa108 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_kin.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: "Annotate each word in the provided sentence with the appropriate POS\ + \ tag. The annotation list is given as: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. 
The input sentence will be a list of words in the sentence. The output format\ + \ should be a list of tuples, where each tuple consists of a word from the input\ + \ text and its corresponding POS tag label from the POS tag label set provided\n\ + Your response should include only a list of tuples, in the order that the words\ + \ appear in the input sentence, including punctuations, with each tuple containing\ + \ the corresponding POS tag label for a word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_kin_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_lug.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a80c56029aae6ebc263957323d55c9186c1f503a --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_lug.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: "Annotate each word in the provided sentence with the appropriate POS\ + \ tag. The annotation list is given as: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input sentence will be a list of words in the sentence. The output format\ + \ should be a list of tuples, where each tuple consists of a word from the input\ + \ text and its corresponding POS tag label from the POS tag label set provided\n\ + Your response should include only a list of tuples, in the order that the words\ + \ appear in the input sentence, including punctuations, with each tuple containing\ + \ the corresponding POS tag label for a word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_lug_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_luo.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_luo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3136f885164f970a7ce5cc3da802fb6cbb1f51e8 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_luo.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: luo +doc_to_text: "Annotate each word in the provided sentence with the appropriate POS\ + \ tag. The annotation list is given as: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input sentence will be a list of words in the sentence. The output format\ + \ should be a list of tuples, where each tuple consists of a word from the input\ + \ text and its corresponding POS tag label from the POS tag label set provided\n\ + Your response should include only a list of tuples, in the order that the words\ + \ appear in the input sentence, including punctuations, with each tuple containing\ + \ the corresponding POS tag label for a word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_luo_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_mos.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_mos.yaml new file mode 100644 index 0000000000000000000000000000000000000000..24ae470cacd0ece662ebe5109f3d67d8669741fb --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_mos.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: mos +doc_to_text: "Annotate each word in the provided sentence with the appropriate POS\ + \ tag. The annotation list is given as: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. 
The input sentence will be a list of words in the sentence. The output format\ + \ should be a list of tuples, where each tuple consists of a word from the input\ + \ text and its corresponding POS tag label from the POS tag label set provided\n\ + Your response should include only a list of tuples, in the order that the words\ + \ appear in the input sentence, including punctuations, with each tuple containing\ + \ the corresponding POS tag label for a word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_mos_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_nya.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_nya.yaml new file mode 100644 index 0000000000000000000000000000000000000000..616c003d477322972eb955fc479ed333bf96001b --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_nya.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: nya +doc_to_text: "Annotate each word in the provided sentence with the appropriate POS\ + \ tag. The annotation list is given as: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input sentence will be a list of words in the sentence. The output format\ + \ should be a list of tuples, where each tuple consists of a word from the input\ + \ text and its corresponding POS tag label from the POS tag label set provided\n\ + Your response should include only a list of tuples, in the order that the words\ + \ appear in the input sentence, including punctuations, with each tuple containing\ + \ the corresponding POS tag label for a word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_nya_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_pcm.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_pcm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dcaae1189f0aeb79f965e37e6f59d8f52a7f1416 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_pcm.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: pcm +doc_to_text: "Annotate each word in the provided sentence with the appropriate POS\ + \ tag. The annotation list is given as: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input sentence will be a list of words in the sentence. The output format\ + \ should be a list of tuples, where each tuple consists of a word from the input\ + \ text and its corresponding POS tag label from the POS tag label set provided\n\ + Your response should include only a list of tuples, in the order that the words\ + \ appear in the input sentence, including punctuations, with each tuple containing\ + \ the corresponding POS tag label for a word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_pcm_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_sna.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..07237cee90d2275e0d400695efc13ef077a6fbc3 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_sna.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: "Annotate each word in the provided sentence with the appropriate POS\ + \ tag. The annotation list is given as: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. 
The input sentence will be a list of words in the sentence. The output format\ + \ should be a list of tuples, where each tuple consists of a word from the input\ + \ text and its corresponding POS tag label from the POS tag label set provided\n\ + Your response should include only a list of tuples, in the order that the words\ + \ appear in the input sentence, including punctuations, with each tuple containing\ + \ the corresponding POS tag label for a word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_sna_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_swa.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c937299bf5f7db6bd864be8c718744802f32a834 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_swa.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: "Annotate each word in the provided sentence with the appropriate POS\ + \ tag. The annotation list is given as: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input sentence will be a list of words in the sentence. The output format\ + \ should be a list of tuples, where each tuple consists of a word from the input\ + \ text and its corresponding POS tag label from the POS tag label set provided\n\ + Your response should include only a list of tuples, in the order that the words\ + \ appear in the input sentence, including punctuations, with each tuple containing\ + \ the corresponding POS tag label for a word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_swa_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_tsn.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_tsn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e1bc5ad546a49e699732aab50dde24130a6b9a81 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_tsn.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: tsn +doc_to_text: "Annotate each word in the provided sentence with the appropriate POS\ + \ tag. The annotation list is given as: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input sentence will be a list of words in the sentence. The output format\ + \ should be a list of tuples, where each tuple consists of a word from the input\ + \ text and its corresponding POS tag label from the POS tag label set provided\n\ + Your response should include only a list of tuples, in the order that the words\ + \ appear in the input sentence, including punctuations, with each tuple containing\ + \ the corresponding POS tag label for a word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_tsn_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_twi.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bf3a523b9319a84f14f675ab26f88027d4f40315 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_twi.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: "Annotate each word in the provided sentence with the appropriate POS\ + \ tag. The annotation list is given as: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. 
The input sentence will be a list of words in the sentence. The output format\ + \ should be a list of tuples, where each tuple consists of a word from the input\ + \ text and its corresponding POS tag label from the POS tag label set provided\n\ + Your response should include only a list of tuples, in the order that the words\ + \ appear in the input sentence, including punctuations, with each tuple containing\ + \ the corresponding POS tag label for a word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_twi_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_wol.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d427cee3cdb444f9f5c75c06f27209baca9459fa --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_wol.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: "Annotate each word in the provided sentence with the appropriate POS\ + \ tag. The annotation list is given as: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input sentence will be a list of words in the sentence. The output format\ + \ should be a list of tuples, where each tuple consists of a word from the input\ + \ text and its corresponding POS tag label from the POS tag label set provided\n\ + Your response should include only a list of tuples, in the order that the words\ + \ appear in the input sentence, including punctuations, with each tuple containing\ + \ the corresponding POS tag label for a word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_wol_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_xho.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4b6525f98b3ae37542ce700b83caaa072e3f6f3f --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_xho.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: "Annotate each word in the provided sentence with the appropriate POS\ + \ tag. The annotation list is given as: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input sentence will be a list of words in the sentence. The output format\ + \ should be a list of tuples, where each tuple consists of a word from the input\ + \ text and its corresponding POS tag label from the POS tag label set provided\n\ + Your response should include only a list of tuples, in the order that the words\ + \ appear in the input sentence, including punctuations, with each tuple containing\ + \ the corresponding POS tag label for a word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_xho_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_yaml new file mode 100644 index 0000000000000000000000000000000000000000..ba62938696ba16d383965dbdca203f048b5e0738 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_yaml @@ -0,0 +1,32 @@ +tag: +- masakhapos_tasks +- masakhapos_prompt_4 +dataset_path: masakhane/masakhapos +dataset_name: null +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +generation_kwargs: + do_sample: false + until: + - + - <|im_end|> +validation_split: validation +test_split: test +fewshot_split: train +doc_to_target: !function utils.doc_to_target +should_decontaminate: true +doc_to_decontamination_query: "Sentence: {{token}}\nOutput:" +filter_list: + - filter: + - function: regex_pos + name: flexible-extract +metric_list: + - metric: acc + aggregation: !function utils.acc_score + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_yor.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a7d70f674ad0fdd5ddd5a11ae7df41a4b428b738 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_yor.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: "Annotate each word in the provided sentence with the appropriate POS\ + \ tag. The annotation list is given as: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input sentence will be a list of words in the sentence. 
The output format\ + \ should be a list of tuples, where each tuple consists of a word from the input\ + \ text and its corresponding POS tag label from the POS tag label set provided\n\ + Your response should include only a list of tuples, in the order that the words\ + \ appear in the input sentence, including punctuations, with each tuple containing\ + \ the corresponding POS tag label for a word. \n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_yor_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_zul.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a03cc5d5dc809cab290adbb90d1ef4188d861f7 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_4/masakhapos_zul.yaml @@ -0,0 +1,13 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: "Annotate each word in the provided sentence with the appropriate POS\ + \ tag. The annotation list is given as: ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ, 'DET',\ + \ 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB',\ + \ 'X']. The input sentence will be a list of words in the sentence. The output format\ + \ should be a list of tuples, where each tuple consists of a word from the input\ + \ text and its corresponding POS tag label from the POS tag label set provided\n\ + Your response should include only a list of tuples, in the order that the words\ + \ appear in the input sentence, including punctuations, with each tuple containing\ + \ the corresponding POS tag label for a word. 
\n\nSentence: {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_zul_prompt_4 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_4/utils.py b/lm_eval/tasks/afrobench/masakhapos/prompt_4/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4ccc66d9cce30c1459494f0d5c21a71d1d3f58d4 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_4/utils.py @@ -0,0 +1,55 @@ +from itertools import chain + +from sklearn.metrics import accuracy_score + +from lm_eval.utils import weighted_f1_score + + +def doc_to_target(doc): + pos_tag_map = { + 0: "NOUN", + 1: "PUNCT", + 2: "ADP", + 3: "NUM", + 4: "SYM", + 5: "SCONJ", + 6: "ADJ", + 7: "PART", + 8: "DET", + 9: "CCONJ", + 10: "PROPN", + 11: "PRON", + 12: "X", + 13: "_", + 14: "ADV", + 15: "INTJ", + 16: "VERB", + 17: "AUX", + } + return [pos_tag_map[tag] for tag in doc["upos"]] + + +def acc_score(items): + unzipped_list = list(zip(*items)) + + golds, preds = unzipped_list[0], unzipped_list[1] + + # Flatten preds' inner lists + flattened_preds = [list(chain.from_iterable(p)) for p in preds] + + # Calculate the accuracy for each gold-pred pair + accuracy_scores = [] + for gold, pred in zip(golds, flattened_preds): + # Ensure both lists are of the same length, otherwise truncate to match + min_length = min(len(gold), len(pred)) + gold = gold[:min_length] + pred = pred[:min_length] + + # Calculate accuracy for the current pair and add to the list + accuracy = accuracy_score(gold, pred) + accuracy_scores.append(accuracy) + + mean_accuracy = ( + sum(accuracy_scores) / len(accuracy_scores) if accuracy_scores else 0 + ) + return mean_accuracy diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_bam.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_bam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4cd65c90efa0d394c0e613e62dee4c6d95dce124 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_bam.yaml @@ -0,0 +1,18 @@ 
+# Generated by utils.py +dataset_name: bam +doc_to_text: "Given the following sentence, identify the part of speech (POS) for\ + \ each word. Use the following POS tag set: \nNOUN: Noun (person, place, thing),\ + \ \nVERB: Verb (action, state), \nADJ: Adjective (describes a noun), \nADV: Adverb\ + \ (modifies a verb, adjective, or adverb), \nPRON: Pronoun (replaces a noun), \n\ + DET: Determiner (introduces a noun), \nADP: Adposition (preposition or postposition),\ + \ \nCCONJ: Conjunction (connects words, phrases, clauses)\nPUNCT: Punctuation, \n\ + PROPN: Proper Noun, \nAUX: Auxiliary verb (helper verb), \nSCONJ: Subordinating\ + \ conjunction \nPART: Particle, \nSYM: Symbol, \nINTJ: Interjection, \nNUM: Numeral,\ + \ \nX: others. The output format should be a list of tuples, where each tuple consists\ + \ of a word from the input text and its corresponding POS tag label key only from\ + \ the POS tag set provided\nYour response should include only a list of tuples,\ + \ in the order that the words appear in the input sentence, including punctuations,\ + \ with each tuple containing the corresponding POS tag label for a word. \n\nSentence:\ + \ {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_bam_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_bbj.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_bbj.yaml new file mode 100644 index 0000000000000000000000000000000000000000..969406dcbd1b4863244ba19446bf846eda017e8f --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_bbj.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: bbj +doc_to_text: "Given the following sentence, identify the part of speech (POS) for\ + \ each word. 
Use the following POS tag set: \nNOUN: Noun (person, place, thing),\ + \ \nVERB: Verb (action, state), \nADJ: Adjective (describes a noun), \nADV: Adverb\ + \ (modifies a verb, adjective, or adverb), \nPRON: Pronoun (replaces a noun), \n\ + DET: Determiner (introduces a noun), \nADP: Adposition (preposition or postposition),\ + \ \nCCONJ: Conjunction (connects words, phrases, clauses)\nPUNCT: Punctuation, \n\ + PROPN: Proper Noun, \nAUX: Auxiliary verb (helper verb), \nSCONJ: Subordinating\ + \ conjunction \nPART: Particle, \nSYM: Symbol, \nINTJ: Interjection, \nNUM: Numeral,\ + \ \nX: others. The output format should be a list of tuples, where each tuple consists\ + \ of a word from the input text and its corresponding POS tag label key only from\ + \ the POS tag set provided\nYour response should include only a list of tuples,\ + \ in the order that the words appear in the input sentence, including punctuations,\ + \ with each tuple containing the corresponding POS tag label for a word. \n\nSentence:\ + \ {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_bbj_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_ewe.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aacc83ee0f47aec3f6dd93fadacdc12177a24cfd --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_ewe.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: ewe +doc_to_text: "Given the following sentence, identify the part of speech (POS) for\ + \ each word. 
Use the following POS tag set: \nNOUN: Noun (person, place, thing),\ + \ \nVERB: Verb (action, state), \nADJ: Adjective (describes a noun), \nADV: Adverb\ + \ (modifies a verb, adjective, or adverb), \nPRON: Pronoun (replaces a noun), \n\ + DET: Determiner (introduces a noun), \nADP: Adposition (preposition or postposition),\ + \ \nCCONJ: Conjunction (connects words, phrases, clauses)\nPUNCT: Punctuation, \n\ + PROPN: Proper Noun, \nAUX: Auxiliary verb (helper verb), \nSCONJ: Subordinating\ + \ conjunction \nPART: Particle, \nSYM: Symbol, \nINTJ: Interjection, \nNUM: Numeral,\ + \ \nX: others. The output format should be a list of tuples, where each tuple consists\ + \ of a word from the input text and its corresponding POS tag label key only from\ + \ the POS tag set provided\nYour response should include only a list of tuples,\ + \ in the order that the words appear in the input sentence, including punctuations,\ + \ with each tuple containing the corresponding POS tag label for a word. \n\nSentence:\ + \ {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_ewe_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_fon.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_fon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..642d1d0acd90761c2cbb04d3987dafa43e9ab1f2 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_fon.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: fon +doc_to_text: "Given the following sentence, identify the part of speech (POS) for\ + \ each word. 
Use the following POS tag set: \nNOUN: Noun (person, place, thing),\ + \ \nVERB: Verb (action, state), \nADJ: Adjective (describes a noun), \nADV: Adverb\ + \ (modifies a verb, adjective, or adverb), \nPRON: Pronoun (replaces a noun), \n\ + DET: Determiner (introduces a noun), \nADP: Adposition (preposition or postposition),\ + \ \nCCONJ: Conjunction (connects words, phrases, clauses)\nPUNCT: Punctuation, \n\ + PROPN: Proper Noun, \nAUX: Auxiliary verb (helper verb), \nSCONJ: Subordinating\ + \ conjunction \nPART: Particle, \nSYM: Symbol, \nINTJ: Interjection, \nNUM: Numeral,\ + \ \nX: others. The output format should be a list of tuples, where each tuple consists\ + \ of a word from the input text and its corresponding POS tag label key only from\ + \ the POS tag set provided\nYour response should include only a list of tuples,\ + \ in the order that the words appear in the input sentence, including punctuations,\ + \ with each tuple containing the corresponding POS tag label for a word. \n\nSentence:\ + \ {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_fon_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_hau.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b2c07ce71d205c9a4236fbd2777ef44d624683e5 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_hau.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: "Given the following sentence, identify the part of speech (POS) for\ + \ each word. 
Use the following POS tag set: \nNOUN: Noun (person, place, thing),\ + \ \nVERB: Verb (action, state), \nADJ: Adjective (describes a noun), \nADV: Adverb\ + \ (modifies a verb, adjective, or adverb), \nPRON: Pronoun (replaces a noun), \n\ + DET: Determiner (introduces a noun), \nADP: Adposition (preposition or postposition),\ + \ \nCCONJ: Conjunction (connects words, phrases, clauses)\nPUNCT: Punctuation, \n\ + PROPN: Proper Noun, \nAUX: Auxiliary verb (helper verb), \nSCONJ: Subordinating\ + \ conjunction \nPART: Particle, \nSYM: Symbol, \nINTJ: Interjection, \nNUM: Numeral,\ + \ \nX: others. The output format should be a list of tuples, where each tuple consists\ + \ of a word from the input text and its corresponding POS tag label key only from\ + \ the POS tag set provided\nYour response should include only a list of tuples,\ + \ in the order that the words appear in the input sentence, including punctuations,\ + \ with each tuple containing the corresponding POS tag label for a word. \n\nSentence:\ + \ {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_hau_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_ibo.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bef4b9941243e2f41332bb2410bd42a815e497bb --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_ibo.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: "Given the following sentence, identify the part of speech (POS) for\ + \ each word. 
Use the following POS tag set: \nNOUN: Noun (person, place, thing),\ + \ \nVERB: Verb (action, state), \nADJ: Adjective (describes a noun), \nADV: Adverb\ + \ (modifies a verb, adjective, or adverb), \nPRON: Pronoun (replaces a noun), \n\ + DET: Determiner (introduces a noun), \nADP: Adposition (preposition or postposition),\ + \ \nCCONJ: Conjunction (connects words, phrases, clauses)\nPUNCT: Punctuation, \n\ + PROPN: Proper Noun, \nAUX: Auxiliary verb (helper verb), \nSCONJ: Subordinating\ + \ conjunction \nPART: Particle, \nSYM: Symbol, \nINTJ: Interjection, \nNUM: Numeral,\ + \ \nX: others. The output format should be a list of tuples, where each tuple consists\ + \ of a word from the input text and its corresponding POS tag label key only from\ + \ the POS tag set provided\nYour response should include only a list of tuples,\ + \ in the order that the words appear in the input sentence, including punctuations,\ + \ with each tuple containing the corresponding POS tag label for a word. \n\nSentence:\ + \ {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_ibo_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_kin.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1983540b6a1d4dc21930d883d81fd53e778ca6a0 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_kin.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: kin +doc_to_text: "Given the following sentence, identify the part of speech (POS) for\ + \ each word. 
Use the following POS tag set: \nNOUN: Noun (person, place, thing),\ + \ \nVERB: Verb (action, state), \nADJ: Adjective (describes a noun), \nADV: Adverb\ + \ (modifies a verb, adjective, or adverb), \nPRON: Pronoun (replaces a noun), \n\ + DET: Determiner (introduces a noun), \nADP: Adposition (preposition or postposition),\ + \ \nCCONJ: Conjunction (connects words, phrases, clauses)\nPUNCT: Punctuation, \n\ + PROPN: Proper Noun, \nAUX: Auxiliary verb (helper verb), \nSCONJ: Subordinating\ + \ conjunction \nPART: Particle, \nSYM: Symbol, \nINTJ: Interjection, \nNUM: Numeral,\ + \ \nX: others. The output format should be a list of tuples, where each tuple consists\ + \ of a word from the input text and its corresponding POS tag label key only from\ + \ the POS tag set provided\nYour response should include only a list of tuples,\ + \ in the order that the words appear in the input sentence, including punctuations,\ + \ with each tuple containing the corresponding POS tag label for a word. \n\nSentence:\ + \ {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_kin_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_lug.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..55b9210a54621ee792db781b085a208f8384b0ba --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_lug.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: lug +doc_to_text: "Given the following sentence, identify the part of speech (POS) for\ + \ each word. 
Use the following POS tag set: \nNOUN: Noun (person, place, thing),\ + \ \nVERB: Verb (action, state), \nADJ: Adjective (describes a noun), \nADV: Adverb\ + \ (modifies a verb, adjective, or adverb), \nPRON: Pronoun (replaces a noun), \n\ + DET: Determiner (introduces a noun), \nADP: Adposition (preposition or postposition),\ + \ \nCCONJ: Conjunction (connects words, phrases, clauses)\nPUNCT: Punctuation, \n\ + PROPN: Proper Noun, \nAUX: Auxiliary verb (helper verb), \nSCONJ: Subordinating\ + \ conjunction \nPART: Particle, \nSYM: Symbol, \nINTJ: Interjection, \nNUM: Numeral,\ + \ \nX: others. The output format should be a list of tuples, where each tuple consists\ + \ of a word from the input text and its corresponding POS tag label key only from\ + \ the POS tag set provided\nYour response should include only a list of tuples,\ + \ in the order that the words appear in the input sentence, including punctuations,\ + \ with each tuple containing the corresponding POS tag label for a word. \n\nSentence:\ + \ {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_lug_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_luo.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_luo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5a17e407c3f20cd80bae0b9673455fc242cfa19c --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_luo.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: luo +doc_to_text: "Given the following sentence, identify the part of speech (POS) for\ + \ each word. 
Use the following POS tag set: \nNOUN: Noun (person, place, thing),\ + \ \nVERB: Verb (action, state), \nADJ: Adjective (describes a noun), \nADV: Adverb\ + \ (modifies a verb, adjective, or adverb), \nPRON: Pronoun (replaces a noun), \n\ + DET: Determiner (introduces a noun), \nADP: Adposition (preposition or postposition),\ + \ \nCCONJ: Conjunction (connects words, phrases, clauses)\nPUNCT: Punctuation, \n\ + PROPN: Proper Noun, \nAUX: Auxiliary verb (helper verb), \nSCONJ: Subordinating\ + \ conjunction \nPART: Particle, \nSYM: Symbol, \nINTJ: Interjection, \nNUM: Numeral,\ + \ \nX: others. The output format should be a list of tuples, where each tuple consists\ + \ of a word from the input text and its corresponding POS tag label key only from\ + \ the POS tag set provided\nYour response should include only a list of tuples,\ + \ in the order that the words appear in the input sentence, including punctuations,\ + \ with each tuple containing the corresponding POS tag label for a word. \n\nSentence:\ + \ {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_luo_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_mos.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_mos.yaml new file mode 100644 index 0000000000000000000000000000000000000000..43479749d5848f86898081e8ba751942f44b74e2 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_mos.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: mos +doc_to_text: "Given the following sentence, identify the part of speech (POS) for\ + \ each word. 
Use the following POS tag set: \nNOUN: Noun (person, place, thing),\ + \ \nVERB: Verb (action, state), \nADJ: Adjective (describes a noun), \nADV: Adverb\ + \ (modifies a verb, adjective, or adverb), \nPRON: Pronoun (replaces a noun), \n\ + DET: Determiner (introduces a noun), \nADP: Adposition (preposition or postposition),\ + \ \nCCONJ: Conjunction (connects words, phrases, clauses)\nPUNCT: Punctuation, \n\ + PROPN: Proper Noun, \nAUX: Auxiliary verb (helper verb), \nSCONJ: Subordinating\ + \ conjunction \nPART: Particle, \nSYM: Symbol, \nINTJ: Interjection, \nNUM: Numeral,\ + \ \nX: others. The output format should be a list of tuples, where each tuple consists\ + \ of a word from the input text and its corresponding POS tag label key only from\ + \ the POS tag set provided\nYour response should include only a list of tuples,\ + \ in the order that the words appear in the input sentence, including punctuations,\ + \ with each tuple containing the corresponding POS tag label for a word. \n\nSentence:\ + \ {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_mos_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_nya.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_nya.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7d2d0ec114db2080efcfbc76c1d63511d2a9ae07 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_nya.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: nya +doc_to_text: "Given the following sentence, identify the part of speech (POS) for\ + \ each word. 
Use the following POS tag set: \nNOUN: Noun (person, place, thing),\ + \ \nVERB: Verb (action, state), \nADJ: Adjective (describes a noun), \nADV: Adverb\ + \ (modifies a verb, adjective, or adverb), \nPRON: Pronoun (replaces a noun), \n\ + DET: Determiner (introduces a noun), \nADP: Adposition (preposition or postposition),\ + \ \nCCONJ: Conjunction (connects words, phrases, clauses)\nPUNCT: Punctuation, \n\ + PROPN: Proper Noun, \nAUX: Auxiliary verb (helper verb), \nSCONJ: Subordinating\ + \ conjunction \nPART: Particle, \nSYM: Symbol, \nINTJ: Interjection, \nNUM: Numeral,\ + \ \nX: others. The output format should be a list of tuples, where each tuple consists\ + \ of a word from the input text and its corresponding POS tag label key only from\ + \ the POS tag set provided\nYour response should include only a list of tuples,\ + \ in the order that the words appear in the input sentence, including punctuations,\ + \ with each tuple containing the corresponding POS tag label for a word. \n\nSentence:\ + \ {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_nya_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_pcm.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_pcm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cd5ea9278b841721283b09b5920f8d395674b81f --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_pcm.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: pcm +doc_to_text: "Given the following sentence, identify the part of speech (POS) for\ + \ each word. 
Use the following POS tag set: \nNOUN: Noun (person, place, thing),\ + \ \nVERB: Verb (action, state), \nADJ: Adjective (describes a noun), \nADV: Adverb\ + \ (modifies a verb, adjective, or adverb), \nPRON: Pronoun (replaces a noun), \n\ + DET: Determiner (introduces a noun), \nADP: Adposition (preposition or postposition),\ + \ \nCCONJ: Conjunction (connects words, phrases, clauses)\nPUNCT: Punctuation, \n\ + PROPN: Proper Noun, \nAUX: Auxiliary verb (helper verb), \nSCONJ: Subordinating\ + \ conjunction \nPART: Particle, \nSYM: Symbol, \nINTJ: Interjection, \nNUM: Numeral,\ + \ \nX: others. The output format should be a list of tuples, where each tuple consists\ + \ of a word from the input text and its corresponding POS tag label key only from\ + \ the POS tag set provided\nYour response should include only a list of tuples,\ + \ in the order that the words appear in the input sentence, including punctuations,\ + \ with each tuple containing the corresponding POS tag label for a word. \n\nSentence:\ + \ {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_pcm_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_sna.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3cc21f0cf87014b3eea6e0e6dcddbc38450066fa --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_sna.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: sna +doc_to_text: "Given the following sentence, identify the part of speech (POS) for\ + \ each word. 
Use the following POS tag set: \nNOUN: Noun (person, place, thing),\ + \ \nVERB: Verb (action, state), \nADJ: Adjective (describes a noun), \nADV: Adverb\ + \ (modifies a verb, adjective, or adverb), \nPRON: Pronoun (replaces a noun), \n\ + DET: Determiner (introduces a noun), \nADP: Adposition (preposition or postposition),\ + \ \nCCONJ: Conjunction (connects words, phrases, clauses)\nPUNCT: Punctuation, \n\ + PROPN: Proper Noun, \nAUX: Auxiliary verb (helper verb), \nSCONJ: Subordinating\ + \ conjunction \nPART: Particle, \nSYM: Symbol, \nINTJ: Interjection, \nNUM: Numeral,\ + \ \nX: others. The output format should be a list of tuples, where each tuple consists\ + \ of a word from the input text and its corresponding POS tag label key only from\ + \ the POS tag set provided\nYour response should include only a list of tuples,\ + \ in the order that the words appear in the input sentence, including punctuations,\ + \ with each tuple containing the corresponding POS tag label for a word. \n\nSentence:\ + \ {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_sna_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_swa.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b08dacdef6912bf10bc3136726f28229eeb43d30 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_swa.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: swa +doc_to_text: "Given the following sentence, identify the part of speech (POS) for\ + \ each word. 
Use the following POS tag set: \nNOUN: Noun (person, place, thing),\ + \ \nVERB: Verb (action, state), \nADJ: Adjective (describes a noun), \nADV: Adverb\ + \ (modifies a verb, adjective, or adverb), \nPRON: Pronoun (replaces a noun), \n\ + DET: Determiner (introduces a noun), \nADP: Adposition (preposition or postposition),\ + \ \nCCONJ: Conjunction (connects words, phrases, clauses)\nPUNCT: Punctuation, \n\ + PROPN: Proper Noun, \nAUX: Auxiliary verb (helper verb), \nSCONJ: Subordinating\ + \ conjunction \nPART: Particle, \nSYM: Symbol, \nINTJ: Interjection, \nNUM: Numeral,\ + \ \nX: others. The output format should be a list of tuples, where each tuple consists\ + \ of a word from the input text and its corresponding POS tag label key only from\ + \ the POS tag set provided\nYour response should include only a list of tuples,\ + \ in the order that the words appear in the input sentence, including punctuations,\ + \ with each tuple containing the corresponding POS tag label for a word. \n\nSentence:\ + \ {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_swa_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_tsn.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_tsn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bbc20d6ea0ab4e613dc077eac01227c1d8ca198a --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_tsn.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: tsn +doc_to_text: "Given the following sentence, identify the part of speech (POS) for\ + \ each word. 
Use the following POS tag set: \nNOUN: Noun (person, place, thing),\ + \ \nVERB: Verb (action, state), \nADJ: Adjective (describes a noun), \nADV: Adverb\ + \ (modifies a verb, adjective, or adverb), \nPRON: Pronoun (replaces a noun), \n\ + DET: Determiner (introduces a noun), \nADP: Adposition (preposition or postposition),\ + \ \nCCONJ: Conjunction (connects words, phrases, clauses)\nPUNCT: Punctuation, \n\ + PROPN: Proper Noun, \nAUX: Auxiliary verb (helper verb), \nSCONJ: Subordinating\ + \ conjunction \nPART: Particle, \nSYM: Symbol, \nINTJ: Interjection, \nNUM: Numeral,\ + \ \nX: others. The output format should be a list of tuples, where each tuple consists\ + \ of a word from the input text and its corresponding POS tag label key only from\ + \ the POS tag set provided\nYour response should include only a list of tuples,\ + \ in the order that the words appear in the input sentence, including punctuations,\ + \ with each tuple containing the corresponding POS tag label for a word. \n\nSentence:\ + \ {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_tsn_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_twi.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..11af3b877758759dc4d4eb34fbf8f99421d54f7b --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_twi.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: twi +doc_to_text: "Given the following sentence, identify the part of speech (POS) for\ + \ each word. 
Use the following POS tag set: \nNOUN: Noun (person, place, thing),\ + \ \nVERB: Verb (action, state), \nADJ: Adjective (describes a noun), \nADV: Adverb\ + \ (modifies a verb, adjective, or adverb), \nPRON: Pronoun (replaces a noun), \n\ + DET: Determiner (introduces a noun), \nADP: Adposition (preposition or postposition),\ + \ \nCCONJ: Conjunction (connects words, phrases, clauses)\nPUNCT: Punctuation, \n\ + PROPN: Proper Noun, \nAUX: Auxiliary verb (helper verb), \nSCONJ: Subordinating\ + \ conjunction \nPART: Particle, \nSYM: Symbol, \nINTJ: Interjection, \nNUM: Numeral,\ + \ \nX: others. The output format should be a list of tuples, where each tuple consists\ + \ of a word from the input text and its corresponding POS tag label key only from\ + \ the POS tag set provided\nYour response should include only a list of tuples,\ + \ in the order that the words appear in the input sentence, including punctuations,\ + \ with each tuple containing the corresponding POS tag label for a word. \n\nSentence:\ + \ {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_twi_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_wol.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ca294724bced107ce05490ba52be94f9d73b5f74 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_wol.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: wol +doc_to_text: "Given the following sentence, identify the part of speech (POS) for\ + \ each word. 
Use the following POS tag set: \nNOUN: Noun (person, place, thing),\ + \ \nVERB: Verb (action, state), \nADJ: Adjective (describes a noun), \nADV: Adverb\ + \ (modifies a verb, adjective, or adverb), \nPRON: Pronoun (replaces a noun), \n\ + DET: Determiner (introduces a noun), \nADP: Adposition (preposition or postposition),\ + \ \nCCONJ: Conjunction (connects words, phrases, clauses)\nPUNCT: Punctuation, \n\ + PROPN: Proper Noun, \nAUX: Auxiliary verb (helper verb), \nSCONJ: Subordinating\ + \ conjunction \nPART: Particle, \nSYM: Symbol, \nINTJ: Interjection, \nNUM: Numeral,\ + \ \nX: others. The output format should be a list of tuples, where each tuple consists\ + \ of a word from the input text and its corresponding POS tag label key only from\ + \ the POS tag set provided\nYour response should include only a list of tuples,\ + \ in the order that the words appear in the input sentence, including punctuations,\ + \ with each tuple containing the corresponding POS tag label for a word. \n\nSentence:\ + \ {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_wol_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_xho.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..345354c3c3bf7efdca51ee328c23b29f26dd5daa --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_xho.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: xho +doc_to_text: "Given the following sentence, identify the part of speech (POS) for\ + \ each word. 
Use the following POS tag set: \nNOUN: Noun (person, place, thing),\ + \ \nVERB: Verb (action, state), \nADJ: Adjective (describes a noun), \nADV: Adverb\ + \ (modifies a verb, adjective, or adverb), \nPRON: Pronoun (replaces a noun), \n\ + DET: Determiner (introduces a noun), \nADP: Adposition (preposition or postposition),\ + \ \nCCONJ: Conjunction (connects words, phrases, clauses)\nPUNCT: Punctuation, \n\ + PROPN: Proper Noun, \nAUX: Auxiliary verb (helper verb), \nSCONJ: Subordinating\ + \ conjunction \nPART: Particle, \nSYM: Symbol, \nINTJ: Interjection, \nNUM: Numeral,\ + \ \nX: others. The output format should be a list of tuples, where each tuple consists\ + \ of a word from the input text and its corresponding POS tag label key only from\ + \ the POS tag set provided\nYour response should include only a list of tuples,\ + \ in the order that the words appear in the input sentence, including punctuations,\ + \ with each tuple containing the corresponding POS tag label for a word. 
\n\nSentence:\ + \ {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_xho_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_yaml new file mode 100644 index 0000000000000000000000000000000000000000..df148e8a8ab567d65dc12a36f60a0b3f753b8c86 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_yaml @@ -0,0 +1,32 @@ +tag: +- masakhapos_tasks +- masakhapos_prompt_5 +dataset_path: masakhane/masakhapos +dataset_name: null +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +generation_kwargs: + do_sample: false + until: + - + - <|im_end|> +validation_split: validation +test_split: test +fewshot_split: train +doc_to_target: !function utils.doc_to_target +should_decontaminate: true +doc_to_decontamination_query: "Sentence: {{token}}\nOutput:" +filter_list: + - filter: + - function: regex_pos + name: flexible-extract +metric_list: + - metric: acc + aggregation: !function utils.acc_score + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_yor.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..84bb266af31906017d066df113e7ca999579f744 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_yor.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: "Given the following sentence, identify the part of speech (POS) for\ + \ each word. 
Use the following POS tag set: \nNOUN: Noun (person, place, thing),\ + \ \nVERB: Verb (action, state), \nADJ: Adjective (describes a noun), \nADV: Adverb\ + \ (modifies a verb, adjective, or adverb), \nPRON: Pronoun (replaces a noun), \n\ + DET: Determiner (introduces a noun), \nADP: Adposition (preposition or postposition),\ + \ \nCCONJ: Conjunction (connects words, phrases, clauses)\nPUNCT: Punctuation, \n\ + PROPN: Proper Noun, \nAUX: Auxiliary verb (helper verb), \nSCONJ: Subordinating\ + \ conjunction \nPART: Particle, \nSYM: Symbol, \nINTJ: Interjection, \nNUM: Numeral,\ + \ \nX: others. The output format should be a list of tuples, where each tuple consists\ + \ of a word from the input text and its corresponding POS tag label key only from\ + \ the POS tag set provided\nYour response should include only a list of tuples,\ + \ in the order that the words appear in the input sentence, including punctuations,\ + \ with each tuple containing the corresponding POS tag label for a word. \n\nSentence:\ + \ {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_yor_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_zul.yaml b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4e400bfe74d1505f9335dcd6baf3ff21c949b8b3 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_5/masakhapos_zul.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: zul +doc_to_text: "Given the following sentence, identify the part of speech (POS) for\ + \ each word. 
Use the following POS tag set: \nNOUN: Noun (person, place, thing),\ + \ \nVERB: Verb (action, state), \nADJ: Adjective (describes a noun), \nADV: Adverb\ + \ (modifies a verb, adjective, or adverb), \nPRON: Pronoun (replaces a noun), \n\ + DET: Determiner (introduces a noun), \nADP: Adposition (preposition or postposition),\ + \ \nCCONJ: Conjunction (connects words, phrases, clauses)\nPUNCT: Punctuation, \n\ + PROPN: Proper Noun, \nAUX: Auxiliary verb (helper verb), \nSCONJ: Subordinating\ + \ conjunction \nPART: Particle, \nSYM: Symbol, \nINTJ: Interjection, \nNUM: Numeral,\ + \ \nX: others. The output format should be a list of tuples, where each tuple consists\ + \ of a word from the input text and its corresponding POS tag label key only from\ + \ the POS tag set provided\nYour response should include only a list of tuples,\ + \ in the order that the words appear in the input sentence, including punctuations,\ + \ with each tuple containing the corresponding POS tag label for a word. 
\n\nSentence:\ + \ {{tokens}} \nOutput: " +include: masakhapos_yaml +task: masakhapos_zul_prompt_5 diff --git a/lm_eval/tasks/afrobench/masakhapos/prompt_5/utils.py b/lm_eval/tasks/afrobench/masakhapos/prompt_5/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4ccc66d9cce30c1459494f0d5c21a71d1d3f58d4 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/prompt_5/utils.py @@ -0,0 +1,55 @@ +from itertools import chain + +from sklearn.metrics import accuracy_score + +from lm_eval.utils import weighted_f1_score + + +def doc_to_target(doc): + pos_tag_map = { + 0: "NOUN", + 1: "PUNCT", + 2: "ADP", + 3: "NUM", + 4: "SYM", + 5: "SCONJ", + 6: "ADJ", + 7: "PART", + 8: "DET", + 9: "CCONJ", + 10: "PROPN", + 11: "PRON", + 12: "X", + 13: "_", + 14: "ADV", + 15: "INTJ", + 16: "VERB", + 17: "AUX", + } + return [pos_tag_map[tag] for tag in doc["upos"]] + + +def acc_score(items): + unzipped_list = list(zip(*items)) + + golds, preds = unzipped_list[0], unzipped_list[1] + + # Flatten preds' inner lists + flattened_preds = [list(chain.from_iterable(p)) for p in preds] + + # Calculate the accuracy for each gold-pred pair + accuracy_scores = [] + for gold, pred in zip(golds, flattened_preds): + # Ensure both lists are of the same length, otherwise truncate to match + min_length = min(len(gold), len(pred)) + gold = gold[:min_length] + pred = pred[:min_length] + + # Calculate accuracy for the current pair and add to the list + accuracy = accuracy_score(gold, pred) + accuracy_scores.append(accuracy) + + mean_accuracy = ( + sum(accuracy_scores) / len(accuracy_scores) if accuracy_scores else 0 + ) + return mean_accuracy diff --git a/lm_eval/tasks/afrobench/masakhapos/utils.py b/lm_eval/tasks/afrobench/masakhapos/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d7976f846c42a3b8d347553cacc97779dea15671 --- /dev/null +++ b/lm_eval/tasks/afrobench/masakhapos/utils.py @@ -0,0 +1,40 @@ +from lm_eval.utils import weighted_f1_score + + +def 
doc_to_text(doc): + output = """Please provide the POS tags for each word in the input sentence. The input will be a list of words in + the sentence. The output format should be a list of tuples, where each tuple consists of a word from the input text + and its corresponding POS tag label from the tag label set: ["ADJ", "ADP", "ADV", "AUX", "CCONJ, "DET", "INTJ", + "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT" "SCONJ", "SYM", "VERB", "X"]. \nYour response should include only a + list of tuples, in the order that the words appear in the input sentence, with each tuple containing the + corresponding POS tag label for a word. + + Input: {tokens} + Output: """ + + text = output.format(subject=doc["tokens"]) + return text + + +def doc_to_target(doc): + pos_tag_map = { + 0: "NOUN", + 1: "PUNCT", + 2: "ADP", + 3: "NUM", + 4: "SYM", + 5: "SCONJ", + 6: "ADJ", + 7: "PART", + 8: "DET", + 9: "CCONJ", + 10: "PROPN", + 11: "PRON", + 12: "X", + 13: "_", + 14: "ADV", + 15: "INTJ", + 16: "VERB", + 17: "AUX", + } + return [pos_tag_map[tag] for tag in doc["upos"]] diff --git a/lm_eval/tasks/afrobench/naijarc/README.md b/lm_eval/tasks/afrobench/naijarc/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f6f98178b8ee2a0f60e818a93d520fb67d748bce --- /dev/null +++ b/lm_eval/tasks/afrobench/naijarc/README.md @@ -0,0 +1,25 @@ +# + +## Paper +Title: `NaijaRC: A Multi-choice Reading Comprehension Dataset for Nigerian Languages` + +Paper Link: https://arxiv.org/abs/2308.09768 + +## Abstract +>In this paper, we create NaijaRC: a new multi-choice Reading Comprehension dataset for three native Nigeria languages that is based on high-school reading comprehension examination. We provide baseline results by performing cross-lingual transfer using existing English RACE and Belebele training dataset based on a pre-trained encoder-only model. Additionally, we provide results by prompting large language models (LLMs) like GPT-4. 
+ +HomePage: https://huggingface.co/datasets/aremuadeolajr/NaijaRC + +### Citation + +``` +@misc{aremu2024naijarcmultichoicereadingcomprehension, + title={NaijaRC: A Multi-choice Reading Comprehension Dataset for Nigerian Languages}, + author={Anuoluwapo Aremu and Jesujoba O. Alabi and Daud Abolade and Nkechinyere F. Aguobi and Shamsuddeen Hassan Muhammad and David Ifeoluwa Adelani}, + year={2024}, + eprint={2308.09768}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2308.09768}, +} +``` diff --git a/lm_eval/tasks/afrobench/naijarc/naijarc.yaml b/lm_eval/tasks/afrobench/naijarc/naijarc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4230ed64941418151913be985ebd809060ebe6a8 --- /dev/null +++ b/lm_eval/tasks/afrobench/naijarc/naijarc.yaml @@ -0,0 +1,13 @@ +group: naijarc +task: + - naijarc_prompt_1 + - naijarc_prompt_2 + - naijarc_prompt_3 + - naijarc_prompt_4 + - naijarc_prompt_5 +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 1 diff --git a/lm_eval/tasks/afrobench/naijarc/prompt_1/naijarc b/lm_eval/tasks/afrobench/naijarc/prompt_1/naijarc new file mode 100644 index 0000000000000000000000000000000000000000..b077e3bb5c92cd6aaade7621b93511bf2851ab72 --- /dev/null +++ b/lm_eval/tasks/afrobench/naijarc/prompt_1/naijarc @@ -0,0 +1,24 @@ +tag: + - naijarc_tasks + - naijarc_prompt_1 + - RC_tasks +dataset_path: Davlan/NaijaRC +dataset_name: null +output_type: multiple_choice +test_split: test +fewshot_split: test +fewshot_config: + sampler: first_n +doc_to_target: "{{['A', 'B', 'C', 'D'].index(Answer.strip())}}" +should_decontaminate: true +doc_to_decontamination_query: "{{question}}" +doc_to_choice: ["A", "B", "C", "D"] +metric_list: + - metric: acc + aggregation: mean + weight_by_size: true + - metric: acc_norm + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/naijarc/prompt_1/naijarc_hau.yaml 
b/lm_eval/tasks/afrobench/naijarc/prompt_1/naijarc_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1144a9a2d58eab36de778b1939c6b925e671210d --- /dev/null +++ b/lm_eval/tasks/afrobench/naijarc/prompt_1/naijarc_hau.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: 'P: {{story}} + + Q: {{question.strip()}} + + A: {{options_A}} + + B: {{options_B}} + + C: {{options_C}} + + D: {{options_D}} + + Please choose the correct answer from the options above:' +include: naijarc +task: naijarc_hau_prompt_1 diff --git a/lm_eval/tasks/afrobench/naijarc/prompt_1/naijarc_ibo.yaml b/lm_eval/tasks/afrobench/naijarc/prompt_1/naijarc_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1db685234f5dc17f4cf6ac355a802d4d9329d191 --- /dev/null +++ b/lm_eval/tasks/afrobench/naijarc/prompt_1/naijarc_ibo.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: 'P: {{story}} + + Q: {{question.strip()}} + + A: {{options_A}} + + B: {{options_B}} + + C: {{options_C}} + + D: {{options_D}} + + Please choose the correct answer from the options above:' +include: naijarc +task: naijarc_ibo_prompt_1 diff --git a/lm_eval/tasks/afrobench/naijarc/prompt_1/naijarc_yor.yaml b/lm_eval/tasks/afrobench/naijarc/prompt_1/naijarc_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2bb83fea0ad9cb686266f87d064a1f4902984288 --- /dev/null +++ b/lm_eval/tasks/afrobench/naijarc/prompt_1/naijarc_yor.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: 'P: {{story}} + + Q: {{question.strip()}} + + A: {{options_A}} + + B: {{options_B}} + + C: {{options_C}} + + D: {{options_D}} + + Please choose the correct answer from the options above:' +include: naijarc +task: naijarc_yor_prompt_1 diff --git a/lm_eval/tasks/afrobench/naijarc/prompt_2/naijarc b/lm_eval/tasks/afrobench/naijarc/prompt_2/naijarc new file mode 100644 index 
0000000000000000000000000000000000000000..3a8ec09a94a68295544a7afc613b34f96f4f7082 --- /dev/null +++ b/lm_eval/tasks/afrobench/naijarc/prompt_2/naijarc @@ -0,0 +1,23 @@ +tag: + - naijarc_tasks + - naijarc_prompt_2 + - RC_tasks +dataset_path: Davlan/NaijaRC +dataset_name: null +output_type: multiple_choice +test_split: test +fewshot_config: + sampler: first_n +doc_to_target: "{{['A', 'B', 'C', 'D'].index(Answer.strip())}}" +should_decontaminate: true +doc_to_decontamination_query: "{{question}}" +doc_to_choice: ["A", "B", "C", "D"] +metric_list: + - metric: acc + aggregation: mean + weight_by_size: true + - metric: acc_norm + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/naijarc/prompt_2/naijarc_hau.yaml b/lm_eval/tasks/afrobench/naijarc/prompt_2/naijarc_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a1d94db4025c21351b28a9a538efb77cb18aaadf --- /dev/null +++ b/lm_eval/tasks/afrobench/naijarc/prompt_2/naijarc_hau.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: 'Passage: {{story}} + + Question: {{question.strip()}} + + 1: {{options_A}} + + 2: {{options_B}} + + 3: {{options_C}} + + 4: {{options_D}} + + Please select the correct answer from the given choices:' +include: naijarc +task: naijarc_hau_prompt_2 diff --git a/lm_eval/tasks/afrobench/naijarc/prompt_2/naijarc_ibo.yaml b/lm_eval/tasks/afrobench/naijarc/prompt_2/naijarc_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8384fad18343389dd8a22a1b7d2ae21e1de0e22e --- /dev/null +++ b/lm_eval/tasks/afrobench/naijarc/prompt_2/naijarc_ibo.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: 'Passage: {{story}} + + Question: {{question.strip()}} + + 1: {{options_A}} + + 2: {{options_B}} + + 3: {{options_C}} + + 4: {{options_D}} + + Please select the correct answer from the given choices:' +include: naijarc +task: naijarc_ibo_prompt_2 diff --git 
a/lm_eval/tasks/afrobench/naijarc/prompt_2/naijarc_yor.yaml b/lm_eval/tasks/afrobench/naijarc/prompt_2/naijarc_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..88b1c198185945ce82a619f7b06b5777d27083aa --- /dev/null +++ b/lm_eval/tasks/afrobench/naijarc/prompt_2/naijarc_yor.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: 'Passage: {{story}} + + Question: {{question.strip()}} + + 1: {{options_A}} + + 2: {{options_B}} + + 3: {{options_C}} + + 4: {{options_D}} + + Please select the correct answer from the given choices:' +include: naijarc +task: naijarc_yor_prompt_2 diff --git a/lm_eval/tasks/afrobench/naijarc/prompt_3/naijarc b/lm_eval/tasks/afrobench/naijarc/prompt_3/naijarc new file mode 100644 index 0000000000000000000000000000000000000000..06746a4314ecf5700b09020482eda0698fe2a126 --- /dev/null +++ b/lm_eval/tasks/afrobench/naijarc/prompt_3/naijarc @@ -0,0 +1,23 @@ +tag: + - naijarc_tasks + - naijarc_prompt_3 + - RC_tasks +dataset_path: Davlan/NaijaRC +dataset_name: null +output_type: multiple_choice +test_split: test +fewshot_config: + sampler: first_n +doc_to_target: "{{['A', 'B', 'C', 'D'].index(Answer.strip())}}" +should_decontaminate: true +doc_to_decontamination_query: "{{question}}" +doc_to_choice: ["A", "B", "C", "D"] +metric_list: + - metric: acc + aggregation: mean + weight_by_size: true + - metric: acc_norm + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/naijarc/prompt_3/naijarc_hau.yaml b/lm_eval/tasks/afrobench/naijarc/prompt_3/naijarc_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fb4b443124950e9ba6a7df1896111a68a257e7ed --- /dev/null +++ b/lm_eval/tasks/afrobench/naijarc/prompt_3/naijarc_hau.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: 'Context: {{story}} + + Query: {{question.strip()}} + + Option A: {{options_A}} + + Option B: {{options_B}} + + Option C: {{options_C}} 
+ + Option D: {{options_D}} + + Please indicate the correct option from the list above:' +include: naijarc +task: naijarc_hau_prompt_3 diff --git a/lm_eval/tasks/afrobench/naijarc/prompt_3/naijarc_ibo.yaml b/lm_eval/tasks/afrobench/naijarc/prompt_3/naijarc_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dad37fe953e6056fa58a9dd006d5d79de29002a7 --- /dev/null +++ b/lm_eval/tasks/afrobench/naijarc/prompt_3/naijarc_ibo.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: 'Context: {{story}} + + Query: {{question.strip()}} + + Option A: {{options_A}} + + Option B: {{options_B}} + + Option C: {{options_C}} + + Option D: {{options_D}} + + Please indicate the correct option from the list above:' +include: naijarc +task: naijarc_ibo_prompt_3 diff --git a/lm_eval/tasks/afrobench/naijarc/prompt_3/naijarc_yor.yaml b/lm_eval/tasks/afrobench/naijarc/prompt_3/naijarc_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5ab84a8b5dcaf181c72b1db050f53281eeb26600 --- /dev/null +++ b/lm_eval/tasks/afrobench/naijarc/prompt_3/naijarc_yor.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: 'Context: {{story}} + + Query: {{question.strip()}} + + Option A: {{options_A}} + + Option B: {{options_B}} + + Option C: {{options_C}} + + Option D: {{options_D}} + + Please indicate the correct option from the list above:' +include: naijarc +task: naijarc_yor_prompt_3 diff --git a/lm_eval/tasks/afrobench/naijarc/prompt_4/naijarc b/lm_eval/tasks/afrobench/naijarc/prompt_4/naijarc new file mode 100644 index 0000000000000000000000000000000000000000..27bbc8c90c54954073b905cb3161bab83a83a203 --- /dev/null +++ b/lm_eval/tasks/afrobench/naijarc/prompt_4/naijarc @@ -0,0 +1,23 @@ +tag: + - naijarc_tasks + - naijarc_prompt_4 + - RC_tasks +dataset_path: Davlan/NaijaRC +dataset_name: null +output_type: multiple_choice +test_split: test +fewshot_config: + sampler: first_n +doc_to_target: "{{['A', 'B', 
'C', 'D'].index(Answer.strip())}}" +should_decontaminate: true +doc_to_decontamination_query: "{{question}}" +doc_to_choice: ["A", "B", "C", "D"] +metric_list: + - metric: acc + aggregation: mean + weight_by_size: true + - metric: acc_norm + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/naijarc/prompt_4/naijarc_hau.yaml b/lm_eval/tasks/afrobench/naijarc/prompt_4/naijarc_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4f846a8cf42bcb903dbf957218996db34cccf4ea --- /dev/null +++ b/lm_eval/tasks/afrobench/naijarc/prompt_4/naijarc_hau.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: '{{story}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{options_A}} + + B) {{options_B}} + + C) {{options_C}} + + D) {{options_D}} + + Please provide the correct answer from the choices given:' +include: naijarc +task: naijarc_hau_prompt_4 diff --git a/lm_eval/tasks/afrobench/naijarc/prompt_4/naijarc_ibo.yaml b/lm_eval/tasks/afrobench/naijarc/prompt_4/naijarc_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..926d7a8f1615a83902e98ff65633e0fd19838d8d --- /dev/null +++ b/lm_eval/tasks/afrobench/naijarc/prompt_4/naijarc_ibo.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: '{{story}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{options_A}} + + B) {{options_B}} + + C) {{options_C}} + + D) {{options_D}} + + Please provide the correct answer from the choices given:' +include: naijarc +task: naijarc_ibo_prompt_4 diff --git a/lm_eval/tasks/afrobench/naijarc/prompt_4/naijarc_yor.yaml b/lm_eval/tasks/afrobench/naijarc/prompt_4/naijarc_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..13ad793cbdd9544de9cc50c861ef72c04226f32b --- /dev/null +++ 
b/lm_eval/tasks/afrobench/naijarc/prompt_4/naijarc_yor.yaml @@ -0,0 +1,21 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: '{{story}} + + Based on the above passage, answer the following question: + + {{question.strip()}} + + Choices: + + A) {{options_A}} + + B) {{options_B}} + + C) {{options_C}} + + D) {{options_D}} + + Please provide the correct answer from the choices given:' +include: naijarc +task: naijarc_yor_prompt_4 diff --git a/lm_eval/tasks/afrobench/naijarc/prompt_5/naijarc b/lm_eval/tasks/afrobench/naijarc/prompt_5/naijarc new file mode 100644 index 0000000000000000000000000000000000000000..0aa06d3452b44af6333b30ffd82f5ae610440ec2 --- /dev/null +++ b/lm_eval/tasks/afrobench/naijarc/prompt_5/naijarc @@ -0,0 +1,23 @@ +tag: + - naijarc_tasks + - naijarc_prompt_5 + - RC_tasks +dataset_path: Davlan/NaijaRC +dataset_name: null +output_type: multiple_choice +test_split: test +fewshot_config: + sampler: first_n +doc_to_target: "{{['A', 'B', 'C', 'D'].index(Answer.strip())}}" +should_decontaminate: true +doc_to_decontamination_query: "{{question}}" +doc_to_choice: ["A", "B", "C", "D"] +metric_list: + - metric: acc + aggregation: mean + weight_by_size: true + - metric: acc_norm + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/naijarc/prompt_5/naijarc_hau.yaml b/lm_eval/tasks/afrobench/naijarc/prompt_5/naijarc_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c6ba82f92825183d3c78079d68cd2a44444dde95 --- /dev/null +++ b/lm_eval/tasks/afrobench/naijarc/prompt_5/naijarc_hau.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: hau +doc_to_text: 'Read the passage: {{story}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{options_A}} + + B. {{options_B}} + + C. {{options_C}} + + D. 
{{options_D}} + + Please choose the correct option from the above list:' +include: naijarc +task: naijarc_hau_prompt_5 diff --git a/lm_eval/tasks/afrobench/naijarc/prompt_5/naijarc_ibo.yaml b/lm_eval/tasks/afrobench/naijarc/prompt_5/naijarc_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b527dc1f70c59aef74de17aa82052978658ddf97 --- /dev/null +++ b/lm_eval/tasks/afrobench/naijarc/prompt_5/naijarc_ibo.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: ibo +doc_to_text: 'Read the passage: {{story}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{options_A}} + + B. {{options_B}} + + C. {{options_C}} + + D. {{options_D}} + + Please choose the correct option from the above list:' +include: naijarc +task: naijarc_ibo_prompt_5 diff --git a/lm_eval/tasks/afrobench/naijarc/prompt_5/naijarc_yor.yaml b/lm_eval/tasks/afrobench/naijarc/prompt_5/naijarc_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0959e3277d10fb768565622143eee4e9728fd3c1 --- /dev/null +++ b/lm_eval/tasks/afrobench/naijarc/prompt_5/naijarc_yor.yaml @@ -0,0 +1,19 @@ +# Generated by utils.py +dataset_name: yor +doc_to_text: 'Read the passage: {{story}} + + Then answer the question: {{question.strip()}} + + Options: + + A. {{options_A}} + + B. {{options_B}} + + C. {{options_C}} + + D. 
{{options_D}} + + Please choose the correct option from the above list:' +include: naijarc +task: naijarc_yor_prompt_5 diff --git a/lm_eval/tasks/afrobench/naijarc/utils.py b/lm_eval/tasks/afrobench/naijarc/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ad636a8e882286a7b504e6889c083fb7d8e36ad3 --- /dev/null +++ b/lm_eval/tasks/afrobench/naijarc/utils.py @@ -0,0 +1,93 @@ +import argparse +import os + +import yaml + + +def prompt_func(mode, lang): + prompt_map = { + "prompt_1": "P: {{story}}\nQ: {{question.strip()}}\nA: {{options_A}}\nB: {{options_B}}\nC: {{options_C}}\nD: {{options_D}}\nPlease choose the correct answer from the options above:", + "prompt_2": "Passage: {{story}}\nQuestion: {{question.strip()}}\n1: {{options_A}}\n2: {{options_B}}\n3: {{options_C}}\n4: {{options_D}}\nPlease select the correct answer from the given choices:", + "prompt_3": "Context: {{story}}\nQuery: {{question.strip()}}\nOption A: {{options_A}}\nOption B: {{options_B}}\nOption C: {{options_C}}\nOption D: {{options_D}}\nPlease indicate the correct option from the list above:", + "prompt_4": "{{story}}\nBased on the above passage, answer the following question:\n{{question.strip()}}\nChoices:\nA) {{options_A}}\nB) {{options_B}}\nC) {{options_C}}\nD) {{options_D}}\nPlease provide the correct answer from the choices given:", + "prompt_5": "Read the passage: {{story}}\nThen answer the question: {{question.strip()}}\nOptions:\nA. {{options_A}}\nB. {{options_B}}\nC. {{options_C}}\nD. {{options_D}}\nPlease choose the correct option from the above list:", + } + return prompt_map[mode] + + +def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: + """ + Generate a yaml file for each language. + + :param output_dir: The directory to output the files to. + :param overwrite: Whether to overwrite files if they already exist. 
+ """ + err = [] + languages = { + "hau": "Hausa", + "ibo": "Igbo", + "yor": "Yoruba", + } + + for lang in languages.keys(): + try: + file_name = f"naijarc_{lang}.yaml" + task_name = f"naijarc_{lang}_{mode}" + yaml_template = "naijarc" + yaml_details = { + "include": yaml_template, + "task": task_name, + "dataset_name": lang, + "doc_to_text": prompt_func(mode, languages[lang]), + } + file_path = os.path.join(output_dir, mode) + os.makedirs(file_path, exist_ok=True) + + with open( + f"{output_dir}/{mode}/{file_name}", + "w" if overwrite else "x", + encoding="utf8", + ) as f: + f.write("# Generated by utils.py\n") + yaml.dump( + yaml_details, + f, + allow_unicode=True, + ) + except FileExistsError: + err.append(file_name) + + if len(err) > 0: + raise FileExistsError( + "Files were not created because they already exist (use --overwrite flag):" + f" {', '.join(err)}" + ) + + +def main() -> None: + """Parse CLI args and generate language-specific yaml files.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--overwrite", + default=True, + action="store_true", + help="Overwrite files if they already exist", + ) + parser.add_argument( + "--output-dir", + default="./", + help="Directory to write yaml files to", + ) + parser.add_argument( + "--mode", + default="prompt_1", + choices=["prompt_1", "prompt_2", "prompt_3", "prompt_4", "prompt_5"], + help="Prompt number", + ) + args = parser.parse_args() + + gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite, mode=args.mode) + + +if __name__ == "__main__": + main() diff --git a/lm_eval/tasks/afrobench/nollysenti/README.md b/lm_eval/tasks/afrobench/nollysenti/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fa2413190b57192fe7a4a4250bf9fb41eb5950a3 --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/README.md @@ -0,0 +1,35 @@ +# + +## Paper +Title: `NollySenti: Leveraging Transfer Learning and Machine Translation for Nigerian Movie Sentiment Classification` + +Paper 
Link: https://aclanthology.org/2023.acl-short.85/ + +## Abstract +>Africa has over 2000 indigenous languages but they are under-represented in NLP research due to lack of datasets. In recent years, there have been progress in developing labelled corpora for African languages. However, they are often available in a single domain and may not generalize to other domains. In this paper, we focus on the task of sentiment classification for cross-domain adaptation. We create a new dataset, Nollywood movie reviews for five languages widely spoken in Nigeria (English, Hausa, Igbo, Nigerian Pidgin, and Yoruba). We provide an extensive empirical evaluation using classical machine learning methods and pre-trained language models. By leveraging transfer learning, we compare the performance of cross-domain adaptation from Twitter domain, and cross-lingual adaptation from English language. Our evaluation shows that transfer from English in the same target domain leads to more than 5% improvement in accuracy compared to transfer from Twitter in the same language. To further mitigate the domain difference, we leverage machine translation from English to other Nigerian languages, which leads to a further improvement of 7% over cross-lingual evaluation. While machine translation to low-resource languages are often of low quality, our analysis shows that sentiment related words are often preserved. 
+ +HomePage: https://github.com/IyanuSh/NollySenti + +### Citation + +``` +@inproceedings{shode-etal-2023-nollysenti, + title = "{N}olly{S}enti: Leveraging Transfer Learning and Machine Translation for {N}igerian Movie Sentiment Classification", + author = "Shode, Iyanuoluwa and + Adelani, David Ifeoluwa and + Peng, JIng and + Feldman, Anna", + editor = "Rogers, Anna and + Boyd-Graber, Jordan and + Okazaki, Naoaki", + booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)", + month = jul, + year = "2023", + address = "Toronto, Canada", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2023.acl-short.85/", + doi = "10.18653/v1/2023.acl-short.85", + pages = "986--998", + abstract = "Africa has over 2000 indigenous languages but they are under-represented in NLP research due to lack of datasets. In recent years, there have been progress in developing labelled corpora for African languages. However, they are often available in a single domain and may not generalize to other domains. In this paper, we focus on the task of sentiment classification for cross-domain adaptation. We create a new dataset, Nollywood movie reviews for five languages widely spoken in Nigeria (English, Hausa, Igbo, Nigerian Pidgin, and Yoruba). We provide an extensive empirical evaluation using classical machine learning methods and pre-trained language models. By leveraging transfer learning, we compare the performance of cross-domain adaptation from Twitter domain, and cross-lingual adaptation from English language. Our evaluation shows that transfer from English in the same target domain leads to more than 5{\%} improvement in accuracy compared to transfer from Twitter in the same language. To further mitigate the domain difference, we leverage machine translation from English to other Nigerian languages, which leads to a further improvement of 7{\%} over cross-lingual evaluation. 
While machine translation to low-resource languages are often of low quality, our analysis shows that sentiment related words are often preserved." +} +``` diff --git a/lm_eval/tasks/afrobench/nollysenti/nollysenti.yaml b/lm_eval/tasks/afrobench/nollysenti/nollysenti.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7fb1326258af24566aff25c0478f9cba513fd8b7 --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/nollysenti.yaml @@ -0,0 +1,13 @@ +group: nollysenti +task: + - nollysenti_prompt_1 + - nollysenti_prompt_2 + - nollysenti_prompt_3 + - nollysenti_prompt_4 + - nollysenti_prompt_5 +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 1 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_1/nollysenti b/lm_eval/tasks/afrobench/nollysenti/prompt_1/nollysenti new file mode 100644 index 0000000000000000000000000000000000000000..0476cdc0e8a5f5fc3a886423f5b0052c0918b4c9 --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_1/nollysenti @@ -0,0 +1,38 @@ +tag: + - afrobench_sentiment_tasks + - nollysenti_prompt_1 +dataset_path: Davlan/nollysenti +dataset_kwargs: {trust_remote_code: True} +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_text: 'Does this movie description "{{review}}" have a Positive or Negative sentiment? 
Labels only\n' +doc_to_target: label +doc_to_choice: + - "positive" + - "negative" +should_decontaminate: true +doc_to_decontamination_query: review +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_1/nollysenti_eng.yaml b/lm_eval/tasks/afrobench/nollysenti/prompt_1/nollysenti_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5cf3a85f0dc5b40221d33dedad85f669055f913e --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_1/nollysenti_eng.yaml @@ -0,0 +1,3 @@ +dataset_name: en +include: nollysenti +task: nollysenti_eng_prompt_1 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_1/nollysenti_hau.yaml b/lm_eval/tasks/afrobench/nollysenti/prompt_1/nollysenti_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..157e97dbe5106cdad11dfc3202d08663816f0730 --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_1/nollysenti_hau.yaml @@ -0,0 +1,3 @@ +dataset_name: ha +include: nollysenti +task: nollysenti_hau_prompt_1 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_1/nollysenti_ibo.yaml b/lm_eval/tasks/afrobench/nollysenti/prompt_1/nollysenti_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..77c9bfd45f08c0876cf19b4da09d6d5cbc29e3c4 --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_1/nollysenti_ibo.yaml @@ -0,0 +1,3 @@ +dataset_name: ig +include: nollysenti +task: nollysenti_ibo_prompt_1 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_1/nollysenti_pcm.yaml b/lm_eval/tasks/afrobench/nollysenti/prompt_1/nollysenti_pcm.yaml new file mode 
100644 index 0000000000000000000000000000000000000000..536301688c02f9ca8ef4f576d9874ad624abe8fa --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_1/nollysenti_pcm.yaml @@ -0,0 +1,3 @@ +dataset_name: pcm +include: nollysenti +task: nollysenti_pcm_prompt_1 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_1/nollysenti_yor.yaml b/lm_eval/tasks/afrobench/nollysenti/prompt_1/nollysenti_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6dc1cfabadc7019a92b7d023982641ac60a0b9c2 --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_1/nollysenti_yor.yaml @@ -0,0 +1,3 @@ +dataset_name: yo +include: nollysenti +task: nollysenti_yor_prompt_1 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_1/utils.py b/lm_eval/tasks/afrobench/nollysenti/prompt_1/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_1/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_2/nollysenti b/lm_eval/tasks/afrobench/nollysenti/prompt_2/nollysenti new file mode 100644 index 0000000000000000000000000000000000000000..76f664fee41316e4b8cf10faca4498c1e1c22916 --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_2/nollysenti @@ -0,0 +1,37 @@ +tag: + - afrobench_sentiment_tasks + - nollysenti_prompt_2 +dataset_path: Davlan/nollysenti +dataset_kwargs: {trust_remote_code: True} +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_target: label +doc_to_choice: + - "positive" + - "negative" +should_decontaminate: true +doc_to_decontamination_query: review +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - 
"\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_2/nollysenti_eng.yaml b/lm_eval/tasks/afrobench/nollysenti/prompt_2/nollysenti_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac3bb04d137a207aad2ac307bd2eefc7e5effc2d --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_2/nollysenti_eng.yaml @@ -0,0 +1,4 @@ +dataset_name: en +include: nollysenti +doc_to_text: 'Does this English movie description; "{{review}}" have a Positive or Negative sentiment? Labels only\n' +task: nollysenti_eng_prompt_2 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_2/nollysenti_hau.yaml b/lm_eval/tasks/afrobench/nollysenti/prompt_2/nollysenti_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f87bce673c68bacdcf3e516bb58c116ada8209e4 --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_2/nollysenti_hau.yaml @@ -0,0 +1,4 @@ +dataset_name: ha +include: nollysenti +doc_to_text: 'Does this Hausa movie description; "{{review}}" have a Positive or Negative sentiment? Labels only\n' +task: nollysenti_hau_prompt_2 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_2/nollysenti_ibo.yaml b/lm_eval/tasks/afrobench/nollysenti/prompt_2/nollysenti_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2f7ae185dff1e0108d5d4b6d0bd5fa318c3c182b --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_2/nollysenti_ibo.yaml @@ -0,0 +1,4 @@ +dataset_name: ig +include: nollysenti +doc_to_text: 'Does this Igbo movie description; "{{review}}" have a Positive or Negative sentiment? 
Labels only\n' +task: nollysenti_ibo_prompt_2 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_2/nollysenti_pcm.yaml b/lm_eval/tasks/afrobench/nollysenti/prompt_2/nollysenti_pcm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b0305c7673fc5f2a527f96205a2b6730efff4db3 --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_2/nollysenti_pcm.yaml @@ -0,0 +1,4 @@ +dataset_name: pcm +include: nollysenti +doc_to_text: 'Does this Naija Pidgin movie description; "{{review}}" have a Positive or Negative sentiment? Labels only\n' +task: nollysenti_pcm_prompt_2 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_2/nollysenti_yor.yaml b/lm_eval/tasks/afrobench/nollysenti/prompt_2/nollysenti_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..03c89d8bd05dec45bfc07f5af8c2dc8ed76388ae --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_2/nollysenti_yor.yaml @@ -0,0 +1,4 @@ +dataset_name: yo +include: nollysenti +doc_to_text: 'Does this Yoruba movie description; "{{review}}" have a Positive or Negative sentiment? 
Labels only\n' +task: nollysenti_yor_prompt_2 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_2/utils.py b/lm_eval/tasks/afrobench/nollysenti/prompt_2/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_2/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_3/nollysenti b/lm_eval/tasks/afrobench/nollysenti/prompt_3/nollysenti new file mode 100644 index 0000000000000000000000000000000000000000..472928acdc7b964d60fbd0eb992af298319afcc4 --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_3/nollysenti @@ -0,0 +1,37 @@ +tag: + - afrobench_sentiment_tasks + - nollysenti_prompt_3 +dataset_path: Davlan/nollysenti +dataset_kwargs: {trust_remote_code: True} +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_target: label +doc_to_choice: + - "positive" + - "negative" +should_decontaminate: true +doc_to_decontamination_query: review +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_3/nollysenti_eng.yaml b/lm_eval/tasks/afrobench/nollysenti/prompt_3/nollysenti_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..df21a145c99fb1e7612868276e481724503460bc --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_3/nollysenti_eng.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: en +doc_to_text: "You are an assistant able to detect sentiment in 
movie reviews. \n\nGiven\ + \ the sentiment labels Positive or Negative; what is the sentiment of the\ + \ English statement below? Return only the labels\n\nReview: {{review}}\n" +include: nollysenti +task: nollysenti_eng_prompt_3 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_3/nollysenti_hau.yaml b/lm_eval/tasks/afrobench/nollysenti/prompt_3/nollysenti_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5d15488d6e25022a68dae9874a3b77598fd22dc0 --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_3/nollysenti_hau.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ha +doc_to_text: "You are an assistant able to detect sentiment in movie reviews. \n\nGiven\ + \ the sentiment labels Positive or Negative; what is the sentiment of the\ + \ Hausa statement below? Return only the labels\n\nReview: {{review}}\n" +include: nollysenti +task: nollysenti_hau_prompt_3 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_3/nollysenti_ibo.yaml b/lm_eval/tasks/afrobench/nollysenti/prompt_3/nollysenti_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2f6bb7b29581858b860b5919afbab5e5b22ebc28 --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_3/nollysenti_ibo.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ig +doc_to_text: "You are an assistant able to detect sentiment in movie reviews. \n\nGiven\ + \ the sentiment labels Positive or Negative; what is the sentiment of the\ + \ Igbo statement below? 
Return only the labels\n\nReview: {{review}}\n" +include: nollysenti +task: nollysenti_ibo_prompt_3 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_3/nollysenti_pcm.yaml b/lm_eval/tasks/afrobench/nollysenti/prompt_3/nollysenti_pcm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f98519f3ed329da73ab2272fd33305670d8f2ec1 --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_3/nollysenti_pcm.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: pcm +doc_to_text: "You are an assistant able to detect sentiment in movie reviews. \n\nGiven\ + \ the sentiment labels Positive or Negative; what is the sentiment of the\ + \ Naija Pidgin statement below? Return only the labels\n\nReview: {{review}}\n" +include: nollysenti +task: nollysenti_pcm_prompt_3 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_3/nollysenti_yor.yaml b/lm_eval/tasks/afrobench/nollysenti/prompt_3/nollysenti_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fd64d1eda4fa7048690527046e71c6af21eb0d51 --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_3/nollysenti_yor.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: yo +doc_to_text: "You are an assistant able to detect sentiment in movie reviews. \n\nGiven\ + \ the sentiment labels Positive or Negative; what is the sentiment of the\ + \ Yoruba statement below? 
Return only the labels\n\nReview: {{review}}\n" +include: nollysenti +task: nollysenti_yor_prompt_3 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_3/utils.py b/lm_eval/tasks/afrobench/nollysenti/prompt_3/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_3/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_4/nollysenti b/lm_eval/tasks/afrobench/nollysenti/prompt_4/nollysenti new file mode 100644 index 0000000000000000000000000000000000000000..de1bb486dc1c84ea828d1cb99deb16af6e3f1644 --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_4/nollysenti @@ -0,0 +1,37 @@ +tag: + - afrobench_sentiment_tasks + - nollysenti_prompt_4 +dataset_path: Davlan/nollysenti +dataset_kwargs: {trust_remote_code: True} +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_target: label +doc_to_choice: + - "positive" + - "negative" +should_decontaminate: true +doc_to_decontamination_query: review +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_4/nollysenti_eng.yaml b/lm_eval/tasks/afrobench/nollysenti/prompt_4/nollysenti_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d8e01ab6efb4450b392b7d6278088c7f74114f61 --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_4/nollysenti_eng.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: en +doc_to_text: 
"Label the following text as Positive, or Negative. Provide\ + \ only the label as your response. \n\ntext: {{review}} \nlabel: \n" +include: nollysenti +task: nollysenti_eng_prompt_4 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_4/nollysenti_hau.yaml b/lm_eval/tasks/afrobench/nollysenti/prompt_4/nollysenti_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..abc9570484fbd79acebb9ba2b7be840bb9391c4d --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_4/nollysenti_hau.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ha +doc_to_text: "Label the following text as Positive, or Negative. Provide\ + \ only the label as your response. \n\ntext: {{review}} \nlabel: \n" +include: nollysenti +task: nollysenti_hau_prompt_4 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_4/nollysenti_ibo.yaml b/lm_eval/tasks/afrobench/nollysenti/prompt_4/nollysenti_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8962cf729075203d9c853470791aa15f7eb97023 --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_4/nollysenti_ibo.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ig +doc_to_text: "Label the following text as Positive, or Negative. Provide\ + \ only the label as your response. \n\ntext: {{review}} \nlabel: \n" +include: nollysenti +task: nollysenti_ibo_prompt_4 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_4/nollysenti_pcm.yaml b/lm_eval/tasks/afrobench/nollysenti/prompt_4/nollysenti_pcm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..36d43b795461972411b56413b1bc11386cc34d78 --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_4/nollysenti_pcm.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: pcm +doc_to_text: "Label the following text as Positive, or Negative. Provide\ + \ only the label as your response. 
\n\ntext: {{review}} \nlabel: \n" +include: nollysenti +task: nollysenti_pcm_prompt_4 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_4/nollysenti_yor.yaml b/lm_eval/tasks/afrobench/nollysenti/prompt_4/nollysenti_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2c100c4dd367e2d610e8881d0d7d932c3473f38c --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_4/nollysenti_yor.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: yo +doc_to_text: "Label the following text as Positive, or Negative. Provide\ + \ only the label as your response. \n\ntext: {{review}} \nlabel: \n" +include: nollysenti +task: nollysenti_yor_prompt_4 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_4/utils.py b/lm_eval/tasks/afrobench/nollysenti/prompt_4/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_4/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_5/nollysenti b/lm_eval/tasks/afrobench/nollysenti/prompt_5/nollysenti new file mode 100644 index 0000000000000000000000000000000000000000..2e25f2f088edcb81f754f3b7fd7f9a5e92e18b12 --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_5/nollysenti @@ -0,0 +1,37 @@ +tag: + - afrobench_sentiment_tasks + - nollysenti_prompt_5 +dataset_path: Davlan/nollysenti +dataset_kwargs: {trust_remote_code: True} +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_target: label +doc_to_choice: + - "positive" + - "negative" +should_decontaminate: true +doc_to_decontamination_query: review +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - 
metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_5/nollysenti_eng.yaml b/lm_eval/tasks/afrobench/nollysenti/prompt_5/nollysenti_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d485ffe154c61f91924a5c0015e5defeb8ea83a2 --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_5/nollysenti_eng.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: en +doc_to_text: "You are tasked with performing sentiment classification on the following\ + \ English text. For each input, classify the sentiment as positive, negative.\ + \ Use the following guidelines: \n\n Positive: The text expresses happiness,\ + \ satisfaction, or optimism. \nNegative: The text conveys disappointment, dissatisfaction,\ + \ or pessimism. \n\nIf the text contains both positive and negative sentiments, choose\ + \ the dominant sentiment. For ambiguous or unclear sentiments, select the label\ + \ that best reflects the overall tone. Please provide a single classification for\ + \ each input.\n\ntext: {{review}} \nlabel: \n" +include: nollysenti +task: nollysenti_eng_prompt_5 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_5/nollysenti_hau.yaml b/lm_eval/tasks/afrobench/nollysenti/prompt_5/nollysenti_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7ed16af77a33c39aa1569a38047ef92091837152 --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_5/nollysenti_hau.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: ha +doc_to_text: "You are tasked with performing sentiment classification on the following\ + \ Hausa text. For each input, classify the sentiment as positive, negative.\ + \ Use the following guidelines: \n\n Positive: The text expresses happiness,\ + \ satisfaction, or optimism. 
\nNegative: The text conveys disappointment, dissatisfaction,\ + \ or pessimism. \n\nIf the text contains both positive and negative sentiments, choose\ + \ the dominant sentiment. For ambiguous or unclear sentiments, select the label\ + \ that best reflects the overall tone. Please provide a single classification for\ + \ each input.\n\ntext: {{review}} \nlabel: \n" +include: nollysenti +task: nollysenti_hau_prompt_5 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_5/nollysenti_ibo.yaml b/lm_eval/tasks/afrobench/nollysenti/prompt_5/nollysenti_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c75f26900298951c5934b17964ca0cd744d86726 --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_5/nollysenti_ibo.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: ig +doc_to_text: "You are tasked with performing sentiment classification on the following\ + \ Igbo text. For each input, classify the sentiment as positive, negative.\ + \ Use the following guidelines: \n\n Positive: The text expresses happiness,\ + \ satisfaction, or optimism. \nNegative: The text conveys disappointment, dissatisfaction,\ + \ or pessimism. \n\nIf the text contains both positive and negative sentiments, choose\ + \ the dominant sentiment. For ambiguous or unclear sentiments, select the label\ + \ that best reflects the overall tone. Please provide a single classification for\ + \ each input. 
\n\ntext: {{review}} \nlabel: \n" +include: nollysenti +task: nollysenti_ibo_prompt_5 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_5/nollysenti_pcm.yaml b/lm_eval/tasks/afrobench/nollysenti/prompt_5/nollysenti_pcm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..29b5cda0b66b083a2cbcdf8d6750d447e7890519 --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_5/nollysenti_pcm.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: pcm +doc_to_text: "You are tasked with performing sentiment classification on the following\ + \ Naija Pidgin text. For each input, classify the sentiment as positive, negative.\ + \ Use the following guidelines: \n\n Positive: The text expresses happiness,\ + \ satisfaction, or optimism. \nNegative: The text conveys disappointment, dissatisfaction,\ + \ or pessimism. \n\nIf the text contains both positive and negative sentiments, choose\ + \ the dominant sentiment. For ambiguous or unclear sentiments, select the label\ + \ that best reflects the overall tone. Please provide a single classification for\ + \ each input. \n\ntext: {{review}} \nlabel: \n" +include: nollysenti +task: nollysenti_pcm_prompt_5 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_5/nollysenti_yor.yaml b/lm_eval/tasks/afrobench/nollysenti/prompt_5/nollysenti_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f1aea0284e191356e15db16036a4d1abfbc1c5aa --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_5/nollysenti_yor.yaml @@ -0,0 +1,12 @@ +# Generated by utils.py +dataset_name: yo +doc_to_text: "You are tasked with performing sentiment classification on the following\ + \ Yoruba text. For each input, classify the sentiment as positive, negative.\ + \ Use the following guidelines: \n\n Positive: The text expresses happiness,\ + \ satisfaction, or optimism. \nNegative: The text conveys disappointment, dissatisfaction,\ + \ or pessimism. 
\n\nIf the text contains both positive and negative sentiments, choose\ + \ the dominant sentiment. For ambiguous or unclear sentiments, select the label\ + \ that best reflects the overall tone. Please provide a single classification for\ + \ each input. \n\ntext: {{review}} \nlabel: \n" +include: nollysenti +task: nollysenti_yor_prompt_5 diff --git a/lm_eval/tasks/afrobench/nollysenti/prompt_5/utils.py b/lm_eval/tasks/afrobench/nollysenti/prompt_5/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/nollysenti/prompt_5/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/ntrex/README.md b/lm_eval/tasks/afrobench/ntrex/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d68cf8c99cb4d7cb8c68eb7d015e6cb26daca3cb --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/README.md @@ -0,0 +1,38 @@ +# + +## Paper +Title: `NTREX-128 – News Test References for MT Evaluation of 128 Languages` + +Paper Link: https://aclanthology.org/2022.sumeval-1.4/ + +## Abstract +>We release NTREX-128, a data set for machine translation (MT) evaluation from English into a total of 128 target languages. The paper describes the data creation process and proposes a quality filtering method based on human evaluation. We show experimental results which confirm that the directionality of test sets translation indeed plays an important role wrt. the usefulness of the corresponding metrics’ scores. Thus, we recommend that the NTREX-128 data set should be used for evaluation of Englishsourced translation models but not in reverse direction. The test set release introduces another benchmark for the evaluation of massively multilingual machine translation research. 
+ +HomePage: https://github.com/MicrosoftTranslator/NTREX + +### Citation + +``` +@inproceedings{federmann-etal-2022-ntrex, + title = "{NTREX}-128 {--} News Test References for {MT} Evaluation of 128 Languages", + author = "Federmann, Christian and + Kocmi, Tom and + Xin, Ying", + editor = "Ahuja, Kabir and + Anastasopoulos, Antonios and + Patra, Barun and + Neubig, Graham and + Choudhury, Monojit and + Dandapat, Sandipan and + Sitaram, Sunayana and + Chaudhary, Vishrav", + booktitle = "Proceedings of the First Workshop on Scaling Up Multilingual Evaluation", + month = nov, + year = "2022", + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.sumeval-1.4/", + doi = "10.18653/v1/2022.sumeval-1.4", + pages = "21--24" +} +``` diff --git a/lm_eval/tasks/afrobench/ntrex/gen_utils.py b/lm_eval/tasks/afrobench/ntrex/gen_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ba549de25b69b0892f6e80c923c44f7ca001cd79 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/gen_utils.py @@ -0,0 +1,171 @@ +import argparse +import os + +import yaml + + +class FunctionTag: + def __init__(self, value): + self.value = value + + +def prompt_func(mode, lang, lang_dict): + language_column_name = f"sentence_{lang}" + prompt_map = { + "prompt_1": f"{lang_dict[lang]}: {{{{{language_column_name}}}}} \nEnglish: ", + "prompt_1_reverse": f"English: {{{{sentence_eng_Latn}}}} \n{lang_dict[lang]}: ", + "prompt_2": f"You are a translation expert. Translate the following {lang_dict[lang]} sentences to English \n" + f"{lang_dict[lang]}: {{{{{language_column_name}}}}}\nEnglish: ", + "prompt_2_reverse": f"You are a translation expert. 
Translate the following English sentences to " + f"{lang_dict[lang]} " + "\nEnglish: {{sentence_eng_Latn}} " + f"\n{lang_dict[lang]}: ", + "prompt_3": f"As a {lang_dict[lang]} and English linguist, translate the following {lang_dict[lang]} sentences " + f"to English \n{lang_dict[lang]}: {{{{{language_column_name}}}}}\nEnglish: ", + "prompt_3_reverse": f"As a {lang_dict[lang]} and English linguist, translate the following English sentences to " + f"{lang_dict[lang]} " + "\nEnglish: {{sentence_eng_Latn}} " + f"\n{lang_dict[lang]}: ", + } + return prompt_map[mode] + + +def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str, reverse: bool) -> None: + """ + Generate a yaml file for each language. + + :param output_dir: The directory to output the files to. + :param overwrite: Whether to overwrite files if they already exist. + """ + err = [] + languages = { + "afr_Latn": "Afrikaans", + "amh_Ethi": "Amharic", + "arb_Arab": "Arabic", + "bem_Latn": "Bemba", + "ewe_Latn": "Ewe", + "fra_Latn": "French", + "hau_Latn": "Hausa", + "ibo_Latn": "Igbo", + "kin_Latn": "Kinyarwanda", + "mey_Arab": "Hassaniya Arabic", + "mlg_Latn": "Malagasy", + "msa_Latn": "Malay", + "nde_Latn": "North Ndebele", + "nso_Latn": "Northern Sotho", + "nya_Latn": "Chichewa", + "orm_Ethi": "Oromo", + "shi_Arab": "Tachelhit", + "sna_Latn": "Shona (Latin)", + "som_Latn": "Somali", + "ssw_Latn": "Swati", + "swa_Latn": "Swahili", + "tam_Taml": "Tamil", + "tel_Telu": "Telugu", + "tir_Ethi": "Tigrinya", + "ton_Latn": "Tongan", + "tsn_Latn": "Tswana", + "urd_Arab": "Urdu", + "ven_Latn": "Venda", + "wol_Latn": "Wolof", + "xho_Latn": "Xhosa", + "yor_Latn": "Yoruba", + "zul_Latn": "Zulu", + } + + for lang in languages.keys(): + try: + if not reverse: + file_name = f"ntrex_{lang}-eng_Latn.yaml" + task_name = f"ntrex_{lang}-eng_Latn_{mode}" + yaml_template = "ntrex" + yaml_details = { + "include": yaml_template, + "dataset_name": f"{lang}", + "task": task_name, + "doc_to_target": "sentence_eng_Latn", + 
"doc_to_text": prompt_func(mode, lang, languages), + } + os.makedirs(f"{output_dir}/{mode}/african-english", exist_ok=True) + with open( + f"{output_dir}/{mode}/african-english/{file_name}", + "w" if overwrite else "x", + encoding="utf8", + ) as f: + f.write("# Generated by utils.py\n") + yaml.dump( + yaml_details, + f, + allow_unicode=True, + ) + else: + file_name = f"ntrex_eng_Latn-{lang}.yaml" + task_name = f"ntrex_eng_Latn-{lang}_{mode}" + yaml_template = "ntrex" + yaml_details = { + "include": yaml_template, + "dataset_name": f"{lang}", + "task": task_name, + "doc_to_target": f"sentence_{lang}", + "doc_to_text": prompt_func(f"{mode}_reverse", lang, languages), + } + os.makedirs(f"{output_dir}/{mode}/english-african", exist_ok=True) + with open( + f"{output_dir}/{mode}/english-african/{file_name}", + "w" if overwrite else "x", + encoding="utf8", + ) as f: + f.write("# Generated by utils.py\n") + yaml.dump( + yaml_details, + f, + allow_unicode=True, + ) + except FileExistsError: + err.append(file_name) + + if len(err) > 0: + raise FileExistsError( + "Files were not created because they already exist (use --overwrite flag):" + f" {', '.join(err)}" + ) + + +def main() -> None: + """Parse CLI args and generate language-specific yaml files.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--overwrite", + default=True, + action="store_true", + help="Overwrite files if they already exist", + ) + parser.add_argument( + "--output-dir", + default="./", + help="Directory to write yaml files to", + ) + parser.add_argument( + "--mode", + default="prompt_1", + choices=["prompt_1", "prompt_2", "prompt_3"], + help="Prompt number", + ) + parser.add_argument( + "--reverse", + default=False, + choices=[True, False], + help="Reverse the translation direction", + ) + args = parser.parse_args() + + gen_lang_yamls( + output_dir=args.output_dir, + overwrite=args.overwrite, + mode=args.mode, + reverse=args.reverse, + ) + + +if __name__ == "__main__": + main() diff 
--git a/lm_eval/tasks/afrobench/ntrex/ntrex.yaml b/lm_eval/tasks/afrobench/ntrex/ntrex.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c30b08cea2ffdbf775cfeeb8957c47e9e807518a --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/ntrex.yaml @@ -0,0 +1,14 @@ +group: african_ntrex +task: + - ntrex_eng-afr_prompt_1 + - ntrex_eng-afr_prompt_2 + - ntrex_eng-afr_prompt_3 + - ntrex_afr-eng_prompt_1 + - ntrex_afr-eng_prompt_2 + - ntrex_afr-eng_prompt_3 +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex new file mode 100644 index 0000000000000000000000000000000000000000..3c2659d752c9f14412d23f3c1e553fbb03a16b03 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex @@ -0,0 +1,26 @@ +tag: +- ntrex_tasks +- ntrex_afr-eng +- ntrex_afr-eng_prompt_1 +- afrobench_MT_tasks +dataset_path: masakhane/ntrex_african +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +validation_split: test +fewshot_split: test +test_split: test +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +repeats: 1 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_afr_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_afr_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eb11904366801d649186548e124027489497a4cb --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_afr_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: afr_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Afrikaans: {{sentence_afr_Latn}} \nEnglish: " 
+include: ntrex +task: ntrex_afr_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_amh_Ethi-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_amh_Ethi-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0114a212b89bee62243b3adedad49066998d1785 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_amh_Ethi-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: amh_Ethi +doc_to_target: sentence_eng_Latn +doc_to_text: "Amharic: {{sentence_amh_Ethi}} \nEnglish: " +include: ntrex +task: ntrex_amh_Ethi-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_arb_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_arb_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4ddc8c4bbd403a3b83c15172d119ae183247c522 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_arb_Arab-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: arb_Arab +doc_to_target: sentence_eng_Latn +doc_to_text: "Arabic: {{sentence_arb_Arab}} \nEnglish: " +include: ntrex +task: ntrex_arb_Arab-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_bem_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_bem_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c33ab35a18175300ffbf938b2431652ecf86017e --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_bem_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: bem_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Bemba: {{sentence_bem_Latn}} \nEnglish: " +include: ntrex +task: ntrex_bem_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_ewe_Latn-eng_Latn.yaml 
b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_ewe_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c5f69c0051ac2292ef1282ac6c8844ee61bc5148 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_ewe_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ewe_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Ewe: {{sentence_ewe_Latn}} \nEnglish: " +include: ntrex +task: ntrex_ewe_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_fra_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_fra_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa3fad61684684f7155bf40704397cff7d5bcbc8 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_fra_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: fra_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "French: {{sentence_fra_Latn}} \nEnglish: " +include: ntrex +task: ntrex_fra_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_hau_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_hau_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8b6d0f28b84d4c89d96f3db9de8478201265fade --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_hau_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: hau_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Hausa: {{sentence_hau_Latn}} \nEnglish: " +include: ntrex +task: ntrex_hau_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_ibo_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_ibo_Latn-eng_Latn.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..992598614c1d9fb0929ca024260a31b953a1204e --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_ibo_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ibo_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Igbo: {{sentence_ibo_Latn}} \nEnglish: " +include: ntrex +task: ntrex_ibo_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_kin_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_kin_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eee96a62b961371c1fd1f069e97cd94ebef5b4d5 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_kin_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: kin_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Kinyarwanda: {{sentence_kin_Latn}} \nEnglish: " +include: ntrex +task: ntrex_kin_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_mey_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_mey_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6650e644ad9b84df3c93bb6622543f8984bc4f8b --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_mey_Arab-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: mey_Arab +doc_to_target: sentence_eng_Latn +doc_to_text: "Hassaniya Arabic: {{sentence_mey_Arab}} \nEnglish: " +include: ntrex +task: ntrex_mey_Arab-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_mlg_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_mlg_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..375522c5c8560747a2775ec380b4964296dec7e3 --- /dev/null +++ 
b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_mlg_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: mlg_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Malagasy: {{sentence_mlg_Latn}} \nEnglish: " +include: ntrex +task: ntrex_mlg_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_msa_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_msa_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..65aaaa8014abf84963112a1b7f0239f4129c20bb --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_msa_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: msa_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Malay: {{sentence_msa_Latn}} \nEnglish: " +include: ntrex +task: ntrex_msa_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_nde_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_nde_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d63548fb439470b4d46fb7225fa521f31becc77f --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_nde_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: nde_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "North Ndebele: {{sentence_nde_Latn}} \nEnglish: " +include: ntrex +task: ntrex_nde_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_nso_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_nso_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4cf1cccf8a2562b0c958457561c7c4c9a5ae6776 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_nso_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: nso_Latn 
+doc_to_target: sentence_eng_Latn +doc_to_text: "Northern Sotho: {{sentence_nso_Latn}} \nEnglish: " +include: ntrex +task: ntrex_nso_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_nya_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_nya_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ee4ac6d73f198367a96c684921e6e65e9a0adea7 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_nya_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: nya_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Chichewa: {{sentence_nya_Latn}} \nEnglish: " +include: ntrex +task: ntrex_nya_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_orm_Ethi-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_orm_Ethi-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..446873065b536f58bfa12e5886f49edd1b7ea5ee --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_orm_Ethi-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: orm_Ethi +doc_to_target: sentence_eng_Latn +doc_to_text: "Oromo: {{sentence_orm_Ethi}} \nEnglish: " +include: ntrex +task: ntrex_orm_Ethi-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_shi_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_shi_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..10972893f3f453f91d12845d9fea3e43558c1fc4 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_shi_Arab-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: shi_Arab +doc_to_target: sentence_eng_Latn +doc_to_text: "Tachelhit: {{sentence_shi_Arab}} \nEnglish: " +include: ntrex +task: ntrex_shi_Arab-eng_Latn_prompt_1 diff 
--git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_sna_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_sna_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..63d83528835e8ae79f82d09007a4494ccaf1229c --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_sna_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: sna_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Shona (Latin): {{sentence_sna_Latn}} \nEnglish: " +include: ntrex +task: ntrex_sna_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_som_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_som_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d6eb91e0310fcebd6483a3d43aca793e3a6934b9 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_som_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: som_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Somali: {{sentence_som_Latn}} \nEnglish: " +include: ntrex +task: ntrex_som_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_ssw_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_ssw_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..48c5c10973911aa3b779071ffa96513e1e1f7a7a --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_ssw_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ssw_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Swati: {{sentence_ssw_Latn}} \nEnglish: " +include: ntrex +task: ntrex_ssw_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_swa_Latn-eng_Latn.yaml 
b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_swa_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..863222f7325fab67ff5afe3a13bef0cc0f4df035 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_swa_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: swa_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Swahili: {{sentence_swa_Latn}} \nEnglish: " +include: ntrex +task: ntrex_swa_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_tam_Taml-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_tam_Taml-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..993b480f20e34eab5f1c4cdfb644e09e0e978264 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_tam_Taml-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: tam_Taml +doc_to_target: sentence_eng_Latn +doc_to_text: "Tamil: {{sentence_tam_Taml}} \nEnglish: " +include: ntrex +task: ntrex_tam_Taml-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_tel_Telu-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_tel_Telu-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d91e9a1f762a013ed992d04a2c9e9f0049d8f7eb --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_tel_Telu-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: tel_Telu +doc_to_target: sentence_eng_Latn +doc_to_text: "Telugu: {{sentence_tel_Telu}} \nEnglish: " +include: ntrex +task: ntrex_tel_Telu-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_tir_Ethi-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_tir_Ethi-eng_Latn.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..f23f332c1ca392e44c62638d8e39a79f8839b54d --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_tir_Ethi-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: tir_Ethi +doc_to_target: sentence_eng_Latn +doc_to_text: "Tigrinya: {{sentence_tir_Ethi}} \nEnglish: " +include: ntrex +task: ntrex_tir_Ethi-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_ton_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_ton_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5676a1a99997aca3d0bfc4120003ccb4edef3099 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_ton_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ton_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Tongan: {{sentence_ton_Latn}} \nEnglish: " +include: ntrex +task: ntrex_ton_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_tsn_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_tsn_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..caa0f9e57b1d93a4c074cca1c816ded7a93c3eb6 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_tsn_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: tsn_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Tswana: {{sentence_tsn_Latn}} \nEnglish: " +include: ntrex +task: ntrex_tsn_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_urd_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_urd_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4e07e6787868ec0a56e7b79b2246fcd2211c19d1 --- /dev/null +++ 
b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_urd_Arab-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: urd_Arab +doc_to_target: sentence_eng_Latn +doc_to_text: "Urdu: {{sentence_urd_Arab}} \nEnglish: " +include: ntrex +task: ntrex_urd_Arab-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_ven_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_ven_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7ba8ceaf4921b087cf38dce53a8c9bb49c359389 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_ven_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ven_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Venda: {{sentence_ven_Latn}} \nEnglish: " +include: ntrex +task: ntrex_ven_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_wol_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_wol_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8dcacb69de3f8fd83c5714494665cfb7f8cc7be1 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_wol_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: wol_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Wolof: {{sentence_wol_Latn}} \nEnglish: " +include: ntrex +task: ntrex_wol_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_xho_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_xho_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1b6abc9dcbf53879148418592fd155f95026bba8 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_xho_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: xho_Latn +doc_to_target: 
sentence_eng_Latn +doc_to_text: "Xhosa: {{sentence_xho_Latn}} \nEnglish: " +include: ntrex +task: ntrex_xho_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_yor_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_yor_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e98aecd5b188aabf46c2c00b9a126616fee55f6f --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_yor_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: yor_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Yoruba: {{sentence_yor_Latn}} \nEnglish: " +include: ntrex +task: ntrex_yor_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_zul_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_zul_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a38abee1148ad1b77a5395afa48621070ad3c239 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/african-english/ntrex_zul_Latn-eng_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: zul_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "Zulu: {{sentence_zul_Latn}} \nEnglish: " +include: ntrex +task: ntrex_zul_Latn-eng_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex new file mode 100644 index 0000000000000000000000000000000000000000..2b5aa84f990e10804a9cdc8ca69901bfb55e5d71 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex @@ -0,0 +1,26 @@ +tag: +- ntrex_tasks +- ntrex_eng-afr +- ntrex_eng-afr_prompt_1 +- afrobench_MT_tasks +dataset_path: masakhane/ntrex_african +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +validation_split: test +fewshot_split: test +test_split: test +metric_list: + - metric: bleu + aggregation: 
bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +repeats: 1 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-afr_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-afr_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..40471f80151bacf355f8bf8ff617027f9da68ef7 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-afr_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: afr_Latn +doc_to_target: sentence_afr_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nAfrikaans: " +include: ntrex +task: ntrex_eng_Latn-afr_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-amh_Ethi.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-amh_Ethi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6e4dfba5dc799649532e9e6b28c862b25afb9566 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-amh_Ethi.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: amh_Ethi +doc_to_target: sentence_amh_Ethi +doc_to_text: "English: {{sentence_eng_Latn}} \nAmharic: " +include: ntrex +task: ntrex_eng_Latn-amh_Ethi_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-arb_Arab.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-arb_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1a248a9ac6da1668ce1fab555fb7ad586cf0acaa --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-arb_Arab.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: arb_Arab +doc_to_target: sentence_arb_Arab +doc_to_text: "English: {{sentence_eng_Latn}} \nArabic: " +include: ntrex 
+task: ntrex_eng_Latn-arb_Arab_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-bem_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-bem_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..035c682256b81ca9cc7dda1aebfc9ac130a75762 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-bem_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: bem_Latn +doc_to_target: sentence_bem_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nBemba: " +include: ntrex +task: ntrex_eng_Latn-bem_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-ewe_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-ewe_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f5deae5c56b3bb203b372298207e7fa8d79cfb58 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-ewe_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ewe_Latn +doc_to_target: sentence_ewe_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nEwe: " +include: ntrex +task: ntrex_eng_Latn-ewe_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-fra_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-fra_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cf079cf440f75a35edbea04e8afa0703ab0eea7b --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-fra_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: fra_Latn +doc_to_target: sentence_fra_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nFrench: " +include: ntrex +task: ntrex_eng_Latn-fra_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-hau_Latn.yaml 
b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-hau_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..377acbfb8ef84be01d8657907a33d1f141b66795 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-hau_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: hau_Latn +doc_to_target: sentence_hau_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nHausa: " +include: ntrex +task: ntrex_eng_Latn-hau_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-ibo_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-ibo_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1c3a14dfa2200c29eb83825a6efb202905e6e78f --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-ibo_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ibo_Latn +doc_to_target: sentence_ibo_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nIgbo: " +include: ntrex +task: ntrex_eng_Latn-ibo_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-kin_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-kin_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ec14399e37649d7671f81f5348d74e76235df4e3 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-kin_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: kin_Latn +doc_to_target: sentence_kin_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nKinyarwanda: " +include: ntrex +task: ntrex_eng_Latn-kin_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-mey_Arab.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-mey_Arab.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..fb696cc5ac25f1f43c276c34e26b97b7c82efaee --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-mey_Arab.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: mey_Arab +doc_to_target: sentence_mey_Arab +doc_to_text: "English: {{sentence_eng_Latn}} \nHassaniya Arabic: " +include: ntrex +task: ntrex_eng_Latn-mey_Arab_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-mlg_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-mlg_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..035c98c373ff6738310cb280cd617df60c8b6a2a --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-mlg_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: mlg_Latn +doc_to_target: sentence_mlg_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nMalagasy: " +include: ntrex +task: ntrex_eng_Latn-mlg_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-msa_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-msa_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c4c6b7d7f1f904ce5fe6061eb7c4c8caef86a8af --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-msa_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: msa_Latn +doc_to_target: sentence_msa_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nMalay: " +include: ntrex +task: ntrex_eng_Latn-msa_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-nde_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-nde_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c66b44beee186f47ea9f8b4d62776d60e4be3ba9 --- /dev/null +++ 
b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-nde_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: nde_Latn +doc_to_target: sentence_nde_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nNorth Ndebele: " +include: ntrex +task: ntrex_eng_Latn-nde_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-nso_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-nso_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ab6cf296c3959910f99b2edd6354d49259da7ce4 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-nso_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: nso_Latn +doc_to_target: sentence_nso_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nNorthern Sotho: " +include: ntrex +task: ntrex_eng_Latn-nso_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-nya_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-nya_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..74cbd1ffed9675feaff5ead68f147fc2572b4edd --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-nya_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: nya_Latn +doc_to_target: sentence_nya_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nChichewa: " +include: ntrex +task: ntrex_eng_Latn-nya_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-orm_Ethi.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-orm_Ethi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ad875cab5b7012caecd06b99a8d7047ad50c403c --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-orm_Ethi.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: orm_Ethi 
+doc_to_target: sentence_orm_Ethi +doc_to_text: "English: {{sentence_eng_Latn}} \nOromo: " +include: ntrex +task: ntrex_eng_Latn-orm_Ethi_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-shi_Arab.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-shi_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5441bbdb6ea535f01c71753b9df5ee3290a7cac3 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-shi_Arab.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: shi_Arab +doc_to_target: sentence_shi_Arab +doc_to_text: "English: {{sentence_eng_Latn}} \nTachelhit: " +include: ntrex +task: ntrex_eng_Latn-shi_Arab_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-sna_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-sna_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0bed0f6c195e7945329b7d26b50bb5d2abd62c90 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-sna_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: sna_Latn +doc_to_target: sentence_sna_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nShona (Latin): " +include: ntrex +task: ntrex_eng_Latn-sna_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-som_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-som_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5e4aafdfc79bd2e31747847ec081ae15f3799dc3 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-som_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: som_Latn +doc_to_target: sentence_som_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nSomali: " +include: ntrex +task: ntrex_eng_Latn-som_Latn_prompt_1 diff --git 
a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-ssw_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-ssw_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa18ebf233e0cdbfd5b7d692356f0eacc1cf669a --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-ssw_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ssw_Latn +doc_to_target: sentence_ssw_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nSwati: " +include: ntrex +task: ntrex_eng_Latn-ssw_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-swa_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-swa_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a7079ec01354ca1d56fa593c4b2a5dab668f5c0c --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-swa_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: swa_Latn +doc_to_target: sentence_swa_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nSwahili: " +include: ntrex +task: ntrex_eng_Latn-swa_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-tam_Taml.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-tam_Taml.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b7e42a36beee8d83d057b6daf7b6cfa488b2d90f --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-tam_Taml.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: tam_Taml +doc_to_target: sentence_tam_Taml +doc_to_text: "English: {{sentence_eng_Latn}} \nTamil: " +include: ntrex +task: ntrex_eng_Latn-tam_Taml_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-tel_Telu.yaml 
b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-tel_Telu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..db8eb6b20ef17fb518b1c45a8753e72f205a7e41 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-tel_Telu.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: tel_Telu +doc_to_target: sentence_tel_Telu +doc_to_text: "English: {{sentence_eng_Latn}} \nTelugu: " +include: ntrex +task: ntrex_eng_Latn-tel_Telu_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-tir_Ethi.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-tir_Ethi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45c6ae84c642d58db1ebdbf45feb112c4e872bea --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-tir_Ethi.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: tir_Ethi +doc_to_target: sentence_tir_Ethi +doc_to_text: "English: {{sentence_eng_Latn}} \nTigrinya: " +include: ntrex +task: ntrex_eng_Latn-tir_Ethi_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-ton_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-ton_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0a680a2c67f5226248043a9d8325b94f7fa4ad57 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-ton_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ton_Latn +doc_to_target: sentence_ton_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nTongan: " +include: ntrex +task: ntrex_eng_Latn-ton_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-tsn_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-tsn_Latn.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..d5a7a4ca261a1b8bfcdd1614eaa167c81c46c1d0 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-tsn_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: tsn_Latn +doc_to_target: sentence_tsn_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nTswana: " +include: ntrex +task: ntrex_eng_Latn-tsn_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-urd_Arab.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-urd_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4ee69ded9fac3efbc400bbf39aadb529eee26e3d --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-urd_Arab.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: urd_Arab +doc_to_target: sentence_urd_Arab +doc_to_text: "English: {{sentence_eng_Latn}} \nUrdu: " +include: ntrex +task: ntrex_eng_Latn-urd_Arab_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-ven_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-ven_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4277ce08a5d44f22996d704e0bfbd7461103a0ab --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-ven_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ven_Latn +doc_to_target: sentence_ven_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nVenda: " +include: ntrex +task: ntrex_eng_Latn-ven_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-wol_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-wol_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dea533ee5e959705c664d5b6e2ee10244c81d3f1 --- /dev/null +++ 
b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-wol_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: wol_Latn +doc_to_target: sentence_wol_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nWolof: " +include: ntrex +task: ntrex_eng_Latn-wol_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-xho_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-xho_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..62ab64bfd5e1a6d7a92e1491824047c4853b7e56 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-xho_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: xho_Latn +doc_to_target: sentence_xho_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nXhosa: " +include: ntrex +task: ntrex_eng_Latn-xho_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-yor_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-yor_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9d96624ae3b9faeacd9b13bf8dcbaf95dafd1040 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-yor_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: yor_Latn +doc_to_target: sentence_yor_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nYoruba: " +include: ntrex +task: ntrex_eng_Latn-yor_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-zul_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-zul_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..db60fb59821685f837f5f184647564f3e18f4927 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_1/english-african/ntrex_eng_Latn-zul_Latn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: zul_Latn +doc_to_target: 
sentence_zul_Latn +doc_to_text: "English: {{sentence_eng_Latn}} \nZulu: " +include: ntrex +task: ntrex_eng_Latn-zul_Latn_prompt_1 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex new file mode 100644 index 0000000000000000000000000000000000000000..3dc29226bf4677ee34836dbc0c5c206cbb1744bd --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex @@ -0,0 +1,25 @@ +tag: +- ntrex_afr-eng +- ntrex_afr-eng_prompt_2 +- afrobench_MT_tasks +dataset_path: masakhane/ntrex_african +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +validation_split: test +fewshot_split: test +test_split: test +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +repeats: 1 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_afr_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_afr_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..16cfc7d5d0811aec8fca3bcbc7a436f74391cda5 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_afr_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: afr_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Afrikaans sentences\ + \ to English \nAfrikaans: {{sentence_afr_Latn}}\nEnglish: " +include: ntrex +task: ntrex_afr_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_amh_Ethi-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_amh_Ethi-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..20e88c366d9c477928abda6bebd2a73d26d00e36 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_amh_Ethi-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: amh_Ethi +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Amharic sentences\ + \ to English \nAmharic: {{sentence_amh_Ethi}}\nEnglish: " +include: ntrex +task: ntrex_amh_Ethi-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_arb_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_arb_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a88a478a12a99d5910360dab8b6fa6fac1b78601 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_arb_Arab-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: arb_Arab +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Arabic sentences\ + \ to English \nArabic: {{sentence_arb_Arab}}\nEnglish: " +include: ntrex +task: ntrex_arb_Arab-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_bem_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_bem_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3e114a3464d6cb98baf2374ccaacbc45c3f91240 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_bem_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: bem_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Bemba sentences\ + \ to English \nBemba: {{sentence_bem_Latn}}\nEnglish: " +include: ntrex +task: ntrex_bem_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_ewe_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_ewe_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5e4facd5106291d0fe52d5315d1f6a88a6f32afe --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_ewe_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ewe_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Ewe sentences\ + \ to English \nEwe: {{sentence_ewe_Latn}}\nEnglish: " +include: ntrex +task: ntrex_ewe_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_fra_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_fra_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ad46aedf727a431a166cac1b9ec45be707feb9bb --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_fra_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: fra_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following French sentences\ + \ to English \nFrench: {{sentence_fra_Latn}}\nEnglish: " +include: ntrex +task: ntrex_fra_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_hau_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_hau_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..018a63963e8aeedeb3457a49cbf3d97adf4e8c82 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_hau_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: hau_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Hausa sentences\ + \ to English \nHausa: {{sentence_hau_Latn}}\nEnglish: " +include: ntrex +task: ntrex_hau_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_ibo_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_ibo_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0b93d2d863d60ded18b4e746badafe81e9a3e917 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_ibo_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ibo_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Igbo sentences\ + \ to English \nIgbo: {{sentence_ibo_Latn}}\nEnglish: " +include: ntrex +task: ntrex_ibo_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_kin_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_kin_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45b18a640b749e848a8d7df9c01ac2121afb5c2d --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_kin_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kin_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Kinyarwanda sentences\ + \ to English \nKinyarwanda: {{sentence_kin_Latn}}\nEnglish: " +include: ntrex +task: ntrex_kin_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_mey_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_mey_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d155b62c828b30e1505e194d3a93960ed707c1aa --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_mey_Arab-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: mey_Arab +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Hassaniya Arabic\ + \ sentences to English \nHassaniya Arabic: {{sentence_mey_Arab}}\nEnglish: " +include: ntrex +task: ntrex_mey_Arab-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_mlg_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_mlg_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..10a7507bae076af1c5aec92ec0db65da9b94f876 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_mlg_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: mlg_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Malagasy sentences\ + \ to English \nMalagasy: {{sentence_mlg_Latn}}\nEnglish: " +include: ntrex +task: ntrex_mlg_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_msa_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_msa_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..be65a0ff07f372df2e3027373aebd4e0176e14ee --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_msa_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: msa_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Malay sentences\ + \ to English \nMalay: {{sentence_msa_Latn}}\nEnglish: " +include: ntrex +task: ntrex_msa_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_nde_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_nde_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c4a39fc2c31bc63eb27fdbfb78edaa8c8c59e0ee --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_nde_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: nde_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following North Ndebele\ + \ sentences to English \nNorth Ndebele: {{sentence_nde_Latn}}\nEnglish: " +include: ntrex +task: ntrex_nde_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_nso_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_nso_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..290122fab7df120e79d478e81d3cc39cc60e61fe --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_nso_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: nso_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Northern Sotho\ + \ sentences to English \nNorthern Sotho: {{sentence_nso_Latn}}\nEnglish: " +include: ntrex +task: ntrex_nso_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_nya_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_nya_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..de365e011b86d650b6defb5a6fd7abb4a7a0feef --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_nya_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: nya_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Chichewa sentences\ + \ to English \nChichewa: {{sentence_nya_Latn}}\nEnglish: " +include: ntrex +task: ntrex_nya_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_orm_Ethi-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_orm_Ethi-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ebe353d1fc9de1f7f89e4f783a57afe0a3699e7f --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_orm_Ethi-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: orm_Ethi +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Oromo sentences\ + \ to English \nOromo: {{sentence_orm_Ethi}}\nEnglish: " +include: ntrex +task: ntrex_orm_Ethi-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_shi_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_shi_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b2db11ae54e39b0e8a5c5669489fdcd5a81bce29 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_shi_Arab-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: shi_Arab +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Tachelhit sentences\ + \ to English \nTachelhit: {{sentence_shi_Arab}}\nEnglish: " +include: ntrex +task: ntrex_shi_Arab-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_sna_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_sna_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..25600d6347d1973e1a3c4c8f236093044f2f83fb --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_sna_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sna_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Shona (Latin)\ + \ sentences to English \nShona (Latin): {{sentence_sna_Latn}}\nEnglish: " +include: ntrex +task: ntrex_sna_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_som_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_som_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0ea6a71d9a423fad3ff285bbc53b3d7f440fac57 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_som_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: som_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Somali sentences\ + \ to English \nSomali: {{sentence_som_Latn}}\nEnglish: " +include: ntrex +task: ntrex_som_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_ssw_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_ssw_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b2e690a6a49b0e355df71f413522e6905f7601d7 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_ssw_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ssw_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Swati sentences\ + \ to English \nSwati: {{sentence_ssw_Latn}}\nEnglish: " +include: ntrex +task: ntrex_ssw_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_swa_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_swa_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2e609435f8482f5e2f4daa5253fef21dedcf36a3 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_swa_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: swa_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Swahili sentences\ + \ to English \nSwahili: {{sentence_swa_Latn}}\nEnglish: " +include: ntrex +task: ntrex_swa_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_tam_Taml-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_tam_Taml-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e2c9f278f1353e66e341329e7fc9686169ab309e --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_tam_Taml-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tam_Taml +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Tamil sentences\ + \ to English \nTamil: {{sentence_tam_Taml}}\nEnglish: " +include: ntrex +task: ntrex_tam_Taml-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_tel_Telu-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_tel_Telu-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..15dc359805fb8570302d31ac432f5fd557cca2b9 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_tel_Telu-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tel_Telu +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Telugu sentences\ + \ to English \nTelugu: {{sentence_tel_Telu}}\nEnglish: " +include: ntrex +task: ntrex_tel_Telu-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_tir_Ethi-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_tir_Ethi-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5f0bb2b835f9ad4e577b7e415ab7cfea484c1f44 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_tir_Ethi-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tir_Ethi +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Tigrinya sentences\ + \ to English \nTigrinya: {{sentence_tir_Ethi}}\nEnglish: " +include: ntrex +task: ntrex_tir_Ethi-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_ton_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_ton_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..84f7d281eea533cb634ea958922ca7041a6e24f8 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_ton_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ton_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Tongan sentences\ + \ to English \nTongan: {{sentence_ton_Latn}}\nEnglish: " +include: ntrex +task: ntrex_ton_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_tsn_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_tsn_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a56996418d193d01df084b07b144f240ec45e7b7 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_tsn_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tsn_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Tswana sentences\ + \ to English \nTswana: {{sentence_tsn_Latn}}\nEnglish: " +include: ntrex +task: ntrex_tsn_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_urd_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_urd_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..47a47875969c6bb7d8032570beee3dcc4303b734 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_urd_Arab-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: urd_Arab +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Urdu sentences\ + \ to English \nUrdu: {{sentence_urd_Arab}}\nEnglish: " +include: ntrex +task: ntrex_urd_Arab-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_ven_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_ven_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5f27b185d8d5c2c2fc0e7b4eb273f5fd8c2241ca --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_ven_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ven_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Venda sentences\ + \ to English \nVenda: {{sentence_ven_Latn}}\nEnglish: " +include: ntrex +task: ntrex_ven_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_wol_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_wol_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa2da55ce1e8cc1b76e87dda701ae9e12e2976b3 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_wol_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: wol_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Wolof sentences\ + \ to English \nWolof: {{sentence_wol_Latn}}\nEnglish: " +include: ntrex +task: ntrex_wol_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_xho_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_xho_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b504cd3126d1a422f94c10b7677c7bd92f0d9311 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_xho_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: xho_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Xhosa sentences\ + \ to English \nXhosa: {{sentence_xho_Latn}}\nEnglish: " +include: ntrex +task: ntrex_xho_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_yor_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_yor_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..03c4cbacd791035cd1757d5ca0ed14b546b445e0 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_yor_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: yor_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. 
Translate the following Yoruba sentences\ + \ to English \nYoruba: {{sentence_yor_Latn}}\nEnglish: " +include: ntrex +task: ntrex_yor_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_zul_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_zul_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..760abb6f0253c993c37413c88c8dfced632cdd84 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/african-english/ntrex_zul_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: zul_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "You are a translation expert. Translate the following Zulu sentences\ + \ to English \nZulu: {{sentence_zul_Latn}}\nEnglish: " +include: ntrex +task: ntrex_zul_Latn-eng_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex new file mode 100644 index 0000000000000000000000000000000000000000..8dd411c3b78988b12ea421df33cf6aaa6caee91c --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex @@ -0,0 +1,25 @@ +tag: +- ntrex_eng-afr +- ntrex_eng-afr_prompt_2 +- afrobench_MT_tasks +dataset_path: masakhane/ntrex_african +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +validation_split: test +fewshot_split: test +test_split: test +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +repeats: 1 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-afr_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-afr_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..678e5b21721a6da7f67401d2c26f65c89e3bbf83 --- /dev/null 
+++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-afr_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: afr_Latn +doc_to_target: sentence_afr_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Afrikaans \nEnglish: {{sentence_eng_Latn}} \nAfrikaans: " +include: ntrex +task: ntrex_eng_Latn-afr_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-amh_Ethi.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-amh_Ethi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a5ae3dd1acdea962bb7f99533ddbe7255133a97b --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-amh_Ethi.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: amh_Ethi +doc_to_target: sentence_amh_Ethi +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Amharic \nEnglish: {{sentence_eng_Latn}} \nAmharic: " +include: ntrex +task: ntrex_eng_Latn-amh_Ethi_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-arb_Arab.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-arb_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..303ccf471d5c180220c3985909377ba0227bbe41 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-arb_Arab.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: arb_Arab +doc_to_target: sentence_arb_Arab +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Arabic \nEnglish: {{sentence_eng_Latn}} \nArabic: " +include: ntrex +task: ntrex_eng_Latn-arb_Arab_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-bem_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-bem_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7992529aebb8b6ae9e2e55d2cfb89e142d047791 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-bem_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: bem_Latn +doc_to_target: sentence_bem_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Bemba \nEnglish: {{sentence_eng_Latn}} \nBemba: " +include: ntrex +task: ntrex_eng_Latn-bem_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-ewe_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-ewe_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3de8e8ebf4d0d15d24a0313d1793f26f7719167d --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-ewe_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ewe_Latn +doc_to_target: sentence_ewe_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Ewe \nEnglish: {{sentence_eng_Latn}} \nEwe: " +include: ntrex +task: ntrex_eng_Latn-ewe_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-fra_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-fra_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fc230efe275e4712c7453f77d66290f44702b75d --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-fra_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: fra_Latn +doc_to_target: sentence_fra_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to French \nEnglish: {{sentence_eng_Latn}} \nFrench: " +include: ntrex +task: ntrex_eng_Latn-fra_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-hau_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-hau_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..009d1a51893395026f0ed3d3f93e1a16c50abacc --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-hau_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: hau_Latn +doc_to_target: sentence_hau_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Hausa \nEnglish: {{sentence_eng_Latn}} \nHausa: " +include: ntrex +task: ntrex_eng_Latn-hau_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-ibo_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-ibo_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a2b27ab59197ab7a8b7069e83ac2186cac6d1510 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-ibo_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ibo_Latn +doc_to_target: sentence_ibo_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Igbo \nEnglish: {{sentence_eng_Latn}} \nIgbo: " +include: ntrex +task: ntrex_eng_Latn-ibo_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-kin_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-kin_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f76077d3af55e07910b0c3aff74d400b3500b530 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-kin_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kin_Latn +doc_to_target: sentence_kin_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Kinyarwanda \nEnglish: {{sentence_eng_Latn}} \nKinyarwanda: " +include: ntrex +task: ntrex_eng_Latn-kin_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-mey_Arab.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-mey_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2c5b2abaedbf8808a1ee5d15af9c3be837cbe63e --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-mey_Arab.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: mey_Arab +doc_to_target: sentence_mey_Arab +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Hassaniya Arabic \nEnglish: {{sentence_eng_Latn}} \nHassaniya Arabic: " +include: ntrex +task: ntrex_eng_Latn-mey_Arab_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-mlg_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-mlg_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1d25afa1a04a13d8a3bf6e911ac151e7ac1da51f --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-mlg_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: mlg_Latn +doc_to_target: sentence_mlg_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Malagasy \nEnglish: {{sentence_eng_Latn}} \nMalagasy: " +include: ntrex +task: ntrex_eng_Latn-mlg_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-msa_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-msa_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c7b7972bdaa207e0a34812496a40b8524da0305b --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-msa_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: msa_Latn +doc_to_target: sentence_msa_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Malay \nEnglish: {{sentence_eng_Latn}} \nMalay: " +include: ntrex +task: ntrex_eng_Latn-msa_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-nde_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-nde_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..31252c02f9f29312c18039903aa67f26e95499b1 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-nde_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: nde_Latn +doc_to_target: sentence_nde_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to North Ndebele \nEnglish: {{sentence_eng_Latn}} \nNorth Ndebele: " +include: ntrex +task: ntrex_eng_Latn-nde_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-nso_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-nso_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b8daba4d8fc456e8a54fe14296d5762be002c3fb --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-nso_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: nso_Latn +doc_to_target: sentence_nso_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Northern Sotho \nEnglish: {{sentence_eng_Latn}} \nNorthern Sotho: " +include: ntrex +task: ntrex_eng_Latn-nso_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-nya_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-nya_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fe01ef879c5c52414bc39372103da5c5bff038fe --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-nya_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: nya_Latn +doc_to_target: sentence_nya_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Chichewa \nEnglish: {{sentence_eng_Latn}} \nChichewa: " +include: ntrex +task: ntrex_eng_Latn-nya_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-orm_Ethi.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-orm_Ethi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f78e4db60165e893242dea30d04503e5ae46ffb9 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-orm_Ethi.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: orm_Ethi +doc_to_target: sentence_orm_Ethi +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Oromo \nEnglish: {{sentence_eng_Latn}} \nOromo: " +include: ntrex +task: ntrex_eng_Latn-orm_Ethi_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-shi_Arab.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-shi_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..57ea6c0480bc9bb9458cc4d6fa92215d67a518b0 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-shi_Arab.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: shi_Arab +doc_to_target: sentence_shi_Arab +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Tachelhit \nEnglish: {{sentence_eng_Latn}} \nTachelhit: " +include: ntrex +task: ntrex_eng_Latn-shi_Arab_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-sna_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-sna_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..399668d33c648f472ea6d980f8ebf2e659726b65 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-sna_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sna_Latn +doc_to_target: sentence_sna_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Shona (Latin) \nEnglish: {{sentence_eng_Latn}} \nShona (Latin): " +include: ntrex +task: ntrex_eng_Latn-sna_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-som_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-som_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8a29749aaeeda6c166b3a1cfecf843ef2f2ddfb3 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-som_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: som_Latn +doc_to_target: sentence_som_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Somali \nEnglish: {{sentence_eng_Latn}} \nSomali: " +include: ntrex +task: ntrex_eng_Latn-som_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-ssw_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-ssw_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a67ffdeee1465a9cd19b126e3a53a0e6ac054d05 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-ssw_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ssw_Latn +doc_to_target: sentence_ssw_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Swati \nEnglish: {{sentence_eng_Latn}} \nSwati: " +include: ntrex +task: ntrex_eng_Latn-ssw_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-swa_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-swa_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0be54151da2d262039dd2c77753f0def8810e528 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-swa_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: swa_Latn +doc_to_target: sentence_swa_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Swahili \nEnglish: {{sentence_eng_Latn}} \nSwahili: " +include: ntrex +task: ntrex_eng_Latn-swa_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-tam_Taml.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-tam_Taml.yaml new file mode 100644 index 0000000000000000000000000000000000000000..387ac60dafe76aaf13adde1adb9830613172054a --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-tam_Taml.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tam_Taml +doc_to_target: sentence_tam_Taml +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Tamil \nEnglish: {{sentence_eng_Latn}} \nTamil: " +include: ntrex +task: ntrex_eng_Latn-tam_Taml_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-tel_Telu.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-tel_Telu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7b1de396146f0c21caab39fea111fadfd53fce53 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-tel_Telu.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tel_Telu +doc_to_target: sentence_tel_Telu +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Telugu \nEnglish: {{sentence_eng_Latn}} \nTelugu: " +include: ntrex +task: ntrex_eng_Latn-tel_Telu_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-tir_Ethi.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-tir_Ethi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..da402211718e0126e2281d32f1991c946b2a23fe --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-tir_Ethi.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tir_Ethi +doc_to_target: sentence_tir_Ethi +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Tigrinya \nEnglish: {{sentence_eng_Latn}} \nTigrinya: " +include: ntrex +task: ntrex_eng_Latn-tir_Ethi_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-ton_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-ton_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f8c466929bee40fb7ba2f5b000310925908251fd --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-ton_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ton_Latn +doc_to_target: sentence_ton_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Tongan \nEnglish: {{sentence_eng_Latn}} \nTongan: " +include: ntrex +task: ntrex_eng_Latn-ton_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-tsn_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-tsn_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ca918e1de6790bff10cabd103e546651643686d3 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-tsn_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tsn_Latn +doc_to_target: sentence_tsn_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Tswana \nEnglish: {{sentence_eng_Latn}} \nTswana: " +include: ntrex +task: ntrex_eng_Latn-tsn_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-urd_Arab.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-urd_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8539df766542c9a7263ad62992b5fe619de2f23e --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-urd_Arab.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: urd_Arab +doc_to_target: sentence_urd_Arab +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Urdu \nEnglish: {{sentence_eng_Latn}} \nUrdu: " +include: ntrex +task: ntrex_eng_Latn-urd_Arab_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-ven_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-ven_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e923b12ce695253b39965bda6352121271514123 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-ven_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ven_Latn +doc_to_target: sentence_ven_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Venda \nEnglish: {{sentence_eng_Latn}} \nVenda: " +include: ntrex +task: ntrex_eng_Latn-ven_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-wol_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-wol_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..707b76a59f0bd3a661dfac59eb9413c46d323c8b --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-wol_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: wol_Latn +doc_to_target: sentence_wol_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Wolof \nEnglish: {{sentence_eng_Latn}} \nWolof: " +include: ntrex +task: ntrex_eng_Latn-wol_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-xho_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-xho_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e7f51491f88c75f9d2da270209fbe32b56bc529b --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-xho_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: xho_Latn +doc_to_target: sentence_xho_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Xhosa \nEnglish: {{sentence_eng_Latn}} \nXhosa: " +include: ntrex +task: ntrex_eng_Latn-xho_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-yor_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-yor_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6f3e4be543796276d04c65001199521701f02ed9 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-yor_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: yor_Latn +doc_to_target: sentence_yor_Latn +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Yoruba \nEnglish: {{sentence_eng_Latn}} \nYoruba: " +include: ntrex +task: ntrex_eng_Latn-yor_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-zul_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-zul_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..946d0020ddb845653bc574e7cb8de54bf3a35a00 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_2/english-african/ntrex_eng_Latn-zul_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: zul_Latn +doc_to_target: sentence_zul_Latn +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Zulu \nEnglish: {{sentence_eng_Latn}} \nZulu: " +include: ntrex +task: ntrex_eng_Latn-zul_Latn_prompt_2 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex new file mode 100644 index 0000000000000000000000000000000000000000..3bab54d824d83e7d201107a00411c22b5ec44a1b --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex @@ -0,0 +1,25 @@ +tag: +- ntrex_afr-eng +- ntrex_afr-eng_prompt_3 +- afrobench_MT_tasks +dataset_path: masakhane/ntrex_african +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +validation_split: test +fewshot_split: test +test_split: test +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +repeats: 1 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_afr_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_afr_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..09cbbfc56e84748c37d86366a68162b82869d918 --- 
/dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_afr_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: afr_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Afrikaans and English linguist, translate the following Afrikaans\ + \ sentences to English \nAfrikaans: {{sentence_afr_Latn}}\nEnglish: " +include: ntrex +task: ntrex_afr_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_amh_Ethi-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_amh_Ethi-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..33530440f5e232fa8b86267e8c42fab503d0c551 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_amh_Ethi-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: amh_Ethi +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Amharic and English linguist, translate the following Amharic sentences\ + \ to English \nAmharic: {{sentence_amh_Ethi}}\nEnglish: " +include: ntrex +task: ntrex_amh_Ethi-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_arb_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_arb_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..858c0605f39ec13ecbed5733fad5d5eef3d275ad --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_arb_Arab-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: arb_Arab +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Arabic and English linguist, translate the following Arabic sentences\ + \ to English \nArabic: {{sentence_arb_Arab}}\nEnglish: " +include: ntrex +task: ntrex_arb_Arab-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_bem_Latn-eng_Latn.yaml 
b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_bem_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3be00855b6bfb5318442ebf3603c6c611f1c319c --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_bem_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: bem_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Bemba and English linguist, translate the following Bemba sentences\ + \ to English \nBemba: {{sentence_bem_Latn}}\nEnglish: " +include: ntrex +task: ntrex_bem_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_ewe_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_ewe_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..493176a7baed9c8b7ad64e6e028ce3b00d8a1067 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_ewe_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ewe_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Ewe and English linguist, translate the following Ewe sentences\ + \ to English \nEwe: {{sentence_ewe_Latn}}\nEnglish: " +include: ntrex +task: ntrex_ewe_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_fra_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_fra_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b009a37bfd786707c077c55b391bacba7e6dad15 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_fra_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: fra_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a French and English linguist, translate the following French sentences\ + \ to English \nFrench: {{sentence_fra_Latn}}\nEnglish: " +include: ntrex +task: ntrex_fra_Latn-eng_Latn_prompt_3 
diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_hau_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_hau_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a3c6f72111e504082a34674e05670288b6877d3c --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_hau_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: hau_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Hausa and English linguist, translate the following Hausa sentences\ + \ to English \nHausa: {{sentence_hau_Latn}}\nEnglish: " +include: ntrex +task: ntrex_hau_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_ibo_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_ibo_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d4b7e768d4c4f4719842aa11af37f6afd89d4f9d --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_ibo_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ibo_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Igbo and English linguist, translate the following Igbo sentences\ + \ to English \nIgbo: {{sentence_ibo_Latn}}\nEnglish: " +include: ntrex +task: ntrex_ibo_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_kin_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_kin_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bba2d32395d22f86492720856e1801d586cab8ab --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_kin_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kin_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Kinyarwanda and English linguist, translate the following Kinyarwanda\ + \ sentences to 
English \nKinyarwanda: {{sentence_kin_Latn}}\nEnglish: " +include: ntrex +task: ntrex_kin_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_mey_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_mey_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9a567548787c0b007306ef41762b9934eb1ad36e --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_mey_Arab-eng_Latn.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: mey_Arab +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Hassaniya Arabic and English linguist, translate the following\ + \ Hassaniya Arabic sentences to English \nHassaniya Arabic: {{sentence_mey_Arab}}\n\ + English: " +include: ntrex +task: ntrex_mey_Arab-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_mlg_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_mlg_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..23d55c5a0b0f86bb04087ca00bb23c1970ad1fbd --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_mlg_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: mlg_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Malagasy and English linguist, translate the following Malagasy\ + \ sentences to English \nMalagasy: {{sentence_mlg_Latn}}\nEnglish: " +include: ntrex +task: ntrex_mlg_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_msa_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_msa_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa1a9618f29da1bbcc7171fda71629d593cada91 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_msa_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py 
+dataset_name: msa_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Malay and English linguist, translate the following Malay sentences\ + \ to English \nMalay: {{sentence_msa_Latn}}\nEnglish: " +include: ntrex +task: ntrex_msa_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_nde_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_nde_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..855defd07be478e59de82c3eccb19e42dd07f042 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_nde_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: nde_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a North Ndebele and English linguist, translate the following North\ + \ Ndebele sentences to English \nNorth Ndebele: {{sentence_nde_Latn}}\nEnglish: " +include: ntrex +task: ntrex_nde_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_nso_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_nso_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..29a7452c39392d1aa94bf3db22ed0ee9b62dd120 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_nso_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: nso_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Northern Sotho and English linguist, translate the following Northern\ + \ Sotho sentences to English \nNorthern Sotho: {{sentence_nso_Latn}}\nEnglish: " +include: ntrex +task: ntrex_nso_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_nya_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_nya_Latn-eng_Latn.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..505586519ea0ea567364ae76d5db58fdec05da08 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_nya_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: nya_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Chichewa and English linguist, translate the following Chichewa\ + \ sentences to English \nChichewa: {{sentence_nya_Latn}}\nEnglish: " +include: ntrex +task: ntrex_nya_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_orm_Ethi-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_orm_Ethi-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9a38e9312cdb30b6bc62b2d3f23c1e5583f043b6 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_orm_Ethi-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: orm_Ethi +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Oromo and English linguist, translate the following Oromo sentences\ + \ to English \nOromo: {{sentence_orm_Ethi}}\nEnglish: " +include: ntrex +task: ntrex_orm_Ethi-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_shi_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_shi_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..19f363ef2439a4986282116f9a34026205ebd431 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_shi_Arab-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: shi_Arab +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Tachelhit and English linguist, translate the following Tachelhit\ + \ sentences to English \nTachelhit: {{sentence_shi_Arab}}\nEnglish: " +include: ntrex +task: ntrex_shi_Arab-eng_Latn_prompt_3 diff --git 
a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_sna_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_sna_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1c7a63157eca8bdc6f8e2488fc1ef10b9941dbb9 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_sna_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sna_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Shona (Latin) and English linguist, translate the following Shona\ + \ (Latin) sentences to English \nShona (Latin): {{sentence_sna_Latn}}\nEnglish: " +include: ntrex +task: ntrex_sna_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_som_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_som_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..685f38233c655048cb55819812247eefaea19527 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_som_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: som_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Somali and English linguist, translate the following Somali sentences\ + \ to English \nSomali: {{sentence_som_Latn}}\nEnglish: " +include: ntrex +task: ntrex_som_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_ssw_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_ssw_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dd95665f6f17301764a1f1ad0d525352fd8f69bd --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_ssw_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ssw_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Swati and English linguist, translate the following Swati sentences\ 
+ \ to English \nSwati: {{sentence_ssw_Latn}}\nEnglish: " +include: ntrex +task: ntrex_ssw_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_swa_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_swa_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d0731d37346153f0d9dec96c8211f7a8250ec3f2 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_swa_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: swa_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Swahili and English linguist, translate the following Swahili sentences\ + \ to English \nSwahili: {{sentence_swa_Latn}}\nEnglish: " +include: ntrex +task: ntrex_swa_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_tam_Taml-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_tam_Taml-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..834320d846a40fcf6bd53c9445f051c38c38a439 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_tam_Taml-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tam_Taml +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Tamil and English linguist, translate the following Tamil sentences\ + \ to English \nTamil: {{sentence_tam_Taml}}\nEnglish: " +include: ntrex +task: ntrex_tam_Taml-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_tel_Telu-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_tel_Telu-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7fde743dcfd343546dcaa042bb3fef8a49a194d5 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_tel_Telu-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tel_Telu +doc_to_target: 
sentence_eng_Latn +doc_to_text: "As a Telugu and English linguist, translate the following Telugu sentences\ + \ to English \nTelugu: {{sentence_tel_Telu}}\nEnglish: " +include: ntrex +task: ntrex_tel_Telu-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_tir_Ethi-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_tir_Ethi-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..60189ee73207fc08911821188f61e23eb12dc62e --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_tir_Ethi-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tir_Ethi +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Tigrinya and English linguist, translate the following Tigrinya\ + \ sentences to English \nTigrinya: {{sentence_tir_Ethi}}\nEnglish: " +include: ntrex +task: ntrex_tir_Ethi-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_ton_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_ton_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ec2b5ba992a535f5f5f4fd6b269653b213f1b39a --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_ton_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ton_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Tongan and English linguist, translate the following Tongan sentences\ + \ to English \nTongan: {{sentence_ton_Latn}}\nEnglish: " +include: ntrex +task: ntrex_ton_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_tsn_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_tsn_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa63ca4b77edb7b0907e6660ce31df7ce0ea7278 --- /dev/null +++ 
b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_tsn_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tsn_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Tswana and English linguist, translate the following Tswana sentences\ + \ to English \nTswana: {{sentence_tsn_Latn}}\nEnglish: " +include: ntrex +task: ntrex_tsn_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_urd_Arab-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_urd_Arab-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2b520795f2fd8e986b8292b985e36769c76f3553 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_urd_Arab-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: urd_Arab +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Urdu and English linguist, translate the following Urdu sentences\ + \ to English \nUrdu: {{sentence_urd_Arab}}\nEnglish: " +include: ntrex +task: ntrex_urd_Arab-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_ven_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_ven_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..82372de2dd0624f9b068f27ab24f48433267ea28 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_ven_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ven_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Venda and English linguist, translate the following Venda sentences\ + \ to English \nVenda: {{sentence_ven_Latn}}\nEnglish: " +include: ntrex +task: ntrex_ven_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_wol_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_wol_Latn-eng_Latn.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..ae0124f20efadbb363c017ef708b5dfb14311b07 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_wol_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: wol_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Wolof and English linguist, translate the following Wolof sentences\ + \ to English \nWolof: {{sentence_wol_Latn}}\nEnglish: " +include: ntrex +task: ntrex_wol_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_xho_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_xho_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7f0528af4efc5cb15035158a7c5789878eaa653b --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_xho_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: xho_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Xhosa and English linguist, translate the following Xhosa sentences\ + \ to English \nXhosa: {{sentence_xho_Latn}}\nEnglish: " +include: ntrex +task: ntrex_xho_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_yor_Latn-eng_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_yor_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..99d7cf494376be71148044b251c23c7b6f15191d --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_yor_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: yor_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Yoruba and English linguist, translate the following Yoruba sentences\ + \ to English \nYoruba: {{sentence_yor_Latn}}\nEnglish: " +include: ntrex +task: ntrex_yor_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_zul_Latn-eng_Latn.yaml 
b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_zul_Latn-eng_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..30f3b307eef0a64f137cd993ff8571b103b2e91e --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/african-english/ntrex_zul_Latn-eng_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: zul_Latn +doc_to_target: sentence_eng_Latn +doc_to_text: "As a Zulu and English linguist, translate the following Zulu sentences\ + \ to English \nZulu: {{sentence_zul_Latn}}\nEnglish: " +include: ntrex +task: ntrex_zul_Latn-eng_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex new file mode 100644 index 0000000000000000000000000000000000000000..d001e1f6e6acc14616603aa46a9f412d7abc026b --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex @@ -0,0 +1,25 @@ +tag: +- ntrex_eng-afr +- ntrex_eng-afr_prompt_3 +- afrobench_MT_tasks +dataset_path: masakhane/ntrex_african +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +validation_split: test +fewshot_split: test +test_split: test +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +repeats: 1 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-afr_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-afr_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4aaa928ba0d31ca83a7d7eb59462a14715a2abf7 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-afr_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: afr_Latn +doc_to_target: sentence_afr_Latn +doc_to_text: "As a Afrikaans and English linguist, translate 
the following English\ + \ sentences to Afrikaans \nEnglish: {{sentence_eng_Latn}} \nAfrikaans: " +include: ntrex +task: ntrex_eng_Latn-afr_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-amh_Ethi.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-amh_Ethi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..008f73024a1c7136ba9c7db28badce24097da5d2 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-amh_Ethi.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: amh_Ethi +doc_to_target: sentence_amh_Ethi +doc_to_text: "As a Amharic and English linguist, translate the following English sentences\ + \ to Amharic \nEnglish: {{sentence_eng_Latn}} \nAmharic: " +include: ntrex +task: ntrex_eng_Latn-amh_Ethi_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-arb_Arab.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-arb_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d0c9e8132374542c605789269c27aabf181dad28 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-arb_Arab.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: arb_Arab +doc_to_target: sentence_arb_Arab +doc_to_text: "As a Arabic and English linguist, translate the following English sentences\ + \ to Arabic \nEnglish: {{sentence_eng_Latn}} \nArabic: " +include: ntrex +task: ntrex_eng_Latn-arb_Arab_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-bem_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-bem_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e4ab2af30cba88d413b0c99a868b52614921aed8 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-bem_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by 
utils.py +dataset_name: bem_Latn +doc_to_target: sentence_bem_Latn +doc_to_text: "As a Bemba and English linguist, translate the following English sentences\ + \ to Bemba \nEnglish: {{sentence_eng_Latn}} \nBemba: " +include: ntrex +task: ntrex_eng_Latn-bem_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-ewe_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-ewe_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e1c99ad06add81beb54d8b0e3b0d97a987bd2d70 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-ewe_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ewe_Latn +doc_to_target: sentence_ewe_Latn +doc_to_text: "As a Ewe and English linguist, translate the following English sentences\ + \ to Ewe \nEnglish: {{sentence_eng_Latn}} \nEwe: " +include: ntrex +task: ntrex_eng_Latn-ewe_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-fra_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-fra_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3668db57aa9a1b9431dec109be78bf98e0080962 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-fra_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: fra_Latn +doc_to_target: sentence_fra_Latn +doc_to_text: "As a French and English linguist, translate the following English sentences\ + \ to French \nEnglish: {{sentence_eng_Latn}} \nFrench: " +include: ntrex +task: ntrex_eng_Latn-fra_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-hau_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-hau_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6bca042cb417c3511cf4e8fb442c61239f010a12 --- /dev/null +++ 
b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-hau_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: hau_Latn +doc_to_target: sentence_hau_Latn +doc_to_text: "As a Hausa and English linguist, translate the following English sentences\ + \ to Hausa \nEnglish: {{sentence_eng_Latn}} \nHausa: " +include: ntrex +task: ntrex_eng_Latn-hau_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-ibo_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-ibo_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c23fcce8fee0b0767977b865af9f24eb27396384 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-ibo_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ibo_Latn +doc_to_target: sentence_ibo_Latn +doc_to_text: "As a Igbo and English linguist, translate the following English sentences\ + \ to Igbo \nEnglish: {{sentence_eng_Latn}} \nIgbo: " +include: ntrex +task: ntrex_eng_Latn-ibo_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-kin_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-kin_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b0041bfb4a44480f142af0c1b9ea37ccd9a47663 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-kin_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kin_Latn +doc_to_target: sentence_kin_Latn +doc_to_text: "As a Kinyarwanda and English linguist, translate the following English\ + \ sentences to Kinyarwanda \nEnglish: {{sentence_eng_Latn}} \nKinyarwanda: " +include: ntrex +task: ntrex_eng_Latn-kin_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-mey_Arab.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-mey_Arab.yaml new file 
mode 100644 index 0000000000000000000000000000000000000000..435df83d9fe3e56a2a75cab98df058c54fd5a8a0 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-mey_Arab.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: mey_Arab +doc_to_target: sentence_mey_Arab +doc_to_text: "As a Hassaniya Arabic and English linguist, translate the following\ + \ English sentences to Hassaniya Arabic \nEnglish: {{sentence_eng_Latn}} \nHassaniya\ + \ Arabic: " +include: ntrex +task: ntrex_eng_Latn-mey_Arab_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-mlg_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-mlg_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..74f92d92668d1b5a9539c503fd3aa5c687988ed8 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-mlg_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: mlg_Latn +doc_to_target: sentence_mlg_Latn +doc_to_text: "As a Malagasy and English linguist, translate the following English\ + \ sentences to Malagasy \nEnglish: {{sentence_eng_Latn}} \nMalagasy: " +include: ntrex +task: ntrex_eng_Latn-mlg_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-msa_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-msa_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bc9a3245f365cdb7c03e5d67e45a9bb236b6477f --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-msa_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: msa_Latn +doc_to_target: sentence_msa_Latn +doc_to_text: "As a Malay and English linguist, translate the following English sentences\ + \ to Malay \nEnglish: {{sentence_eng_Latn}} \nMalay: " +include: ntrex +task: ntrex_eng_Latn-msa_Latn_prompt_3 diff --git 
a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-nde_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-nde_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f7cf092447f290829c9ac3bbbcbb49d915543f26 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-nde_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: nde_Latn +doc_to_target: sentence_nde_Latn +doc_to_text: "As a North Ndebele and English linguist, translate the following English\ + \ sentences to North Ndebele \nEnglish: {{sentence_eng_Latn}} \nNorth Ndebele: " +include: ntrex +task: ntrex_eng_Latn-nde_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-nso_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-nso_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d52c1ef1f9c4a1ba82ae0f0722669fbf126569f2 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-nso_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: nso_Latn +doc_to_target: sentence_nso_Latn +doc_to_text: "As a Northern Sotho and English linguist, translate the following English\ + \ sentences to Northern Sotho \nEnglish: {{sentence_eng_Latn}} \nNorthern Sotho: " +include: ntrex +task: ntrex_eng_Latn-nso_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-nya_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-nya_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5a3d395516d48af64ed67d177cd0fa8b28fd9a46 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-nya_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: nya_Latn +doc_to_target: sentence_nya_Latn +doc_to_text: "As a Chichewa and English linguist, translate 
the following English\ + \ sentences to Chichewa \nEnglish: {{sentence_eng_Latn}} \nChichewa: " +include: ntrex +task: ntrex_eng_Latn-nya_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-orm_Ethi.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-orm_Ethi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d3de07b02307696d09c97eec6120b69580dffade --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-orm_Ethi.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: orm_Ethi +doc_to_target: sentence_orm_Ethi +doc_to_text: "As a Oromo and English linguist, translate the following English sentences\ + \ to Oromo \nEnglish: {{sentence_eng_Latn}} \nOromo: " +include: ntrex +task: ntrex_eng_Latn-orm_Ethi_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-shi_Arab.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-shi_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e193c7a3b5a73495c70dcd4176288a79cd6eb2c6 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-shi_Arab.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: shi_Arab +doc_to_target: sentence_shi_Arab +doc_to_text: "As a Tachelhit and English linguist, translate the following English\ + \ sentences to Tachelhit \nEnglish: {{sentence_eng_Latn}} \nTachelhit: " +include: ntrex +task: ntrex_eng_Latn-shi_Arab_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-sna_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-sna_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ce8c50f5cacf69e70aca8f451ab8bc1fa8270158 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-sna_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by 
utils.py +dataset_name: sna_Latn +doc_to_target: sentence_sna_Latn +doc_to_text: "As a Shona (Latin) and English linguist, translate the following English\ + \ sentences to Shona (Latin) \nEnglish: {{sentence_eng_Latn}} \nShona (Latin): " +include: ntrex +task: ntrex_eng_Latn-sna_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-som_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-som_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4b7f46323401a4c04b1026507b1163111fa71455 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-som_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: som_Latn +doc_to_target: sentence_som_Latn +doc_to_text: "As a Somali and English linguist, translate the following English sentences\ + \ to Somali \nEnglish: {{sentence_eng_Latn}} \nSomali: " +include: ntrex +task: ntrex_eng_Latn-som_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-ssw_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-ssw_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6f02e88ca3f7d5abb314ea174fe21c35b48af402 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-ssw_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ssw_Latn +doc_to_target: sentence_ssw_Latn +doc_to_text: "As a Swati and English linguist, translate the following English sentences\ + \ to Swati \nEnglish: {{sentence_eng_Latn}} \nSwati: " +include: ntrex +task: ntrex_eng_Latn-ssw_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-swa_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-swa_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..47090821da435d1b9d4caada3e91221cd1eed3b3 --- 
/dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-swa_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: swa_Latn +doc_to_target: sentence_swa_Latn +doc_to_text: "As a Swahili and English linguist, translate the following English sentences\ + \ to Swahili \nEnglish: {{sentence_eng_Latn}} \nSwahili: " +include: ntrex +task: ntrex_eng_Latn-swa_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-tam_Taml.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-tam_Taml.yaml new file mode 100644 index 0000000000000000000000000000000000000000..78d61866bd42b467246479946dfa342a6e7835ff --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-tam_Taml.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tam_Taml +doc_to_target: sentence_tam_Taml +doc_to_text: "As a Tamil and English linguist, translate the following English sentences\ + \ to Tamil \nEnglish: {{sentence_eng_Latn}} \nTamil: " +include: ntrex +task: ntrex_eng_Latn-tam_Taml_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-tel_Telu.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-tel_Telu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..829635313dd6f5cbc1d08c31a52732aef1513e19 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-tel_Telu.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tel_Telu +doc_to_target: sentence_tel_Telu +doc_to_text: "As a Telugu and English linguist, translate the following English sentences\ + \ to Telugu \nEnglish: {{sentence_eng_Latn}} \nTelugu: " +include: ntrex +task: ntrex_eng_Latn-tel_Telu_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-tir_Ethi.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-tir_Ethi.yaml new 
file mode 100644 index 0000000000000000000000000000000000000000..1f27f4389cb2b2be56b02f2427a8d8df222aed17 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-tir_Ethi.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tir_Ethi +doc_to_target: sentence_tir_Ethi +doc_to_text: "As a Tigrinya and English linguist, translate the following English\ + \ sentences to Tigrinya \nEnglish: {{sentence_eng_Latn}} \nTigrinya: " +include: ntrex +task: ntrex_eng_Latn-tir_Ethi_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-ton_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-ton_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3ffeb74fbb04205e0bb1b27d0ec855252688f6e5 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-ton_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ton_Latn +doc_to_target: sentence_ton_Latn +doc_to_text: "As a Tongan and English linguist, translate the following English sentences\ + \ to Tongan \nEnglish: {{sentence_eng_Latn}} \nTongan: " +include: ntrex +task: ntrex_eng_Latn-ton_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-tsn_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-tsn_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ed11f2cba88a703b44ce2a077f761b4cd98135c0 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-tsn_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tsn_Latn +doc_to_target: sentence_tsn_Latn +doc_to_text: "As a Tswana and English linguist, translate the following English sentences\ + \ to Tswana \nEnglish: {{sentence_eng_Latn}} \nTswana: " +include: ntrex +task: ntrex_eng_Latn-tsn_Latn_prompt_3 diff --git 
a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-urd_Arab.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-urd_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a05e951bef2f38005b9d7fb3133bdf811f69c565 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-urd_Arab.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: urd_Arab +doc_to_target: sentence_urd_Arab +doc_to_text: "As a Urdu and English linguist, translate the following English sentences\ + \ to Urdu \nEnglish: {{sentence_eng_Latn}} \nUrdu: " +include: ntrex +task: ntrex_eng_Latn-urd_Arab_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-ven_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-ven_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4345201694bc0c8a9f9bda487a9ecfb36982c8bf --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-ven_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ven_Latn +doc_to_target: sentence_ven_Latn +doc_to_text: "As a Venda and English linguist, translate the following English sentences\ + \ to Venda \nEnglish: {{sentence_eng_Latn}} \nVenda: " +include: ntrex +task: ntrex_eng_Latn-ven_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-wol_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-wol_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..48abbb33f870ec3305f8337e62b131bbd38683fb --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-wol_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: wol_Latn +doc_to_target: sentence_wol_Latn +doc_to_text: "As a Wolof and English linguist, translate the following English sentences\ + \ to Wolof \nEnglish: 
{{sentence_eng_Latn}} \nWolof: " +include: ntrex +task: ntrex_eng_Latn-wol_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-xho_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-xho_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b1071a5fb2faad74df4e2f357f412923162b0044 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-xho_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: xho_Latn +doc_to_target: sentence_xho_Latn +doc_to_text: "As a Xhosa and English linguist, translate the following English sentences\ + \ to Xhosa \nEnglish: {{sentence_eng_Latn}} \nXhosa: " +include: ntrex +task: ntrex_eng_Latn-xho_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-yor_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-yor_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..43c1be35ee76adf853e6429e4bb06fea867ce5d6 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-yor_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: yor_Latn +doc_to_target: sentence_yor_Latn +doc_to_text: "As a Yoruba and English linguist, translate the following English sentences\ + \ to Yoruba \nEnglish: {{sentence_eng_Latn}} \nYoruba: " +include: ntrex +task: ntrex_eng_Latn-yor_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-zul_Latn.yaml b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-zul_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..10e890a9b3cbffdbb2205d091d91fa42eae880b1 --- /dev/null +++ b/lm_eval/tasks/afrobench/ntrex/prompt_3/english-african/ntrex_eng_Latn-zul_Latn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: zul_Latn +doc_to_target: sentence_zul_Latn 
+doc_to_text: "As a Zulu and English linguist, translate the following English sentences\ + \ to Zulu \nEnglish: {{sentence_eng_Latn}} \nZulu: " +include: ntrex +task: ntrex_eng_Latn-zul_Latn_prompt_3 diff --git a/lm_eval/tasks/afrobench/openai_mmlu/README.md b/lm_eval/tasks/afrobench/openai_mmlu/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fe980e87464b07c91d2c766254c760d772d65c36 --- /dev/null +++ b/lm_eval/tasks/afrobench/openai_mmlu/README.md @@ -0,0 +1,25 @@ +# + +## Paper +Title: `Multilingual Massive Multitask Language Understanding (MMMLU)` + +Paper Link: https://arxiv.org/abs/2009.03300 + +## Abstract +>We propose a new test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more. To attain high accuracy on this test, models must possess extensive world knowledge and problem solving ability. We find that while most recent models have near random-chance accuracy, the very largest GPT-3 model improves over random chance by almost 20 percentage points on average. However, on every one of the 57 tasks, the best models still need substantial improvements before they can reach expert-level accuracy. Models also have lopsided performance and frequently do not know when they are wrong. Worse, they still have near-random accuracy on some socially important subjects such as morality and law. By comprehensively evaluating the breadth and depth of a model's academic and professional understanding, our test can be used to analyze models across many tasks and to identify important shortcomings. 
+ +HomePage: https://huggingface.co/datasets/openai/MMMLU + +### Citation + +``` +@misc{hendrycks2021measuringmassivemultitasklanguage, + title={Measuring Massive Multitask Language Understanding}, + author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt}, + year={2021}, + eprint={2009.03300}, + archivePrefix={arXiv}, + primaryClass={cs.CY}, + url={https://arxiv.org/abs/2009.03300}, +} +``` diff --git a/lm_eval/tasks/afrobench/openai_mmlu/openai_mmlu.yaml b/lm_eval/tasks/afrobench/openai_mmlu/openai_mmlu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..541eb43cfdd783b15cad4123437c2dffcf1cc794 --- /dev/null +++ b/lm_eval/tasks/afrobench/openai_mmlu/openai_mmlu.yaml @@ -0,0 +1,13 @@ +group: openai_mmlu +task: + - openai_mmlu_prompt_1 + - openai_mmlu_prompt_2 + - openai_mmlu_prompt_3 + - openai_mmlu_prompt_4 + - openai_mmlu_prompt_5 +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 1 diff --git a/lm_eval/tasks/afrobench/openai_mmlu/prompt_1/openai_mmlu b/lm_eval/tasks/afrobench/openai_mmlu/prompt_1/openai_mmlu new file mode 100644 index 0000000000000000000000000000000000000000..ce4f02eeda277404713974f4699c716b454514f5 --- /dev/null +++ b/lm_eval/tasks/afrobench/openai_mmlu/prompt_1/openai_mmlu @@ -0,0 +1,22 @@ +tag: + - openai_mmlu_tasks + - openai_mmlu_prompt_1 + - afrobench_mmlu_tasks +dataset_path: openai/MMMLU +output_type: multiple_choice +test_split: test +fewshot_config: + sampler: first_n +doc_to_target: "{{['A', 'B', 'C', 'D'].index(Answer.strip())}}" +should_decontaminate: true +doc_to_decontamination_query: "{{Question}}" +doc_to_choice: ["A", "B", "C", "D"] +metric_list: + - metric: acc + aggregation: mean + weight_by_size: true + - metric: acc_norm + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/openai_mmlu/prompt_1/openai_mmlu_ara.yaml 
b/lm_eval/tasks/afrobench/openai_mmlu/prompt_1/openai_mmlu_ara.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5c9b86fc1d1c5d8185692c48bc85d991714dbff5 --- /dev/null +++ b/lm_eval/tasks/afrobench/openai_mmlu/prompt_1/openai_mmlu_ara.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: AR_XY +doc_to_text: 'Q: {{Question.strip()}} + + A: {{A}} + + B: {{B}} + + C: {{C}} + + D: {{D}} + + Please choose the correct answer from the options above:' +include: openai_mmlu +task: openai_mmlu_ara_prompt_1 diff --git a/lm_eval/tasks/afrobench/openai_mmlu/prompt_1/openai_mmlu_swa.yaml b/lm_eval/tasks/afrobench/openai_mmlu/prompt_1/openai_mmlu_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1a3661d45235258f0c0cb1a6bb21119de326ef7f --- /dev/null +++ b/lm_eval/tasks/afrobench/openai_mmlu/prompt_1/openai_mmlu_swa.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: SW_KE +doc_to_text: 'Q: {{Question.strip()}} + + A: {{A}} + + B: {{B}} + + C: {{C}} + + D: {{D}} + + Please choose the correct answer from the options above:' +include: openai_mmlu +task: openai_mmlu_swa_prompt_1 diff --git a/lm_eval/tasks/afrobench/openai_mmlu/prompt_1/openai_mmlu_yor.yaml b/lm_eval/tasks/afrobench/openai_mmlu/prompt_1/openai_mmlu_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4124252bfc0b549160ac802f18c44004792d3bf2 --- /dev/null +++ b/lm_eval/tasks/afrobench/openai_mmlu/prompt_1/openai_mmlu_yor.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: YO_NG +doc_to_text: 'Q: {{Question.strip()}} + + A: {{A}} + + B: {{B}} + + C: {{C}} + + D: {{D}} + + Please choose the correct answer from the options above:' +include: openai_mmlu +task: openai_mmlu_yor_prompt_1 diff --git a/lm_eval/tasks/afrobench/openai_mmlu/prompt_2/openai_mmlu b/lm_eval/tasks/afrobench/openai_mmlu/prompt_2/openai_mmlu new file mode 100644 index 0000000000000000000000000000000000000000..9f39b0a9d7423b4d5638f23f294b636240570281 
--- /dev/null +++ b/lm_eval/tasks/afrobench/openai_mmlu/prompt_2/openai_mmlu @@ -0,0 +1,22 @@ +tag: + - openai_mmlu_tasks + - openai_mmlu_prompt_2 + - afrobench_mmlu_tasks +dataset_path: openai/MMMLU +output_type: multiple_choice +test_split: test +fewshot_config: + sampler: first_n +doc_to_target: "{{['A', 'B', 'C', 'D'].index(Answer.strip())}}" +should_decontaminate: true +doc_to_decontamination_query: "{{Question}}" +doc_to_choice: ["A", "B", "C", "D"] +metric_list: + - metric: acc + aggregation: mean + weight_by_size: true + - metric: acc_norm + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/openai_mmlu/prompt_2/openai_mmlu_ara.yaml b/lm_eval/tasks/afrobench/openai_mmlu/prompt_2/openai_mmlu_ara.yaml new file mode 100644 index 0000000000000000000000000000000000000000..550834257a69f7054ae397a403c6dc00d15c8888 --- /dev/null +++ b/lm_eval/tasks/afrobench/openai_mmlu/prompt_2/openai_mmlu_ara.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: AR_XY +doc_to_text: 'Question: {{Question.strip()}} + + 1: {{A}} + + 2: {{B}} + + 3: {{C}} + + 4: {{D}} + + Please select the correct answer from the given choices:' +include: openai_mmlu +task: openai_mmlu_ara_prompt_2 diff --git a/lm_eval/tasks/afrobench/openai_mmlu/prompt_2/openai_mmlu_swa.yaml b/lm_eval/tasks/afrobench/openai_mmlu/prompt_2/openai_mmlu_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9b3025fd726ab59f48ee90bb65b294580a6cfc3c --- /dev/null +++ b/lm_eval/tasks/afrobench/openai_mmlu/prompt_2/openai_mmlu_swa.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: SW_KE +doc_to_text: 'Question: {{Question.strip()}} + + 1: {{A}} + + 2: {{B}} + + 3: {{C}} + + 4: {{D}} + + Please select the correct answer from the given choices:' +include: openai_mmlu +task: openai_mmlu_swa_prompt_2 diff --git a/lm_eval/tasks/afrobench/openai_mmlu/prompt_2/openai_mmlu_yor.yaml 
b/lm_eval/tasks/afrobench/openai_mmlu/prompt_2/openai_mmlu_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..145b237ef50234278732605b1e3936bfccb9968a --- /dev/null +++ b/lm_eval/tasks/afrobench/openai_mmlu/prompt_2/openai_mmlu_yor.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: YO_NG +doc_to_text: 'Question: {{Question.strip()}} + + 1: {{A}} + + 2: {{B}} + + 3: {{C}} + + 4: {{D}} + + Please select the correct answer from the given choices:' +include: openai_mmlu +task: openai_mmlu_yor_prompt_2 diff --git a/lm_eval/tasks/afrobench/openai_mmlu/prompt_3/openai_mmlu b/lm_eval/tasks/afrobench/openai_mmlu/prompt_3/openai_mmlu new file mode 100644 index 0000000000000000000000000000000000000000..95456656739a2490a3e11037e7d9f67f72d60962 --- /dev/null +++ b/lm_eval/tasks/afrobench/openai_mmlu/prompt_3/openai_mmlu @@ -0,0 +1,23 @@ +tag: + - openai_mmlu_tasks + - openai_mmlu_prompt_3 + - afrobench_mmlu_tasks +dataset_path: openai/MMMLU +dataset_name: null +output_type: multiple_choice +test_split: test +fewshot_config: + sampler: first_n +doc_to_target: "{{['A', 'B', 'C', 'D'].index(Answer.strip())}}" +should_decontaminate: true +doc_to_decontamination_query: "{{Question}}" +doc_to_choice: ["A", "B", "C", "D"] +metric_list: + - metric: acc + aggregation: mean + weight_by_size: true + - metric: acc_norm + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/openai_mmlu/prompt_3/openai_mmlu_ara.yaml b/lm_eval/tasks/afrobench/openai_mmlu/prompt_3/openai_mmlu_ara.yaml new file mode 100644 index 0000000000000000000000000000000000000000..012192ceee6638f197db8ea8b9210e1529b6b92d --- /dev/null +++ b/lm_eval/tasks/afrobench/openai_mmlu/prompt_3/openai_mmlu_ara.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: AR_XY +doc_to_text: 'Input Question: {{Question.strip()}} + + Option A: {{A}} + + Option B: {{B}} + + Option C: {{C}} + + Option D: {{D}} + + Please indicate the correct option 
from the list above:' +include: openai_mmlu +task: openai_mmlu_ara_prompt_3 diff --git a/lm_eval/tasks/afrobench/openai_mmlu/prompt_3/openai_mmlu_swa.yaml b/lm_eval/tasks/afrobench/openai_mmlu/prompt_3/openai_mmlu_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..431bdb345178bf44b12ea01507cc805cd000113f --- /dev/null +++ b/lm_eval/tasks/afrobench/openai_mmlu/prompt_3/openai_mmlu_swa.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: SW_KE +doc_to_text: 'Input Question: {{Question.strip()}} + + Option A: {{A}} + + Option B: {{B}} + + Option C: {{C}} + + Option D: {{D}} + + Please indicate the correct option from the list above:' +include: openai_mmlu +task: openai_mmlu_swa_prompt_3 diff --git a/lm_eval/tasks/afrobench/openai_mmlu/prompt_3/openai_mmlu_yor.yaml b/lm_eval/tasks/afrobench/openai_mmlu/prompt_3/openai_mmlu_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..814fe380267e57da691f727198f2828042aa54c0 --- /dev/null +++ b/lm_eval/tasks/afrobench/openai_mmlu/prompt_3/openai_mmlu_yor.yaml @@ -0,0 +1,15 @@ +# Generated by utils.py +dataset_name: YO_NG +doc_to_text: 'Input Question: {{Question.strip()}} + + Option A: {{A}} + + Option B: {{B}} + + Option C: {{C}} + + Option D: {{D}} + + Please indicate the correct option from the list above:' +include: openai_mmlu +task: openai_mmlu_yor_prompt_3 diff --git a/lm_eval/tasks/afrobench/openai_mmlu/prompt_4/openai_mmlu b/lm_eval/tasks/afrobench/openai_mmlu/prompt_4/openai_mmlu new file mode 100644 index 0000000000000000000000000000000000000000..37a5949f93795737f8f61a06fc2824ebb671dbe2 --- /dev/null +++ b/lm_eval/tasks/afrobench/openai_mmlu/prompt_4/openai_mmlu @@ -0,0 +1,23 @@ +tag: + - openai_mmlu_tasks + - openai_mmlu_prompt_4 + - afrobench_mmlu_tasks +dataset_path: openai/MMMLU +dataset_name: null +output_type: multiple_choice +test_split: test +fewshot_config: + sampler: first_n +doc_to_target: "{{['A', 'B', 'C', 'D'].index(Answer.strip())}}" 
+should_decontaminate: true +doc_to_decontamination_query: "{{Question}}" +doc_to_choice: ["A", "B", "C", "D"] +metric_list: + - metric: acc + aggregation: mean + weight_by_size: true + - metric: acc_norm + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/openai_mmlu/prompt_4/openai_mmlu_ara.yaml b/lm_eval/tasks/afrobench/openai_mmlu/prompt_4/openai_mmlu_ara.yaml new file mode 100644 index 0000000000000000000000000000000000000000..793eb7441ce36573953525c3c97e60daffb10b02 --- /dev/null +++ b/lm_eval/tasks/afrobench/openai_mmlu/prompt_4/openai_mmlu_ara.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: AR_XY +doc_to_text: 'Critically analyze the question and select the most probable answer + from the list: + + {{Question.strip()}} + + Choices: + + A) {{A}} + + B) {{B}} + + C) {{C}} + + D) {{D}}' +include: openai_mmlu +task: openai_mmlu_ara_prompt_4 diff --git a/lm_eval/tasks/afrobench/openai_mmlu/prompt_4/openai_mmlu_swa.yaml b/lm_eval/tasks/afrobench/openai_mmlu/prompt_4/openai_mmlu_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..095dd7ff6d04db581bc070eff001d48600014e0e --- /dev/null +++ b/lm_eval/tasks/afrobench/openai_mmlu/prompt_4/openai_mmlu_swa.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: SW_KE +doc_to_text: 'Critically analyze the question and select the most probable answer + from the list: + + {{Question.strip()}} + + Choices: + + A) {{A}} + + B) {{B}} + + C) {{C}} + + D) {{D}}' +include: openai_mmlu +task: openai_mmlu_swa_prompt_4 diff --git a/lm_eval/tasks/afrobench/openai_mmlu/prompt_4/openai_mmlu_yor.yaml b/lm_eval/tasks/afrobench/openai_mmlu/prompt_4/openai_mmlu_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dd0a9daa1ed5a2ae9882225aefdfe5e653dffcc6 --- /dev/null +++ b/lm_eval/tasks/afrobench/openai_mmlu/prompt_4/openai_mmlu_yor.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: YO_NG +doc_to_text: 
'Critically analyze the question and select the most probable answer + from the list: + + {{Question.strip()}} + + Choices: + + A) {{A}} + + B) {{B}} + + C) {{C}} + + D) {{D}}' +include: openai_mmlu +task: openai_mmlu_yor_prompt_4 diff --git a/lm_eval/tasks/afrobench/openai_mmlu/prompt_5/openai_mmlu b/lm_eval/tasks/afrobench/openai_mmlu/prompt_5/openai_mmlu new file mode 100644 index 0000000000000000000000000000000000000000..77183eb04c0567b83f87bfd17bbdd18bf003f7dd --- /dev/null +++ b/lm_eval/tasks/afrobench/openai_mmlu/prompt_5/openai_mmlu @@ -0,0 +1,23 @@ +tag: + - openai_mmlu_tasks + - openai_mmlu_prompt_5 + - afrobench_mmlu_tasks +dataset_path: openai/MMMLU +dataset_name: null +output_type: multiple_choice +test_split: test +fewshot_config: + sampler: first_n +doc_to_target: "{{['A', 'B', 'C', 'D'].index(Answer.strip())}}" +should_decontaminate: true +doc_to_decontamination_query: "{{Question}}" +doc_to_choice: ["A", "B", "C", "D"] +metric_list: + - metric: acc + aggregation: mean + weight_by_size: true + - metric: acc_norm + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/openai_mmlu/prompt_5/openai_mmlu_ara.yaml b/lm_eval/tasks/afrobench/openai_mmlu/prompt_5/openai_mmlu_ara.yaml new file mode 100644 index 0000000000000000000000000000000000000000..50a6e74ff2198b326716b99a7430102d8aaf0221 --- /dev/null +++ b/lm_eval/tasks/afrobench/openai_mmlu/prompt_5/openai_mmlu_ara.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: AR_XY +doc_to_text: 'Answer the question and pick the correct answer from the options: {{Question.strip()}} + + Options: + + A. {{A}} + + B. {{B}} + + C. {{C}} + + D. 
{{D}} + + Please choose the correct option from the above list:' +include: openai_mmlu +task: openai_mmlu_ara_prompt_5 diff --git a/lm_eval/tasks/afrobench/openai_mmlu/prompt_5/openai_mmlu_swa.yaml b/lm_eval/tasks/afrobench/openai_mmlu/prompt_5/openai_mmlu_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c0cc19860cc5f7bc90d499a1ead811a549170eb6 --- /dev/null +++ b/lm_eval/tasks/afrobench/openai_mmlu/prompt_5/openai_mmlu_swa.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: SW_KE +doc_to_text: 'Answer the question and pick the correct answer from the options: {{Question.strip()}} + + Options: + + A. {{A}} + + B. {{B}} + + C. {{C}} + + D. {{D}} + + Please choose the correct option from the above list:' +include: openai_mmlu +task: openai_mmlu_swa_prompt_5 diff --git a/lm_eval/tasks/afrobench/openai_mmlu/prompt_5/openai_mmlu_yor.yaml b/lm_eval/tasks/afrobench/openai_mmlu/prompt_5/openai_mmlu_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..691657ef46974107e46c37291fb1efa66364a5b9 --- /dev/null +++ b/lm_eval/tasks/afrobench/openai_mmlu/prompt_5/openai_mmlu_yor.yaml @@ -0,0 +1,17 @@ +# Generated by utils.py +dataset_name: YO_NG +doc_to_text: 'Answer the question and pick the correct answer from the options: {{Question.strip()}} + + Options: + + A. {{A}} + + B. {{B}} + + C. {{C}} + + D. 
{{D}} + + Please choose the correct option from the above list:' +include: openai_mmlu +task: openai_mmlu_yor_prompt_5 diff --git a/lm_eval/tasks/afrobench/openai_mmlu/utils.py b/lm_eval/tasks/afrobench/openai_mmlu/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0fc0fea958c32b2b8d104f586045564c04de8c86 --- /dev/null +++ b/lm_eval/tasks/afrobench/openai_mmlu/utils.py @@ -0,0 +1,99 @@ +import argparse +import os + +import yaml + + +def prompt_func(mode, lang): + prompt_map = { + "prompt_1": "Q: {{Question.strip()}}\nA: {{A}}\nB: {{B}}\nC: {{C}}\nD: {{D}}\nPlease choose the correct answer from the options above:", + "prompt_2": "Question: {{Question.strip()}}\n1: {{A}}\n2: {{B}}\n3: {{C}}\n4: {{D}}\nPlease select the correct answer from the given choices:", + "prompt_3": "Input Question: {{Question.strip()}}\nOption A: {{A}}\nOption B: {{B}}\nOption C: {{C}}\nOption D: {{D}}\nPlease indicate the correct option from the list above:", + "prompt_4": "Critically analyze the question and select the most probable answer from the list:\n{{Question.strip()}}\nChoices:\nA) {{A}}\nB) {{B}}\nC) {{C}}\nD) {{D}}", + "prompt_5": "Answer the question and pick the correct answer from the options: {{Question.strip()}}\nOptions:\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nPlease choose the correct option from the above list:", + } + return prompt_map[mode] + + +def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: + """ + Generate a yaml file for each language. + + :param output_dir: The directory to output the files to. + :param overwrite: Whether to overwrite files if they already exist. 
+ """ + err = [] + languages = { + "ara": "Arabic", + "swa": "Swahili", + "yor": "Yoruba", + } + + lang2_code = { + "ara": "AR_XY", + "swa": "SW_KE", + "yor": "YO_NG", + } + + for lang in languages.keys(): + try: + file_name = f"openai_mmlu_{lang}.yaml" + task_name = f"openai_mmlu_{lang}_{mode}" + yaml_template = "openai_mmlu" + yaml_details = { + "include": yaml_template, + "task": task_name, + "dataset_name": lang2_code[lang], + "doc_to_text": prompt_func(mode, languages[lang]), + } + file_path = os.path.join(output_dir, mode) + os.makedirs(file_path, exist_ok=True) + + with open( + f"{output_dir}/{mode}/{file_name}", + "w" if overwrite else "x", + encoding="utf8", + ) as f: + f.write("# Generated by utils.py\n") + yaml.dump( + yaml_details, + f, + allow_unicode=True, + ) + except FileExistsError: + err.append(file_name) + + if len(err) > 0: + raise FileExistsError( + "Files were not created because they already exist (use --overwrite flag):" + f" {', '.join(err)}" + ) + + +def main() -> None: + """Parse CLI args and generate language-specific yaml files.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--overwrite", + default=True, + action="store_true", + help="Overwrite files if they already exist", + ) + parser.add_argument( + "--output-dir", + default="./", + help="Directory to write yaml files to", + ) + parser.add_argument( + "--mode", + default="prompt_1", + choices=["prompt_1", "prompt_2", "prompt_3", "prompt_4", "prompt_5"], + help="Prompt number", + ) + args = parser.parse_args() + + gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite, mode=args.mode) + + +if __name__ == "__main__": + main() diff --git a/lm_eval/tasks/afrobench/salt/README.md b/lm_eval/tasks/afrobench/salt/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3c5239a05e88cbfbadf6670f96d6ed621b0d805c --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/README.md @@ -0,0 +1,17 @@ +# + +## Paper +Title: `Sunbird African Language 
Technology (SALT) dataset` + +Paper Link: https://aclanthology.org/2023.emnlp-main.862/ + +## Abstract +>SALT is a multi-way parallel text and speech corpus of English and six languages widely spoken in Uganda and East Africa: Luganda, Lugbara, Acholi, Runyankole, Ateso and Swahili. The core of the dataset is a set of 25,000 sentences covering a range of topics of local relevance, such as agriculture, health and society. Each sentence is translated into all languages, to support machine translation, and speech recordings are made for approximately 5,000 of the sentences both by a variety of speakers in natural settings (suitable for ASR) and by professionals in a studio setting (suitable for text-to-speech). + +HomePage: https://github.com/SunbirdAI/salt + +### Publications + +Multilingual Model and Data Resources for Text-To-Speech in Ugandan Languages. Isaac Owomugisha, Benjamin Akera, Ernest Tonny Mwebaze, John Quinn. 4th Workshop on African Natural Language Processing, 2023. [pdf](https://openreview.net/pdf?id=vaxG0WAPzL) + +Machine Translation For African Languages: Community Creation Of Datasets And Models In Uganda. Benjamin Akera, Jonathan Mukiibi, Lydia Sanyu Naggayi, Claire Babirye, Isaac Owomugisha, Solomon Nsumba, Joyce Nakatumba-Nabende, Engineer Bainomugisha, Ernest Mwebaze, John Quinn. 3rd Workshop on African Natural Language Processing, 2022. 
[pdf](https://openreview.net/pdf?id=BK-z5qzEU-9) diff --git a/lm_eval/tasks/afrobench/salt/gen_utils.py b/lm_eval/tasks/afrobench/salt/gen_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6ac703a0d5d0912d38fb624dbba967ed3ffdb734 --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/gen_utils.py @@ -0,0 +1,149 @@ +import argparse +import os + +import yaml + + +class FunctionTag: + def __init__(self, value): + self.value = value + + +def prompt_func(mode, lang, lang_dict): + language_column_name = f"{lang}_text" + prompt_map = { + "prompt_1": f"{lang_dict[lang]} sentence: {{{{{language_column_name}}}}} \nEnglish sentence: ", + "prompt_1_reverse": "English sentence: {{eng_source_text}} " + f"\n{lang_dict[lang]} sentence: ", + "prompt_2": f"You are a translation expert. Translate the following {lang_dict[lang]} sentences to English \n" + f"{lang_dict[lang]} sentence: {{{{{language_column_name}}}}}\nEnglish sentence: ", + "prompt_2_reverse": f"You are a translation expert. Translate the following English sentences to " + f"{lang_dict[lang]} " + "\nEnglish sentence: {{eng_source_text}} " + f"\n{lang_dict[lang]} sentence: ", + "prompt_3": f"As a {lang_dict[lang]} and English linguist, translate the following {lang_dict[lang]} sentences " + f"to English. \n{lang_dict[lang]} sentence: {{{{{language_column_name}}}}}\nEnglish sentence: ", + "prompt_3_reverse": f"As a {lang_dict[lang]} and English linguist, translate the following English sentences to " + f"{lang_dict[lang]}. " + "\nEnglish sentence: {{eng_source_text}} " + f"\n{lang_dict[lang]} sentence: ", + } + return prompt_map[mode] + + +def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str, reverse: bool) -> None: + """ + Generate a yaml file for each language. + + :param output_dir: The directory to output the files to. + :param overwrite: Whether to overwrite files if they already exist. 
+ """ + err = [] + languages = { + "eng": "English", + "lug": "Luganda", + "ach": "Acholi", + "lgg": "Lugbara", + "teo": "Ateso", + "nyn": "Runyankole", + "swa": "Swahili", + "ibo": "Igbo", + } + + for lang in languages.keys(): + try: + if lang != "eng": + if not reverse: + file_name = f"salt_{lang}-eng.yaml" + task_name = f"salt_{lang}-eng_{mode}" + yaml_template = "salt" + yaml_details = { + "include": yaml_template, + "task": task_name, + "dataset_name": "text-all", + "doc_to_target": "eng_target_text", + "doc_to_text": prompt_func(mode, lang, languages), + } + os.makedirs(f"{output_dir}/{mode}", exist_ok=True) + with open( + f"{output_dir}/{mode}/{file_name}", + "w" if overwrite else "x", + encoding="utf8", + ) as f: + f.write("# Generated by utils.py\n") + yaml.dump( + yaml_details, + f, + allow_unicode=True, + ) + else: + file_name = f"salt_eng-{lang}.yaml" + task_name = f"salt_eng-{lang}_{mode}" + yaml_template = "salt" + yaml_details = { + "include": yaml_template, + "task": task_name, + "dataset_name": "text-all", + "doc_to_target": f"{lang}_text", + "doc_to_text": prompt_func(f"{mode}_reverse", lang, languages), + } + os.makedirs(f"{output_dir}/{mode}", exist_ok=True) + with open( + f"{output_dir}/{mode}/{file_name}", + "w" if overwrite else "x", + encoding="utf8", + ) as f: + f.write("# Generated by utils.py\n") + yaml.dump( + yaml_details, + f, + allow_unicode=True, + ) + except FileExistsError: + err.append(file_name) + + if len(err) > 0: + raise FileExistsError( + "Files were not created because they already exist (use --overwrite flag):" + f" {', '.join(err)}" + ) + + +def main() -> None: + """Parse CLI args and generate language-specific yaml files.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--overwrite", + default=True, + action="store_true", + help="Overwrite files if they already exist", + ) + parser.add_argument( + "--output-dir", + default="./", + help="Directory to write yaml files to", + ) + parser.add_argument( + 
"--mode", + default="prompt_1", + choices=["prompt_1", "prompt_2", "prompt_3"], + help="Prompt number", + ) + parser.add_argument( + "--reverse", + default=True, + choices=[True, False], + help="Reverse the translation direction", + ) + args = parser.parse_args() + + gen_lang_yamls( + output_dir=args.output_dir, + overwrite=args.overwrite, + mode=args.mode, + reverse=args.reverse, + ) + + +if __name__ == "__main__": + main() diff --git a/lm_eval/tasks/afrobench/salt/prompt_1/salt b/lm_eval/tasks/afrobench/salt/prompt_1/salt new file mode 100644 index 0000000000000000000000000000000000000000..a07d434a8bfb5e4c85abef6fe556e648c6fe5a00 --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_1/salt @@ -0,0 +1,24 @@ +tag: +- salt_tasks +- salt_prompt_1 +- afrobench_MT_tasks +dataset_path: Sunbird/salt +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +validation_split: dev +fewshot_split: dev +test_split: test +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/salt/prompt_1/salt_ach-eng.yaml b/lm_eval/tasks/afrobench/salt/prompt_1/salt_ach-eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..41731279817637401307fc9f55ecd96cd2a80794 --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_1/salt_ach-eng.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: eng_target_text +doc_to_text: "Acholi sentence: {{ach_text}} \nEnglish sentence: " +include: salt +task: salt_ach-eng_prompt_1 diff --git a/lm_eval/tasks/afrobench/salt/prompt_1/salt_eng-ach.yaml b/lm_eval/tasks/afrobench/salt/prompt_1/salt_eng-ach.yaml new file mode 100644 index 0000000000000000000000000000000000000000..219e5780634f4812157ea6d2ad70b7b22e72ae49 --- /dev/null +++ 
b/lm_eval/tasks/afrobench/salt/prompt_1/salt_eng-ach.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: ach_text +doc_to_text: "English sentence: {{eng_source_text}} \nAcholi sentence: " +include: salt +task: salt_eng-ach_prompt_1 diff --git a/lm_eval/tasks/afrobench/salt/prompt_1/salt_eng-ibo.yaml b/lm_eval/tasks/afrobench/salt/prompt_1/salt_eng-ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f90220591f5f7047da6d488740c759c850a95b1e --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_1/salt_eng-ibo.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: ibo_text +doc_to_text: "English sentence: {{eng_source_text}} \nIgbo sentence: " +include: salt +task: salt_eng-ibo_prompt_1 diff --git a/lm_eval/tasks/afrobench/salt/prompt_1/salt_eng-lgg.yaml b/lm_eval/tasks/afrobench/salt/prompt_1/salt_eng-lgg.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a038ddb39eb1b171be9e5631e129995ceeed64e --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_1/salt_eng-lgg.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: lgg_text +doc_to_text: "English sentence: {{eng_source_text}} \nLugbara sentence: " +include: salt +task: salt_eng-lgg_prompt_1 diff --git a/lm_eval/tasks/afrobench/salt/prompt_1/salt_eng-lug.yaml b/lm_eval/tasks/afrobench/salt/prompt_1/salt_eng-lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4539913786124aec4ea68f16538989a91131ca44 --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_1/salt_eng-lug.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: lug_text +doc_to_text: "English sentence: {{eng_source_text}} \nLuganda sentence: " +include: salt +task: salt_eng-lug_prompt_1 diff --git a/lm_eval/tasks/afrobench/salt/prompt_1/salt_eng-nyn.yaml b/lm_eval/tasks/afrobench/salt/prompt_1/salt_eng-nyn.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..448e1101d681d4f31bde8c81418d4f2f64b6eb13 --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_1/salt_eng-nyn.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: nyn_text +doc_to_text: "English sentence: {{eng_source_text}} \nRunyankole sentence: " +include: salt +task: salt_eng-nyn_prompt_1 diff --git a/lm_eval/tasks/afrobench/salt/prompt_1/salt_eng-swa.yaml b/lm_eval/tasks/afrobench/salt/prompt_1/salt_eng-swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..792b4840c2551627b66008fdd2c172e3660cc914 --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_1/salt_eng-swa.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: swa_text +doc_to_text: "English sentence: {{eng_source_text}} \nSwahili sentence: " +include: salt +task: salt_eng-swa_prompt_1 diff --git a/lm_eval/tasks/afrobench/salt/prompt_1/salt_eng-teo.yaml b/lm_eval/tasks/afrobench/salt/prompt_1/salt_eng-teo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..810626c6a5ddf8525d45344b5a5eb7a2d65ab34e --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_1/salt_eng-teo.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: teo_text +doc_to_text: "English sentence: {{eng_source_text}} \nAteso sentence: " +include: salt +task: salt_eng-teo_prompt_1 diff --git a/lm_eval/tasks/afrobench/salt/prompt_1/salt_ibo-eng.yaml b/lm_eval/tasks/afrobench/salt/prompt_1/salt_ibo-eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0a98c8648081bc8c3e1fd1c897c41212701f36fa --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_1/salt_ibo-eng.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: eng_target_text +doc_to_text: "Igbo sentence: {{ibo_text}} \nEnglish sentence: " +include: salt +task: salt_ibo-eng_prompt_1 diff --git 
a/lm_eval/tasks/afrobench/salt/prompt_1/salt_lgg-eng.yaml b/lm_eval/tasks/afrobench/salt/prompt_1/salt_lgg-eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c8e281ac3189dbaada8d21fbd4896a0c8478dbc6 --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_1/salt_lgg-eng.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: eng_target_text +doc_to_text: "Lugbara sentence: {{lgg_text}} \nEnglish sentence: " +include: salt +task: salt_lgg-eng_prompt_1 diff --git a/lm_eval/tasks/afrobench/salt/prompt_1/salt_lug-eng.yaml b/lm_eval/tasks/afrobench/salt/prompt_1/salt_lug-eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f924d5c50f82e1dbbf6be1dd4a138d2c5d61c5ac --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_1/salt_lug-eng.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: eng_target_text +doc_to_text: "Luganda sentence: {{lug_text}} \nEnglish sentence: " +include: salt +task: salt_lug-eng_prompt_1 diff --git a/lm_eval/tasks/afrobench/salt/prompt_1/salt_nyn-eng.yaml b/lm_eval/tasks/afrobench/salt/prompt_1/salt_nyn-eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bd9363614648969391f20deb49fd2a92afdcfede --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_1/salt_nyn-eng.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: eng_target_text +doc_to_text: "Runyankole sentence: {{nyn_text}} \nEnglish sentence: " +include: salt +task: salt_nyn-eng_prompt_1 diff --git a/lm_eval/tasks/afrobench/salt/prompt_1/salt_swa-eng.yaml b/lm_eval/tasks/afrobench/salt/prompt_1/salt_swa-eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c2308593e3d54e222d7543403f214dba76719a80 --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_1/salt_swa-eng.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: eng_target_text +doc_to_text: "Swahili 
sentence: {{swa_text}} \nEnglish sentence: " +include: salt +task: salt_swa-eng_prompt_1 diff --git a/lm_eval/tasks/afrobench/salt/prompt_1/salt_teo-eng.yaml b/lm_eval/tasks/afrobench/salt/prompt_1/salt_teo-eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6efb4ea0807a9a66eb84797503b6bc4762777fd0 --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_1/salt_teo-eng.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: eng_target_text +doc_to_text: "Ateso sentence: {{teo_text}} \nEnglish sentence: " +include: salt +task: salt_teo-eng_prompt_1 diff --git a/lm_eval/tasks/afrobench/salt/prompt_2/salt b/lm_eval/tasks/afrobench/salt/prompt_2/salt new file mode 100644 index 0000000000000000000000000000000000000000..66355878cbb8354261bd426623d29589ce93383a --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_2/salt @@ -0,0 +1,24 @@ +tag: +- salt_tasks +- salt_prompt_2 +- afrobench_MT_tasks +dataset_path: Sunbird/salt +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +validation_split: dev +fewshot_split: dev +test_split: test +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/salt/prompt_2/salt_ach-eng.yaml b/lm_eval/tasks/afrobench/salt/prompt_2/salt_ach-eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dda717b7942cb37c7f6d821070572cd302717639 --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_2/salt_ach-eng.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: eng_target_text +doc_to_text: "You are a translation expert. 
Translate the following Acholi sentences\ + \ to English \nAcholi sentence: {{ach_text}}\nEnglish sentence: " +include: salt +task: salt_ach-eng_prompt_2 diff --git a/lm_eval/tasks/afrobench/salt/prompt_2/salt_eng-ach.yaml b/lm_eval/tasks/afrobench/salt/prompt_2/salt_eng-ach.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1e4a72a5116a41a7d7950cfed80cbd826a37a0dc --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_2/salt_eng-ach.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: ach_text +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Acholi \nEnglish sentence: {{eng_source_text}} \nAcholi sentence: " +include: salt +task: salt_eng-ach_prompt_2 diff --git a/lm_eval/tasks/afrobench/salt/prompt_2/salt_eng-ibo.yaml b/lm_eval/tasks/afrobench/salt/prompt_2/salt_eng-ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..04649c1287e599a2ecdf376b4b30bc86700dcaca --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_2/salt_eng-ibo.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: ibo_text +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Igbo \nEnglish sentence: {{eng_source_text}} \nIgbo sentence: " +include: salt +task: salt_eng-ibo_prompt_2 diff --git a/lm_eval/tasks/afrobench/salt/prompt_2/salt_eng-lgg.yaml b/lm_eval/tasks/afrobench/salt/prompt_2/salt_eng-lgg.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0ac6becbcb7b10890cb1b2cd56dbe43c23742683 --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_2/salt_eng-lgg.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: lgg_text +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Lugbara \nEnglish sentence: {{eng_source_text}} \nLugbara sentence: " +include: salt +task: salt_eng-lgg_prompt_2 diff --git a/lm_eval/tasks/afrobench/salt/prompt_2/salt_eng-lug.yaml b/lm_eval/tasks/afrobench/salt/prompt_2/salt_eng-lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1b5f6399cf6ddc5276fb48545e4ad3d1e0e4ab1f --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_2/salt_eng-lug.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: lug_text +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Luganda \nEnglish sentence: {{eng_source_text}} \nLuganda sentence: " +include: salt +task: salt_eng-lug_prompt_2 diff --git a/lm_eval/tasks/afrobench/salt/prompt_2/salt_eng-nyn.yaml b/lm_eval/tasks/afrobench/salt/prompt_2/salt_eng-nyn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..84452d5aed07b2fe13d6836a7656ff85dfa2ae8e --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_2/salt_eng-nyn.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: nyn_text +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Runyankole \nEnglish sentence: {{eng_source_text}} \nRunyankole sentence: " +include: salt +task: salt_eng-nyn_prompt_2 diff --git a/lm_eval/tasks/afrobench/salt/prompt_2/salt_eng-swa.yaml b/lm_eval/tasks/afrobench/salt/prompt_2/salt_eng-swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..523db9fb7e913dff30b80d28ca13b8c613653ad6 --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_2/salt_eng-swa.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: swa_text +doc_to_text: "You are a translation expert. 
Translate the following English sentences\ + \ to Swahili \nEnglish sentence: {{eng_source_text}} \nSwahili sentence: " +include: salt +task: salt_eng-swa_prompt_2 diff --git a/lm_eval/tasks/afrobench/salt/prompt_2/salt_eng-teo.yaml b/lm_eval/tasks/afrobench/salt/prompt_2/salt_eng-teo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..000e8d043bb1897c5647480d6584191181b45c68 --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_2/salt_eng-teo.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: teo_text +doc_to_text: "You are a translation expert. Translate the following English sentences\ + \ to Ateso \nEnglish sentence: {{eng_source_text}} \nAteso sentence: " +include: salt +task: salt_eng-teo_prompt_2 diff --git a/lm_eval/tasks/afrobench/salt/prompt_2/salt_ibo-eng.yaml b/lm_eval/tasks/afrobench/salt/prompt_2/salt_ibo-eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b4ec6601af313b25606c05752715a3dfadf1476e --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_2/salt_ibo-eng.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: eng_target_text +doc_to_text: "You are a translation expert. Translate the following Igbo sentences\ + \ to English \nIgbo sentence: {{ibo_text}}\nEnglish sentence: " +include: salt +task: salt_ibo-eng_prompt_2 diff --git a/lm_eval/tasks/afrobench/salt/prompt_2/salt_lgg-eng.yaml b/lm_eval/tasks/afrobench/salt/prompt_2/salt_lgg-eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2d802c0faa99f895605e00c25aea3197b0fad7d2 --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_2/salt_lgg-eng.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: eng_target_text +doc_to_text: "You are a translation expert. 
Translate the following Lugbara sentences\ + \ to English \nLugbara sentence: {{lgg_text}}\nEnglish sentence: " +include: salt +task: salt_lgg-eng_prompt_2 diff --git a/lm_eval/tasks/afrobench/salt/prompt_2/salt_lug-eng.yaml b/lm_eval/tasks/afrobench/salt/prompt_2/salt_lug-eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..521bbf15c008670a0d71b671be84e58b9ca7290b --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_2/salt_lug-eng.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: eng_target_text +doc_to_text: "You are a translation expert. Translate the following Luganda sentences\ + \ to English \nLuganda sentence: {{lug_text}}\nEnglish sentence: " +include: salt +task: salt_lug-eng_prompt_2 diff --git a/lm_eval/tasks/afrobench/salt/prompt_2/salt_nyn-eng.yaml b/lm_eval/tasks/afrobench/salt/prompt_2/salt_nyn-eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4cc4abfc26505ada2abb05775a6b4b43c67fb139 --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_2/salt_nyn-eng.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: eng_target_text +doc_to_text: "You are a translation expert. Translate the following Runyankole sentences\ + \ to English \nRunyankole sentence: {{nyn_text}}\nEnglish sentence: " +include: salt +task: salt_nyn-eng_prompt_2 diff --git a/lm_eval/tasks/afrobench/salt/prompt_2/salt_swa-eng.yaml b/lm_eval/tasks/afrobench/salt/prompt_2/salt_swa-eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9e80b9087df91df24f619c61c61c553decfcb1bf --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_2/salt_swa-eng.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: eng_target_text +doc_to_text: "You are a translation expert. 
Translate the following Swahili sentences\ + \ to English \nSwahili sentence: {{swa_text}}\nEnglish sentence: " +include: salt +task: salt_swa-eng_prompt_2 diff --git a/lm_eval/tasks/afrobench/salt/prompt_2/salt_teo-eng.yaml b/lm_eval/tasks/afrobench/salt/prompt_2/salt_teo-eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d0b0d516de9ae00758cd9fccb45c84d65eb069bc --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_2/salt_teo-eng.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: eng_target_text +doc_to_text: "You are a translation expert. Translate the following Ateso sentences\ + \ to English \nAteso sentence: {{teo_text}}\nEnglish sentence: " +include: salt +task: salt_teo-eng_prompt_2 diff --git a/lm_eval/tasks/afrobench/salt/prompt_3/salt b/lm_eval/tasks/afrobench/salt/prompt_3/salt new file mode 100644 index 0000000000000000000000000000000000000000..51dac9c53b42569b2b5c7f19a5b9fa6b83fc68e4 --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_3/salt @@ -0,0 +1,24 @@ +tag: +- salt_tasks +- salt_prompt_3 +- afrobench_MT_tasks +dataset_path: Sunbird/salt +dataset_kwargs: {trust_remote_code: True} +output_type: generate_until +validation_split: dev +fewshot_split: dev +test_split: test +metric_list: + - metric: bleu + aggregation: bleu + higher_is_better: true + - metric: chrf + aggregation: chrf + higher_is_better: true +generation_kwargs: + until: + - "\n" + do_sample: false + temperature: 0.0 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/salt/prompt_3/salt_ach-eng.yaml b/lm_eval/tasks/afrobench/salt/prompt_3/salt_ach-eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c198a59f843447475f221823361f7ddf919419c3 --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_3/salt_ach-eng.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: eng_target_text +doc_to_text: "As a Acholi and English linguist, translate the following 
Acholi sentences\ + \ to English. \nAcholi sentence: {{ach_text}}\nEnglish sentence: " +include: salt +task: salt_ach-eng_prompt_3 diff --git a/lm_eval/tasks/afrobench/salt/prompt_3/salt_eng-ach.yaml b/lm_eval/tasks/afrobench/salt/prompt_3/salt_eng-ach.yaml new file mode 100644 index 0000000000000000000000000000000000000000..636a77d8606343d9de230547b958f8e49b448b5c --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_3/salt_eng-ach.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: ach_text +doc_to_text: "As a Acholi and English linguist, translate the following English sentences\ + \ to Acholi. \nEnglish sentence: {{eng_source_text}} \nAcholi sentence: " +include: salt +task: salt_eng-ach_prompt_3 diff --git a/lm_eval/tasks/afrobench/salt/prompt_3/salt_eng-ibo.yaml b/lm_eval/tasks/afrobench/salt/prompt_3/salt_eng-ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..44d015d6ca9db85477a082c687a46c7e46276068 --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_3/salt_eng-ibo.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: ibo_text +doc_to_text: "As a Igbo and English linguist, translate the following English sentences\ + \ to Igbo. \nEnglish sentence: {{eng_source_text}} \nIgbo sentence: " +include: salt +task: salt_eng-ibo_prompt_3 diff --git a/lm_eval/tasks/afrobench/salt/prompt_3/salt_eng-lgg.yaml b/lm_eval/tasks/afrobench/salt/prompt_3/salt_eng-lgg.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8f1e6f43ba7783c2b521ee3a0caeec1d0904790e --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_3/salt_eng-lgg.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: lgg_text +doc_to_text: "As a Lugbara and English linguist, translate the following English sentences\ + \ to Lugbara. 
\nEnglish sentence: {{eng_source_text}} \nLugbara sentence: " +include: salt +task: salt_eng-lgg_prompt_3 diff --git a/lm_eval/tasks/afrobench/salt/prompt_3/salt_eng-lug.yaml b/lm_eval/tasks/afrobench/salt/prompt_3/salt_eng-lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e2065c30df12a680ca08b218ce3e842324313da4 --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_3/salt_eng-lug.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: lug_text +doc_to_text: "As a Luganda and English linguist, translate the following English sentences\ + \ to Luganda. \nEnglish sentence: {{eng_source_text}} \nLuganda sentence: " +include: salt +task: salt_eng-lug_prompt_3 diff --git a/lm_eval/tasks/afrobench/salt/prompt_3/salt_eng-nyn.yaml b/lm_eval/tasks/afrobench/salt/prompt_3/salt_eng-nyn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9e48970a8ccb136d4328598224c370076949954b --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_3/salt_eng-nyn.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: nyn_text +doc_to_text: "As a Runyankole and English linguist, translate the following English\ + \ sentences to Runyankole. \nEnglish sentence: {{eng_source_text}} \nRunyankole\ + \ sentence: " +include: salt +task: salt_eng-nyn_prompt_3 diff --git a/lm_eval/tasks/afrobench/salt/prompt_3/salt_eng-swa.yaml b/lm_eval/tasks/afrobench/salt/prompt_3/salt_eng-swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cfd3f8eadb1ca0ff898595c897a3eebde72f08a8 --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_3/salt_eng-swa.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: swa_text +doc_to_text: "As a Swahili and English linguist, translate the following English sentences\ + \ to Swahili. 
\nEnglish sentence: {{eng_source_text}} \nSwahili sentence: " +include: salt +task: salt_eng-swa_prompt_3 diff --git a/lm_eval/tasks/afrobench/salt/prompt_3/salt_eng-teo.yaml b/lm_eval/tasks/afrobench/salt/prompt_3/salt_eng-teo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f8d280bb41808f8af50287861fa1131b92295e70 --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_3/salt_eng-teo.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: teo_text +doc_to_text: "As a Ateso and English linguist, translate the following English sentences\ + \ to Ateso. \nEnglish sentence: {{eng_source_text}} \nAteso sentence: " +include: salt +task: salt_eng-teo_prompt_3 diff --git a/lm_eval/tasks/afrobench/salt/prompt_3/salt_ibo-eng.yaml b/lm_eval/tasks/afrobench/salt/prompt_3/salt_ibo-eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..13be699cb1dc1255939321205d25921625cdb140 --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_3/salt_ibo-eng.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: eng_target_text +doc_to_text: "As a Igbo and English linguist, translate the following Igbo sentences\ + \ to English. \nIgbo sentence: {{ibo_text}}\nEnglish sentence: " +include: salt +task: salt_ibo-eng_prompt_3 diff --git a/lm_eval/tasks/afrobench/salt/prompt_3/salt_lgg-eng.yaml b/lm_eval/tasks/afrobench/salt/prompt_3/salt_lgg-eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7aa4ffc442c41ea9abd148257b1e49524173eca5 --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_3/salt_lgg-eng.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: eng_target_text +doc_to_text: "As a Lugbara and English linguist, translate the following Lugbara sentences\ + \ to English. 
\nLugbara sentence: {{lgg_text}}\nEnglish sentence: " +include: salt +task: salt_lgg-eng_prompt_3 diff --git a/lm_eval/tasks/afrobench/salt/prompt_3/salt_lug-eng.yaml b/lm_eval/tasks/afrobench/salt/prompt_3/salt_lug-eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..da505f6d7589d9a7bba4ea7be1c73134fc562a20 --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_3/salt_lug-eng.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: eng_target_text +doc_to_text: "As a Luganda and English linguist, translate the following Luganda sentences\ + \ to English. \nLuganda sentence: {{lug_text}}\nEnglish sentence: " +include: salt +task: salt_lug-eng_prompt_3 diff --git a/lm_eval/tasks/afrobench/salt/prompt_3/salt_nyn-eng.yaml b/lm_eval/tasks/afrobench/salt/prompt_3/salt_nyn-eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9edba7c495369e1849106e854100a65d0bda9ee5 --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_3/salt_nyn-eng.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: eng_target_text +doc_to_text: "As a Runyankole and English linguist, translate the following Runyankole\ + \ sentences to English. \nRunyankole sentence: {{nyn_text}}\nEnglish sentence: " +include: salt +task: salt_nyn-eng_prompt_3 diff --git a/lm_eval/tasks/afrobench/salt/prompt_3/salt_swa-eng.yaml b/lm_eval/tasks/afrobench/salt/prompt_3/salt_swa-eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3d01c9170c602c7eebdc3b0a5c216d5bdd4bc52a --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_3/salt_swa-eng.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: eng_target_text +doc_to_text: "As a Swahili and English linguist, translate the following Swahili sentences\ + \ to English. 
\nSwahili sentence: {{swa_text}}\nEnglish sentence: " +include: salt +task: salt_swa-eng_prompt_3 diff --git a/lm_eval/tasks/afrobench/salt/prompt_3/salt_teo-eng.yaml b/lm_eval/tasks/afrobench/salt/prompt_3/salt_teo-eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c81336cac58f12d6dd2118315a6cdb64a913a2af --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/prompt_3/salt_teo-eng.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: text-all +doc_to_target: eng_target_text +doc_to_text: "As a Ateso and English linguist, translate the following Ateso sentences\ + \ to English. \nAteso sentence: {{teo_text}}\nEnglish sentence: " +include: salt +task: salt_teo-eng_prompt_3 diff --git a/lm_eval/tasks/afrobench/salt/salt.yaml b/lm_eval/tasks/afrobench/salt/salt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..edd3070d8ba2c24b651038ca7408a38b45e00da3 --- /dev/null +++ b/lm_eval/tasks/afrobench/salt/salt.yaml @@ -0,0 +1,11 @@ +group: salt +task: + - salt_prompt_1 + - salt_prompt_2 + - salt_prompt_3 +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 1 diff --git a/lm_eval/tasks/afrobench/sample_run_scripts/run_afrobench.sh b/lm_eval/tasks/afrobench/sample_run_scripts/run_afrobench.sh new file mode 100644 index 0000000000000000000000000000000000000000..886c94956cc8204ce9fda69e912cec91424a3d92 --- /dev/null +++ b/lm_eval/tasks/afrobench/sample_run_scripts/run_afrobench.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +batch_size=5 +num_fewshot=0 + +export CUDA_VISIBLE_DEVICES=0,1 + +model_names=( + "google/gemma-1.1-7b-it" + "google/gemma-2-9b-it" + "google/gemma-2-27b-it" + "Jacaranda/AfroLlama_V1" + "LLaMAX/LLaMAX3-8B-Alpaca" + "meta-llama/Llama-2-7b-chat-hf" + "meta-llama/Llama-3.1-8B-Instruct" + "meta-llama/Llama-3.1-70B-Instruct" + "meta-llama/Meta-Llama-3-8B-Instruct" + "CohereForAI/aya-101" +) + +for model_name in "${model_names[@]}" +do + echo "Running model: 
$model_name" + lm_eval --model hf \ + --model_args pretrained=${model_name},parallelize=true \ + --tasks afrobench\ + --batch_size ${batch_size} \ + --num_fewshot ${num_fewshot} \ + --verbosity DEBUG \ + --output_path 'path_to_results/' \ + --log_samples +done diff --git a/lm_eval/tasks/afrobench/sample_run_scripts/run_afrobench_lite.sh b/lm_eval/tasks/afrobench/sample_run_scripts/run_afrobench_lite.sh new file mode 100644 index 0000000000000000000000000000000000000000..89291faadb97fa9267d09be80e81a7b480aabcb5 --- /dev/null +++ b/lm_eval/tasks/afrobench/sample_run_scripts/run_afrobench_lite.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +batch_size=5 +num_fewshot=0 + +export CUDA_VISIBLE_DEVICES=0,1 + +model_names=( + "google/gemma-1.1-7b-it" + "google/gemma-2-9b-it" + "google/gemma-2-27b-it" + "Jacaranda/AfroLlama_V1" + "LLaMAX/LLaMAX3-8B-Alpaca" + "meta-llama/Llama-2-7b-chat-hf" + "meta-llama/Llama-3.1-8B-Instruct" + "meta-llama/Llama-3.1-70B-Instruct" + "meta-llama/Meta-Llama-3-8B-Instruct" + "CohereForAI/aya-101" +) + +for model_name in "${model_names[@]}" +do + echo "Running model: $model_name" + lm_eval --model hf \ + --model_args pretrained=${model_name},parallelize=true \ + --tasks afrobench_lite\ + --batch_size ${batch_size} \ + --num_fewshot ${num_fewshot} \ + --verbosity DEBUG \ + --output_path 'path_to_results/' \ + --log_samples +done diff --git a/lm_eval/tasks/afrobench/sib/README.md b/lm_eval/tasks/afrobench/sib/README.md new file mode 100644 index 0000000000000000000000000000000000000000..732db84b0eb6ad373442692b221e7f97e18e112a --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/README.md @@ -0,0 +1,37 @@ +# + +## Paper +Title: `SIB-200: A Simple, Inclusive, and Big Evaluation Dataset for Topic Classification in 200+ Languages and Dialects` + +Paper Link: https://aclanthology.org/2024.eacl-long.14/ + +## Abstract +>Despite the progress in building multilingual language models, evaluation is often limited to a few languages with available datasets which 
excludes a large number of low-resource languages. In this paper, we create SIB-200—a large-scale open-sourced benchmark dataset for topic classification in 205 languages and dialects to address the lack of evaluation dataset for Natural Language Understanding (NLU). For many of the languages covered in SIB-200, this is the first publicly available evaluation dataset for NLU. The dataset is based on Flores-200 machine translation corpus. We annotated the English portion of the dataset and extended the sentence-level annotation to the remaining 204 languages covered in the corpus. Despite the simplicity of this task, our evaluation in full-supervised setting, cross-lingual transfer setting and prompting of large language model setting show that there is still a large gap between the performance of high-resource and low-resource languages when multilingual evaluation is scaled to numerous world languages. We found that languages unseen during the pre-training of multilingual language models, languages from under-represented families (like Nilotic and Altantic-Congo), and languages from the regions of Africa, Americas, Oceania and South East Asia, often have the lowest performance on our topic classification dataset. We hope our dataset %will encourages a more inclusive evaluation of multilingual language models on a more diverse set of languages. + +HomePage: https://github.com/dadelani/sib-200 + +### Citation + +``` +@inproceedings{adelani-etal-2024-sib, + title = "{SIB}-200: A Simple, Inclusive, and Big Evaluation Dataset for Topic Classification in 200+ Languages and Dialects", + author = "Adelani, David Ifeoluwa and + Liu, Hannah and + Shen, Xiaoyu and + Vassilyev, Nikita and + Alabi, Jesujoba O. 
and + Mao, Yanke and + Gao, Haonan and + Lee, En-Shiun Annie", + editor = "Graham, Yvette and + Purver, Matthew", + booktitle = "Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)", + month = mar, + year = "2024", + address = "St. Julian{'}s, Malta", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2024.eacl-long.14/", + pages = "226--245", + abstract = "Despite the progress in building multilingual language models, evaluation is often limited to a few languages with available datasets which excludes a large number of low-resource languages. In this paper, we create SIB-200{---}a large-scale open-sourced benchmark dataset for topic classification in 205 languages and dialects to address the lack of evaluation dataset for Natural Language Understanding (NLU). For many of the languages covered in SIB-200, this is the first publicly available evaluation dataset for NLU. The dataset is based on Flores-200 machine translation corpus. We annotated the English portion of the dataset and extended the sentence-level annotation to the remaining 204 languages covered in the corpus. Despite the simplicity of this task, our evaluation in full-supervised setting, cross-lingual transfer setting and prompting of large language model setting show that there is still a large gap between the performance of high-resource and low-resource languages when multilingual evaluation is scaled to numerous world languages. We found that languages unseen during the pre-training of multilingual language models, languages from under-represented families (like Nilotic and Altantic-Congo), and languages from the regions of Africa, Americas, Oceania and South East Asia, often have the lowest performance on our topic classification dataset. We hope our dataset {\%}will encourages a more inclusive evaluation of multilingual language models on a more diverse set of languages." 
+} +``` diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib b/lm_eval/tasks/afrobench/sib/prompt_1/sib new file mode 100644 index 0000000000000000000000000000000000000000..37fda5d192dc8b4e1aa115d66858876e6bca3bda --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib @@ -0,0 +1,43 @@ +tag: + - sib_tasks + - sib_prompt_1 + - afrobench_TC_tasks +dataset_path: Davlan/sib200 +dataset_name: null +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_target: category +doc_to_choice: + - "science/technology" + - "travel" + - "politics" + - "sports" + - "health" + - "entertainment" + - "geography" +should_decontaminate: true +doc_to_decontamination_query: text +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_aeb.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_aeb.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d4116035df2599f79d31293b25abf43191943abd --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_aeb.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: aeb_Arab +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_aeb_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_afr.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_afr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..001eee846bd92a3e1703d64d799e5bc8c066f70e --- /dev/null +++ 
b/lm_eval/tasks/afrobench/sib/prompt_1/sib_afr.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: afr_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_afr_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_aka.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_aka.yaml new file mode 100644 index 0000000000000000000000000000000000000000..907977dc638bdfc7aba5ea11324d54667cd21d1e --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_aka.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: aka_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_aka_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_amh.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dde5420724bdb678ac877c5ff895df74ba0b08c6 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_amh.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: amh_Ethi +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_amh_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_ary.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_ary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..68347bd51439c95b88403f843fb78a06a3562d39 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_ary.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ary_Arab +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what 
category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_ary_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_arz.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_arz.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2c0328134c766bd56637a2097f1b87bfa03a4973 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_arz.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: arz_Arab +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_arz_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_bam.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_bam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5469a8a17ea44b468172c326a148f1185a559015 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_bam.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: bam_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_bam_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_bem.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_bem.yaml new file mode 100644 index 0000000000000000000000000000000000000000..01aaa1cbd82342de4ace8c11387f1851a21661d1 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_bem.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: bem_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_bem_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_cjk.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_cjk.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..6deaee753f460189a1fcf47c800239b2242ccf8d --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_cjk.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: cjk_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_cjk_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_dik.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_dik.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d80d0a080890269475d0133cb4a73cc80ffbe6eb --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_dik.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: dik_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_dik_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_dyu.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_dyu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1d72e6321e92d7e8947cce5109d363f7eb51f9de --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_dyu.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: dyu_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_dyu_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_eng.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1e32469681e4400517131926dff6e8b1a717b69d --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_eng.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn +doc_to_text: "Given the 
categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_eng_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_ewe.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..60cf7db830a1aff9215989f4af5c9a6f8d278985 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_ewe.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ewe_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_ewe_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_fon.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_fon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7ae765522ccd81ddadd2842bb7e8a346fff18088 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_fon.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: fon_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_fon_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_fra.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4614e6d27f2d41e5558045d933df41a66a909cfb --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_fra.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: fra_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_fra_prompt_1 diff --git 
a/lm_eval/tasks/afrobench/sib/prompt_1/sib_fuv.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_fuv.yaml new file mode 100644 index 0000000000000000000000000000000000000000..24f1d28a8f088d383bf7fbbff939dc73b4cf447e --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_fuv.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: fuv_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_fuv_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_gaz.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_gaz.yaml new file mode 100644 index 0000000000000000000000000000000000000000..df904f957318c08eaf8c2f5cba4d0befa5220fbc --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_gaz.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: gaz_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_gaz_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_hau.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b160b8cfc0aa662bfadcc68f2891208e7039c01b --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_hau.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: hau_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_hau_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_ibo.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e481aeacd5c7d63cbfd11e7efcb3fb1ac738e945 --- /dev/null +++ 
b/lm_eval/tasks/afrobench/sib/prompt_1/sib_ibo.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ibo_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_ibo_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_kab.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_kab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a914b01cc54c35941cd769dbe6667ee624421b91 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_kab.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kab_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_kab_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_kam.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_kam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aaa05108b0cc3313932e71a174b0f53e747e42eb --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_kam.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kam_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_kam_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_kbp.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_kbp.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d428490863c573b3a757672bc3c074d8fd548c0d --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_kbp.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kbp_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what 
category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_kbp_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_kea.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_kea.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4e458fb225b8c1b4b4ee2823f46b4b4ad7a6dcad --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_kea.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kea_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_kea_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_kik.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_kik.yaml new file mode 100644 index 0000000000000000000000000000000000000000..beb94a8edb7cec7b51c960fe319a98e798a84581 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_kik.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kik_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_kik_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_kin.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7c16432eba27800bcc8eb927e4a201aac7b3f2e0 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_kin.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kin_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_kin_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_kmb.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_kmb.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..c46477e31e4639a9b9c1dca0ce59318534e883e4 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_kmb.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kmb_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_kmb_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_knc.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_knc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9b43157e3642dc91b6f04de06ecc622b67fb036e --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_knc.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: knc_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_knc_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_kon.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_kon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..def4a77def17ff2d11cc00d6c87962a03c4081cd --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_kon.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kon_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_kon_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_lin.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bbba95e0cf7217c4385f4601a7867ffc6576b2b7 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_lin.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lin_Latn +doc_to_text: "Given the 
categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_lin_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_lua.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_lua.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d4bc665b3f9e6fce1703b2ea53c93bcc52111363 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_lua.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lua_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_lua_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_lug.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cbf42e1889e695a379d8261bac27d02fa7f4d33d --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_lug.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lug_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_lug_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_luo.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_luo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a62ea03c7ba534928d5c3c333d631216cf0dd248 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_luo.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: luo_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_luo_prompt_1 diff --git 
a/lm_eval/tasks/afrobench/sib/prompt_1/sib_mos.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_mos.yaml new file mode 100644 index 0000000000000000000000000000000000000000..54140a5d1339758f59a3504d3a4a0a5448414b90 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_mos.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: mos_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_mos_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_nso.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_nso.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7f7382d58f3071f1dddcab360ecf06b2cf7a427c --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_nso.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: nso_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_nso_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_nus.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_nus.yaml new file mode 100644 index 0000000000000000000000000000000000000000..28208912f85036d26b494ed495b43f2a57982869 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_nus.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: nus_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_nus_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_nya.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_nya.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6ca90a9233e68301406a3303ebbb85cb47da2207 --- /dev/null +++ 
b/lm_eval/tasks/afrobench/sib/prompt_1/sib_nya.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: nya_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_nya_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_plt.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_plt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..650b9a4b711f30b59c0aadde797e941d69a17ed6 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_plt.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: plt_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_plt_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_por.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_por.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7901e924a043d74dadbf8b0dabff2303273d03a5 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_por.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: por_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_por_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_run.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_run.yaml new file mode 100644 index 0000000000000000000000000000000000000000..510fc5c15841c9c130af5cce0e3e2d8499eb71d0 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_run.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: run_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what 
category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_run_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_sag.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_sag.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e7c0bb3148857edab4e8eaef00974fa5e4dfd974 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_sag.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sag_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_sag_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_sna.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f4115112c393c0dd424b14bdd66046d58e82eb83 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_sna.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sna_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_sna_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_som.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_som.yaml new file mode 100644 index 0000000000000000000000000000000000000000..be9c19f1039b8093e3c5bcd7573168b23f6e923c --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_som.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: som_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_som_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_sot.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_sot.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..78d0e1f50dc0909475131e7892bbe726f5144412 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_sot.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sot_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_sot_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_ssw.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_ssw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..988f6828cbe84bdf7cec2a03798a452956e1768d --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_ssw.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ssw_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_ssw_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_swa.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d4a92192eb750ed34c647381ae0c8655b141f4a2 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_swa.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: swh_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_swa_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_taq.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_taq.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a860f019dd0d259ea3fd9eddfb776870ea24b7f2 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_taq.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: taq_Latn +doc_to_text: "Given the 
categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_taq_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_tir.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_tir.yaml new file mode 100644 index 0000000000000000000000000000000000000000..606755c5c59c01cdd1148437cdfccb4791ebc689 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_tir.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tir_Ethi +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_tir_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_tso.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_tso.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c6b2e46369554e76573e3cdec7128b56d9853913 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_tso.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tso_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_tso_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_tum.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_tum.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9e17521fb63ca03a4b38747157cf0171dcb2cf13 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_tum.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tum_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_tum_prompt_1 diff --git 
a/lm_eval/tasks/afrobench/sib/prompt_1/sib_twi.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bf818808af1a80bfa7cfad46b5f16930b5619636 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_twi.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: twi_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_twi_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_tzm.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_tzm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..10cf4c5b6626fe9ffc3addfbc8197156e23ee45f --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_tzm.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tzm_Tfng +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_tzm_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_umb.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_umb.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d171c9c6b6fd7f2db5ac205e3adfed2e6e6fb867 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_umb.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: umb_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_umb_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_wol.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c3a6d7e6234c0ac9df872fb3cfcbc1e9f0e4f483 --- /dev/null +++ 
b/lm_eval/tasks/afrobench/sib/prompt_1/sib_wol.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: wol_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_wol_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_xho.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..57ce4d2db833add543832e1798a90b7479d8a360 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_xho.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: xho_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_xho_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_yor.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cab811762f3b61828cb698857e0d46f33855f568 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_yor.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: yor_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_yor_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/sib_zul.yaml b/lm_eval/tasks/afrobench/sib/prompt_1/sib_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..694ddfc11f55e33b04544bde8a2004939e8bb158 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/sib_zul.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: zul_Latn +doc_to_text: "Given the categories science/technology, travel, politics, sports, health,\ + \ entertainment, or geography; what 
category does the text: '{{text}}' belong to:\ + \ \n\n" +include: sib +task: sib_zul_prompt_1 diff --git a/lm_eval/tasks/afrobench/sib/prompt_1/utils.py b/lm_eval/tasks/afrobench/sib/prompt_1/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_1/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib b/lm_eval/tasks/afrobench/sib/prompt_2/sib new file mode 100644 index 0000000000000000000000000000000000000000..27dd7d1f64838b9692fbaa06ea98c6cd7f7db97e --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib @@ -0,0 +1,43 @@ +tag: + - sib_tasks + - sib_prompt_2 + - afrobench_TC_tasks +dataset_path: Davlan/sib200 +dataset_name: null +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_target: category +doc_to_choice: + - "science/technology" + - "travel" + - "politics" + - "sports" + - "health" + - "entertainment" + - "geography" +should_decontaminate: true +doc_to_decontamination_query: text +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_aeb.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_aeb.yaml new file mode 100644 index 0000000000000000000000000000000000000000..32b2443948fd04761dab4331d9421b50e6293397 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_aeb.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: aeb_Arab +doc_to_text: 'Does this Tunisian Arabic topic; 
''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? category only + + + ' +include: sib +task: sib_aeb_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_afr.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_afr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c212b13f1f2cf4cd9a2b5b70fce75429a2dbbd91 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_afr.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: afr_Latn +doc_to_text: 'Does this Afrikaans topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? category only + + + ' +include: sib +task: sib_afr_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_aka.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_aka.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dacfef07608dfebc67025bc8ff983260ee535f6b --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_aka.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: aka_Latn +doc_to_text: 'Does this Akan topic; ''{{text}}'' belong to one of the following categories: + science/technology, travel, politics, sports, health, entertainment, or geography? + category only + + + ' +include: sib +task: sib_aka_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_amh.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..259009f056b82ff8968feb9df082f7c232845124 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_amh.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: amh_Ethi +doc_to_text: 'Does this Amharic topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? 
category only + + + ' +include: sib +task: sib_amh_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_ary.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_ary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..141a6691de71e7f932970dd3a73c91aa818c45b2 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_ary.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: ary_Arab +doc_to_text: 'Does this Moroccan Arabic topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? category only + + + ' +include: sib +task: sib_ary_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_arz.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_arz.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b2fee5eed9e22ca4448a5aa1efe26e756ed41562 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_arz.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: arz_Arab +doc_to_text: 'Does this Egyptian Arabic topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? category only + + + ' +include: sib +task: sib_arz_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_bam.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_bam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0ae5ddd0ea44b3d4c9a90e40125b942cd1919d26 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_bam.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: bam_Latn +doc_to_text: 'Does this Bambara topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? 
category only + + + ' +include: sib +task: sib_bam_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_bem.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_bem.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1631a349226b60b9de250b4f97db8e474094951e --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_bem.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: bem_Latn +doc_to_text: 'Does this Bemba topic; ''{{text}}'' belong to one of the following categories: + science/technology, travel, politics, sports, health, entertainment, or geography? + category only + + + ' +include: sib +task: sib_bem_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_cjk.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_cjk.yaml new file mode 100644 index 0000000000000000000000000000000000000000..85521f131a3532d7791bc3c022572bb624fd653c --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_cjk.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: cjk_Latn +doc_to_text: 'Does this Chokwe topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? category only + + + ' +include: sib +task: sib_cjk_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_dik.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_dik.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c660516f42e0e869c8a266d113e65dcbbbf8f032 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_dik.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: dik_Latn +doc_to_text: 'Does this Southwestern Dinka topic; ''{{text}}'' belong to one of the + following categories: science/technology, travel, politics, sports, health, entertainment, + or geography? 
category only + + + ' +include: sib +task: sib_dik_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_dyu.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_dyu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..698782fda2a65ea766eef9b91381d497949005ca --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_dyu.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: dyu_Latn +doc_to_text: 'Does this Dyula topic; ''{{text}}'' belong to one of the following categories: + science/technology, travel, politics, sports, health, entertainment, or geography? + category only + + + ' +include: sib +task: sib_dyu_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_eng.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..564d86565f8aa47d9944d3a5aedc9555ca29c9a8 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_eng.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: eng_Latn +doc_to_text: 'Does this English topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? category only + + + ' +include: sib +task: sib_eng_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_ewe.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ba064082941553b1177d6c4ea4901e6aa7ba61be --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_ewe.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: ewe_Latn +doc_to_text: 'Does this Ewe topic; ''{{text}}'' belong to one of the following categories: + science/technology, travel, politics, sports, health, entertainment, or geography? 
+ category only + + + ' +include: sib +task: sib_ewe_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_fon.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_fon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9bb542dd84452cdd500d01a6e561c408e7a7fcf1 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_fon.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: fon_Latn +doc_to_text: 'Does this Fon topic; ''{{text}}'' belong to one of the following categories: + science/technology, travel, politics, sports, health, entertainment, or geography? + category only + + + ' +include: sib +task: sib_fon_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_fra.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cf279d611378a2a1981415940a692389728fd339 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_fra.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: fra_Latn +doc_to_text: 'Does this French topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? category only + + + ' +include: sib +task: sib_fra_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_fuv.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_fuv.yaml new file mode 100644 index 0000000000000000000000000000000000000000..50bb4b824748d070ad7d004efd15d2bab5cd8c0f --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_fuv.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: fuv_Latn +doc_to_text: 'Does this Nigerian Fulfulde topic; ''{{text}}'' belong to one of the + following categories: science/technology, travel, politics, sports, health, entertainment, + or geography? 
category only + + + ' +include: sib +task: sib_fuv_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_gaz.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_gaz.yaml new file mode 100644 index 0000000000000000000000000000000000000000..601d5f79f2605a3c0db8278500ecce1f5987222a --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_gaz.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: gaz_Latn +doc_to_text: 'Does this West Central Oromo topic; ''{{text}}'' belong to one of the + following categories: science/technology, travel, politics, sports, health, entertainment, + or geography? category only + + + ' +include: sib +task: sib_gaz_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_hau.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2c7255d4747d4b5a033129bca1be441114bef36f --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_hau.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: hau_Latn +doc_to_text: 'Does this Hausa topic; ''{{text}}'' belong to one of the following categories: + science/technology, travel, politics, sports, health, entertainment, or geography? + category only + + + ' +include: sib +task: sib_hau_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_ibo.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..714c132f655a0b57e9c160a513a6c73350c5919c --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_ibo.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: ibo_Latn +doc_to_text: 'Does this Igbo topic; ''{{text}}'' belong to one of the following categories: + science/technology, travel, politics, sports, health, entertainment, or geography? 
+ category only + + + ' +include: sib +task: sib_ibo_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_kab.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_kab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..22303a3fbb1db518170ee57c258cff95c9f2c134 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_kab.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: kab_Latn +doc_to_text: 'Does this Kabyle topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? category only + + + ' +include: sib +task: sib_kab_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_kam.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_kam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..498781d6e836f9854a7703f669e80b3a16003637 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_kam.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: kam_Latn +doc_to_text: 'Does this Kamba topic; ''{{text}}'' belong to one of the following categories: + science/technology, travel, politics, sports, health, entertainment, or geography? + category only + + + ' +include: sib +task: sib_kam_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_kbp.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_kbp.yaml new file mode 100644 index 0000000000000000000000000000000000000000..679d7ccd7a74430df674154fa03af065fc4e23a8 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_kbp.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: kbp_Latn +doc_to_text: 'Does this Kabiye topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? 
category only + + + ' +include: sib +task: sib_kbp_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_kea.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_kea.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aee33cf27faf2fed8b6873b601a5a437dae11bb7 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_kea.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: kea_Latn +doc_to_text: 'Does this Kabuverdianu topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? category only + + + ' +include: sib +task: sib_kea_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_kik.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_kik.yaml new file mode 100644 index 0000000000000000000000000000000000000000..77c87bc131b912e0564156acf740cb5aa3007615 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_kik.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: kik_Latn +doc_to_text: 'Does this Kikuyu topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? category only + + + ' +include: sib +task: sib_kik_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_kin.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5be0643e11f39513363994cc6bbc02ac1604f24c --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_kin.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: kin_Latn +doc_to_text: 'Does this Kinyarwanda topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? 
category only + + + ' +include: sib +task: sib_kin_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_kmb.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_kmb.yaml new file mode 100644 index 0000000000000000000000000000000000000000..02f4e9d22410d932c345df2eeb1b4de1c3e71c4b --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_kmb.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: kmb_Latn +doc_to_text: 'Does this Kimbundu topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? category only + + + ' +include: sib +task: sib_kmb_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_knc.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_knc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2623c480235bbf269b082a6604af129bb82e7df4 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_knc.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: knc_Latn +doc_to_text: 'Does this Central Kanuri topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? category only + + + ' +include: sib +task: sib_knc_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_kon.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_kon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7ec3bcf97652bde14ee764bf961ea49aca088df4 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_kon.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: kon_Latn +doc_to_text: 'Does this Kikongo topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? 
category only + + + ' +include: sib +task: sib_kon_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_lin.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ec2fa57a8bbc4309bbb44a865568a2cf70b842e4 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_lin.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: lin_Latn +doc_to_text: 'Does this Lingala topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? category only + + + ' +include: sib +task: sib_lin_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_lua.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_lua.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4f3acc3dbeb2b708257c9b5f1fcc7cac4a703d54 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_lua.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: lua_Latn +doc_to_text: 'Does this Luba-Kasai topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? category only + + + ' +include: sib +task: sib_lua_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_lug.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1d6e7b9f0c315b7868580704abc3cdba0775cc65 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_lug.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: lug_Latn +doc_to_text: 'Does this Luganda topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? 
category only + + + ' +include: sib +task: sib_lug_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_luo.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_luo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9d1a438594b1818e3fe34c9ce63e47e0c802e700 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_luo.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: luo_Latn +doc_to_text: 'Does this Luo topic; ''{{text}}'' belong to one of the following categories: + science/technology, travel, politics, sports, health, entertainment, or geography? + category only + + + ' +include: sib +task: sib_luo_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_mos.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_mos.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cc025905e76696e820804e4989e9b1bec2fa2257 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_mos.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: mos_Latn +doc_to_text: 'Does this Mossi topic; ''{{text}}'' belong to one of the following categories: + science/technology, travel, politics, sports, health, entertainment, or geography? + category only + + + ' +include: sib +task: sib_mos_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_nso.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_nso.yaml new file mode 100644 index 0000000000000000000000000000000000000000..75021cc514b64598cd1e94902ecdd653db50681e --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_nso.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: nso_Latn +doc_to_text: 'Does this Northern Sotho topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? 
category only + + + ' +include: sib +task: sib_nso_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_nus.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_nus.yaml new file mode 100644 index 0000000000000000000000000000000000000000..abca40e85705feeaae8fb118ae9d162c611e0545 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_nus.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: nus_Latn +doc_to_text: 'Does this Nuer topic; ''{{text}}'' belong to one of the following categories: + science/technology, travel, politics, sports, health, entertainment, or geography? + category only + + + ' +include: sib +task: sib_nus_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_nya.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_nya.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e09e27331ad6104a7c585f63638b4e24e9ba8880 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_nya.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: nya_Latn +doc_to_text: 'Does this Nyanga topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? category only + + + ' +include: sib +task: sib_nya_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_plt.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_plt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5b385cade643f329904a7a0dab0797e57433581 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_plt.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: plt_Latn +doc_to_text: 'Does this Plateau Malagasy topic; ''{{text}}'' belong to one of the + following categories: science/technology, travel, politics, sports, health, entertainment, + or geography? 
category only + + + ' +include: sib +task: sib_plt_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_por.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_por.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a297c05a4be4992f33fb07737bd704ab076c9cfe --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_por.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: por_Latn +doc_to_text: 'Does this Portuguese topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? category only + + + ' +include: sib +task: sib_por_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_run.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_run.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a4bb32245653846c6eb82fef3716b31ba85adf4d --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_run.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: run_Latn +doc_to_text: 'Does this Rundi topic; ''{{text}}'' belong to one of the following categories: + science/technology, travel, politics, sports, health, entertainment, or geography? + category only + + + ' +include: sib +task: sib_run_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_sag.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_sag.yaml new file mode 100644 index 0000000000000000000000000000000000000000..979b4d84e0dae472a83fdb64e7b62c35453763e9 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_sag.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: sag_Latn +doc_to_text: 'Does this Sango topic; ''{{text}}'' belong to one of the following categories: + science/technology, travel, politics, sports, health, entertainment, or geography? 
+ category only + + + ' +include: sib +task: sib_sag_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_sna.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b41184b3c702c2346484fb6a982a1f9a10fe6516 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_sna.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: sna_Latn +doc_to_text: 'Does this Shona topic; ''{{text}}'' belong to one of the following categories: + science/technology, travel, politics, sports, health, entertainment, or geography? + category only + + + ' +include: sib +task: sib_sna_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_som.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_som.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cda1fb4133df8f42bf69a0e296b866cd128ef368 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_som.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: som_Latn +doc_to_text: 'Does this Somali topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? category only + + + ' +include: sib +task: sib_som_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_sot.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..08d0dbecbec823c712108c42d64f9a7cbed73463 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_sot.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: sot_Latn +doc_to_text: 'Does this Southern Sotho topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? 
category only + + + ' +include: sib +task: sib_sot_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_ssw.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_ssw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0d3b99e7a07affa07aeb7ad4452887808fbd47de --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_ssw.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: ssw_Latn +doc_to_text: 'Does this Swazi topic; ''{{text}}'' belong to one of the following categories: + science/technology, travel, politics, sports, health, entertainment, or geography? + category only + + + ' +include: sib +task: sib_ssw_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_swa.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8e9faa831698a196843ae2f6b5f8cf4939bdada0 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_swa.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: swh_Latn +doc_to_text: 'Does this Swahili topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? category only + + + ' +include: sib +task: sib_swa_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_taq.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_taq.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f1862c468c6c874e56fca81c9bbc0df09c91425e --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_taq.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: taq_Latn +doc_to_text: 'Does this Tamasheq topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? 
category only + + + ' +include: sib +task: sib_taq_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_tir.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_tir.yaml new file mode 100644 index 0000000000000000000000000000000000000000..80dcc1bb3d8d6a65ea6dcdf75af3c946a803b071 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_tir.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: tir_Ethi +doc_to_text: 'Does this Tigrinya topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? category only + + + ' +include: sib +task: sib_tir_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_tso.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_tso.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fad909b4b7a714ba44df14041808d54e2dc7edc2 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_tso.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: tso_Latn +doc_to_text: 'Does this Tsonga topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? category only + + + ' +include: sib +task: sib_tso_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_tum.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_tum.yaml new file mode 100644 index 0000000000000000000000000000000000000000..613535bc95647f5edf818e844423eebad5291937 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_tum.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: tum_Latn +doc_to_text: 'Does this Tumbuka topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? 
category only + + + ' +include: sib +task: sib_tum_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_twi.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..064edb4cb8e0e1a88dbc1ccfad20adefa13034e9 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_twi.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: twi_Latn +doc_to_text: 'Does this Twi topic; ''{{text}}'' belong to one of the following categories: + science/technology, travel, politics, sports, health, entertainment, or geography? + category only + + + ' +include: sib +task: sib_twi_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_tzm.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_tzm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7ec8adc260622a611261d30aebd49450d202b700 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_tzm.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: tzm_Tfng +doc_to_text: 'Does this Tamazight topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? category only + + + ' +include: sib +task: sib_tzm_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_umb.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_umb.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5a910abc5fa30c9462c950f059059f5f91554b3a --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_umb.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: umb_Latn +doc_to_text: 'Does this Umbundu topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? 
category only + + + ' +include: sib +task: sib_umb_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_wol.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4453b3458ecde8bbc26cff73793db71471301850 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_wol.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: wol_Latn +doc_to_text: 'Does this Wolof topic; ''{{text}}'' belong to one of the following categories: + science/technology, travel, politics, sports, health, entertainment, or geography? + category only + + + ' +include: sib +task: sib_wol_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_xho.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e038cc9458fbb376331296bd4dd1c96a5f26a8f1 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_xho.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: xho_Latn +doc_to_text: 'Does this Xhosa topic; ''{{text}}'' belong to one of the following categories: + science/technology, travel, politics, sports, health, entertainment, or geography? + category only + + + ' +include: sib +task: sib_xho_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_yor.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e831b3117b828bb4ea68016ca19fdcd9c89525b3 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_yor.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: yor_Latn +doc_to_text: 'Does this Yoruba topic; ''{{text}}'' belong to one of the following + categories: science/technology, travel, politics, sports, health, entertainment, + or geography? 
category only + + + ' +include: sib +task: sib_yor_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/sib_zul.yaml b/lm_eval/tasks/afrobench/sib/prompt_2/sib_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f61a4061f2b636167400491e59db890289aff3d3 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/sib_zul.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: zul_Latn +doc_to_text: 'Does this Zulu topic; ''{{text}}'' belong to one of the following categories: + science/technology, travel, politics, sports, health, entertainment, or geography? + category only + + + ' +include: sib +task: sib_zul_prompt_2 diff --git a/lm_eval/tasks/afrobench/sib/prompt_2/utils.py b/lm_eval/tasks/afrobench/sib/prompt_2/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_2/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib b/lm_eval/tasks/afrobench/sib/prompt_3/sib new file mode 100644 index 0000000000000000000000000000000000000000..fed4e5c5019f791c72cfbe214efb2698943c5b92 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib @@ -0,0 +1,43 @@ +tag: + - sib_tasks + - sib_prompt_3 + - afrobench_TC_tasks +dataset_path: Davlan/sib200 +dataset_name: null +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_target: category +doc_to_choice: + - "science/technology" + - "travel" + - "politics" + - "sports" + - "health" + - "entertainment" + - "geography" +should_decontaminate: true +doc_to_decontamination_query: text +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + 
aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_aeb.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_aeb.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7b82cc4ec3cc8cff2dff2818f9238477ea12528a --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_aeb.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: aeb_Arab +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Tunisian Arabic statement below? Return\ + \ only the category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_aeb_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_afr.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_afr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f818759646be15a6c6d1c0193a7b24deb730bb03 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_afr.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: afr_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Afrikaans statement below? Return only\ + \ the category. 
\n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_afr_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_aka.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_aka.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6d4ff4e42cf1d7c926015812ea5c716928b697fc --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_aka.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: aka_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Akan statement below? Return only the category.\ + \ \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_aka_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_amh.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..58207e9e39010c4f30d3ff0a1f46fd9e53f3b042 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_amh.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: amh_Ethi +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Amharic statement below? Return only the\ + \ category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_amh_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_ary.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_ary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ccb9a06880d6aa946ffd750429da3fb650c46eea --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_ary.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: ary_Arab +doc_to_text: "You are an assistant able to classify topics in texts. 
\n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Moroccan Arabic statement below? Return\ + \ only the category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_ary_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_arz.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_arz.yaml new file mode 100644 index 0000000000000000000000000000000000000000..19ebbed7b9a441cca520eff58a663354d29a7395 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_arz.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: arz_Arab +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Egyptian Arabic statement below? Return\ + \ only the category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_arz_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_bam.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_bam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d2c1a18d9b3a4b076ca70b32981b2ecb58e3f9c0 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_bam.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: bam_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Bambara statement below? Return only the\ + \ category. 
\n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_bam_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_bem.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_bem.yaml new file mode 100644 index 0000000000000000000000000000000000000000..99750497c258c93b1393ced6f34b9a724fbe518d --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_bem.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: bem_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Bemba statement below? Return only the\ + \ category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_bem_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_cjk.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_cjk.yaml new file mode 100644 index 0000000000000000000000000000000000000000..470612b51b1f7d4821573764692a04f2a623a42f --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_cjk.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: cjk_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Chokwe statement below? Return only the\ + \ category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_cjk_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_dik.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_dik.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c5269b0262805239b807d789a7343dc0d1507a29 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_dik.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: dik_Latn +doc_to_text: "You are an assistant able to classify topics in texts. 
\n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Southwestern Dinka statement below? Return\ + \ only the category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_dik_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_dyu.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_dyu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f04a1c17199e50ecebfac887459b3c3f124a1529 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_dyu.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: dyu_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Dyula statement below? Return only the\ + \ category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_dyu_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_eng.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bf22d08fcab877aae2ce77081274d1928e27f8c4 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_eng.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: eng_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the English statement below? Return only the\ + \ category. 
\n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_eng_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_ewe.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4cc991048ad19285b1dd269e91a6bb32898b6d88 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_ewe.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: ewe_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Ewe statement below? Return only the category.\ + \ \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_ewe_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_fon.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_fon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3127fde242956bcaf89224ed3a88be80dc967c52 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_fon.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: fon_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Fon statement below? Return only the category.\ + \ \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_fon_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_fra.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9a24ff30e4f6408f02a0f4a8978250d91e36621f --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_fra.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: fra_Latn +doc_to_text: "You are an assistant able to classify topics in texts. 
\n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the French statement below? Return only the\ + \ category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_fra_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_fuv.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_fuv.yaml new file mode 100644 index 0000000000000000000000000000000000000000..405838c78ddbe6a99d66f436699de00f0b7e814b --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_fuv.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: fuv_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Nigerian Fulfulde statement below? Return\ + \ only the category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_fuv_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_gaz.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_gaz.yaml new file mode 100644 index 0000000000000000000000000000000000000000..282b439a3c2d6703c04446b4cf477a9bd60bf340 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_gaz.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: gaz_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the West Central Oromo statement below? Return\ + \ only the category. 
\n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_gaz_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_hau.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..239181bf1f1586ac83aa1741e9cffe531c2433b1 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_hau.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: hau_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Hausa statement below? Return only the\ + \ category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_hau_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_ibo.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0581291dd7ed5cf53e52fe2f44154363b8be9599 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_ibo.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: ibo_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Igbo statement below? Return only the category.\ + \ \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_ibo_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_kab.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_kab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..32fbbf4407d07a5d78558b022260117f592645d7 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_kab.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: kab_Latn +doc_to_text: "You are an assistant able to classify topics in texts. 
\n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Kabyle statement below? Return only the\ + \ category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_kab_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_kam.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_kam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3f745ba54f9daaca1a3302443c4a5aba3de795f4 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_kam.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: kam_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Kamba statement below? Return only the\ + \ category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_kam_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_kbp.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_kbp.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c5be1035bb58862fe73c2678287620d409a76b87 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_kbp.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: kbp_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Kabiye statement below? Return only the\ + \ category. 
\n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_kbp_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_kea.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_kea.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a1d3e2a68cf8b465f589701f1004ae4b5dc07dd9 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_kea.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: kea_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Kabuverdianu statement below? Return only\ + \ the category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_kea_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_kik.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_kik.yaml new file mode 100644 index 0000000000000000000000000000000000000000..521a0f89226460e6f1a9e25c0d24066cd929c662 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_kik.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: kik_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Kikuyu statement below? Return only the\ + \ category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_kik_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_kin.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..066bfb630c59e334f65dcd74ea536a6790b3337d --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_kin.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: kin_Latn +doc_to_text: "You are an assistant able to classify topics in texts. 
\n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Kinyarwanda statement below? Return only\ + \ the category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_kin_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_kmb.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_kmb.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c548af893d77f13231cab13318e396cbcf423388 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_kmb.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: kmb_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Kimbundu statement below? Return only the\ + \ category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_kmb_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_knc.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_knc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9136823770a1e6754070143b2b0e40a988da22f4 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_knc.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: knc_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Central Kanuri statement below? Return\ + \ only the category. 
\n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_knc_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_kon.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_kon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d8777511ef33fad4019d3e157d1dbc4f6d0aad96 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_kon.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: kon_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Kikongo statement below? Return only the\ + \ category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_kon_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_lin.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8688cb875fa5554625073325810a9dbb1198f06b --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_lin.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: lin_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Lingala statement below? Return only the\ + \ category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_lin_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_lua.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_lua.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e71ac2aae77f40f06796c1572b2d38b44ec53962 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_lua.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: lua_Latn +doc_to_text: "You are an assistant able to classify topics in texts. 
\n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Luba-Kasai statement below? Return only\ + \ the category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_lua_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_lug.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c3554267ebad03a604a4a3dcca369af535efb156 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_lug.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: lug_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Luganda statement below? Return only the\ + \ category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_lug_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_luo.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_luo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..161814d36bde3e61fe3cdf38e98d2ef62f6b9248 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_luo.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: luo_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Luo statement below? 
Return only the category.\ + \ \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_luo_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_mos.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_mos.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7b80d5008087bf66a82b8b7855fc5b8c857497fe --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_mos.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: mos_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Mossi statement below? Return only the\ + \ category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_mos_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_nso.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_nso.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5c9dd8bd3f8cd30ce172286ca005c16b0ead9214 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_nso.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: nso_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Northern Sotho statement below? Return\ + \ only the category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_nso_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_nus.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_nus.yaml new file mode 100644 index 0000000000000000000000000000000000000000..361698af10f6f231fbbdabf9e92a287504c45057 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_nus.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: nus_Latn +doc_to_text: "You are an assistant able to classify topics in texts. 
\n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Nuer statement below? Return only the category.\ + \ \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_nus_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_nya.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_nya.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0c455c788ad7fc13a440885b6c7fc594ed4fc6e4 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_nya.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: nya_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Nyanga statement below? Return only the\ + \ category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_nya_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_plt.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_plt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bb90a034be0e94aba92823ba3b5762fc13eabe6f --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_plt.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: plt_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Plateau Malagasy statement below? Return\ + \ only the category. 
\n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_plt_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_por.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_por.yaml new file mode 100644 index 0000000000000000000000000000000000000000..65b8c2da4ab91e1723ceabd2e9fb08d3b6de2cfe --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_por.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: por_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Portuguese statement below? Return only\ + \ the category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_por_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_run.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_run.yaml new file mode 100644 index 0000000000000000000000000000000000000000..19f3681cf856c7bc28bb1fcb5e8c31eda1f1b618 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_run.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: run_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Rundi statement below? Return only the\ + \ category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_run_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_sag.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_sag.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8dfdcbd41929bc5747c34f18e93633fec8ac04e6 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_sag.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: sag_Latn +doc_to_text: "You are an assistant able to classify topics in texts. 
\n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Sango statement below? Return only the\ + \ category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_sag_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_sna.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f30ff0d2b995c831b4005318a18a723998c92aa8 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_sna.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: sna_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Shona statement below? Return only the\ + \ category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_sna_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_som.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_som.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0ea27fd2e1b298324e1e1abcff152b25cd9cfc3d --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_som.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: som_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Somali statement below? Return only the\ + \ category. 
\n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_som_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_sot.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d4ad477db4c912a09e317c7edc7818fe96b355f7 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_sot.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: sot_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Southern Sotho statement below? Return\ + \ only the category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_sot_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_ssw.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_ssw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..25b7f85e1c955207a6afe5d154ed4286602a5313 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_ssw.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: ssw_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Swazi statement below? Return only the\ + \ category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_ssw_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_swa.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7be0be9d211d0fce16493b3b62d593a7ad60b864 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_swa.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: swh_Latn +doc_to_text: "You are an assistant able to classify topics in texts. 
\n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Swahili statement below? Return only the\ + \ category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_swa_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_taq.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_taq.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a7e7b3abbbbca622c1b56169a7abc6c917d9b241 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_taq.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: taq_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Tamasheq statement below? Return only the\ + \ category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_taq_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_tir.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_tir.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aceb352596ba9fed4c4a3a544beb616923dca213 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_tir.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: tir_Ethi +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Tigrinya statement below? Return only the\ + \ category. 
\n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_tir_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_tso.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_tso.yaml new file mode 100644 index 0000000000000000000000000000000000000000..025b7163c069a6291dc4691a34de44732cc4c8b7 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_tso.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: tso_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Tsonga statement below? Return only the\ + \ category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_tso_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_tum.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_tum.yaml new file mode 100644 index 0000000000000000000000000000000000000000..35092ea79435767ad3e4907e152273d2cd6f1dca --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_tum.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: tum_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Tumbuka statement below? Return only the\ + \ category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_tum_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_twi.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fc75f6579cdb57391a21bbce3a49fb062d7263f9 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_twi.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: twi_Latn +doc_to_text: "You are an assistant able to classify topics in texts. 
\n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Twi statement below? Return only the category.\ + \ \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_twi_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_tzm.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_tzm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b9b3044cdd08f81c95edfaeb8ded07d4a1da919d --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_tzm.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: tzm_Tfng +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Tamazight statement below? Return only\ + \ the category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_tzm_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_umb.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_umb.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d8bb8540180f44e612ea82e5276749d104362492 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_umb.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: umb_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Umbundu statement below? Return only the\ + \ category. 
\n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_umb_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_wol.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..115796d5276b8739efb6fccfb9064b5f4bb6a27e --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_wol.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: wol_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Wolof statement below? Return only the\ + \ category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_wol_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_xho.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b61c84b700da4e798848d52ed2310c9cb5ee3467 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_xho.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: xho_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Xhosa statement below? Return only the\ + \ category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_xho_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_yor.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c5ccd0c738eb5d1d365e5d7342ebe4eaaf7686b8 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_yor.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: yor_Latn +doc_to_text: "You are an assistant able to classify topics in texts. 
\n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Yoruba statement below? Return only the\ + \ category. \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_yor_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/sib_zul.yaml b/lm_eval/tasks/afrobench/sib/prompt_3/sib_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4915989dbbb0849b3345a80a420daa18a37eb97b --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/sib_zul.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: zul_Latn +doc_to_text: "You are an assistant able to classify topics in texts. \n\nGiven the\ + \ categories science/technology, travel, politics, sports, health, entertainment,\ + \ or geography; what is the topic of the Zulu statement below? Return only the category.\ + \ \n\ntext: {{text}} \\category:\n\n" +include: sib +task: sib_zul_prompt_3 diff --git a/lm_eval/tasks/afrobench/sib/prompt_3/utils.py b/lm_eval/tasks/afrobench/sib/prompt_3/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_3/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib b/lm_eval/tasks/afrobench/sib/prompt_4/sib new file mode 100644 index 0000000000000000000000000000000000000000..28ed8f4a0da4e25815ebcfa6e58092a382e1708e --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib @@ -0,0 +1,43 @@ +tag: + - sib_tasks + - sib_prompt_4 + - afrobench_TC_tasks +dataset_path: Davlan/sib200 +dataset_name: null +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_target: category +doc_to_choice: + - "science/technology" + - "travel" + - "politics" + - "sports" + - "health" + - "entertainment" + - "geography" +should_decontaminate: true 
+doc_to_decontamination_query: text +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_aeb.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_aeb.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e8c737f278122c8893e028ea2334ff93646a73cf --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_aeb.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: aeb_Arab +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_aeb_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_afr.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_afr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7139d04e9a5b4a4865ba11941d3078802cc9a85c --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_afr.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: afr_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. 
Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_afr_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_aka.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_aka.yaml new file mode 100644 index 0000000000000000000000000000000000000000..59c8c56a6b78ffbb0da5e3b3abeb24ccd13b35d1 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_aka.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: aka_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_aka_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_amh.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cec6b6c43425195e36be88cbdc266d8806844a24 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_amh.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: amh_Ethi +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_amh_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_ary.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_ary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2c10743470b814bf689bfef10410af3b4e03bb84 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_ary.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ary_Arab +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. 
Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_ary_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_arz.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_arz.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1740975a66196d9c4c3bd6780ad50281766cb0b2 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_arz.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: arz_Arab +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_arz_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_bam.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_bam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..33ee240e6d95a0e43426b514f5e33f696526faeb --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_bam.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: bam_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_bam_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_bem.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_bem.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aa5608e849606f6d63f371b1bd7362d355b7d42b --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_bem.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: bem_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. 
Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_bem_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_cjk.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_cjk.yaml new file mode 100644 index 0000000000000000000000000000000000000000..52e08d7b8c5dc10d6c35a5b4fa4deee9b494f2d5 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_cjk.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: cjk_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_cjk_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_dik.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_dik.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c8db6013f1d2a63d8242ac59d55c3f006f01e660 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_dik.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: dik_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_dik_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_dyu.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_dyu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e9bbc0b547f3b25150eb000d4e56bb6e24e86991 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_dyu.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: dyu_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. 
Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_dyu_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_eng.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1c84749120e002dee47446d05600d81ed14bc193 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_eng.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: eng_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_eng_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_ewe.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..02e7ea822fee11a3d0b3869ef3ea493048a114da --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_ewe.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ewe_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_ewe_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_fon.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_fon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..67053ed8cd739682270062acea206792a7df5679 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_fon.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: fon_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. 
Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_fon_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_fra.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c2b858ce4e502334f8440c8551b1bcd10feb3b15 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_fra.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: fra_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_fra_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_fuv.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_fuv.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0c73f82679a48664a468f8e36432a0e33399190c --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_fuv.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: fuv_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_fuv_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_gaz.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_gaz.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ba95ef5d8ee884a65befdab1a83853686d8b8ef5 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_gaz.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: gaz_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. 
Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_gaz_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_hau.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1d53794868c164768810226db74aab7f06ccb383 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_hau.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: hau_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_hau_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_ibo.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2683d98dba0ca644b5314ef96a1359571a83fe9d --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_ibo.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ibo_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_ibo_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_kab.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_kab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9f645a4598e2de1ccc45de14274a882b26deceb7 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_kab.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kab_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. 
Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_kab_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_kam.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_kam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7f035b89505f2f6ef889addc2af1c972efc8ff2d --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_kam.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kam_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_kam_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_kbp.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_kbp.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c65b6352e1dd12d2a7d511825f9c043ad213aebc --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_kbp.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kbp_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_kbp_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_kea.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_kea.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0e7bba4ae7a6c359568f6252edaebd0bee96c860 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_kea.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kea_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. 
Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_kea_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_kik.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_kik.yaml new file mode 100644 index 0000000000000000000000000000000000000000..06480d183bb1d2765f8d23e8dda80ee6c37c029e --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_kik.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kik_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_kik_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_kin.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7b447219fb3bafaf2a81e3ac727e5216f408893f --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_kin.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kin_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_kin_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_kmb.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_kmb.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5fc51890964f59d20f53b06cf3ddbdb02b444471 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_kmb.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kmb_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. 
Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_kmb_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_knc.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_knc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..326443318488c921765094a96c84adb8e208eda8 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_knc.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: knc_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_knc_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_kon.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_kon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6da4ab390d0e433391e313dea2c82d302d090dd2 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_kon.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: kon_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_kon_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_lin.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..51076dbd56131d82389d83bf2b12a224ef6c5443 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_lin.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lin_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. 
Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_lin_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_lua.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_lua.yaml new file mode 100644 index 0000000000000000000000000000000000000000..95973f7d5ce8309ee595ce4e751d564851796923 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_lua.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lua_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_lua_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_lug.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a570b58488496b4a73ca0fe46a2210c95b470bb2 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_lug.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: lug_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_lug_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_luo.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_luo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..76d799856b98c4f78804815fd5bd86dd415a100d --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_luo.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: luo_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. 
Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_luo_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_mos.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_mos.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aeb058ac584c862898e7acf7441aa76b8c123709 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_mos.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: mos_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_mos_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_nso.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_nso.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f76e016a6bcdda14daf24179da982f696732a199 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_nso.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: nso_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_nso_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_nus.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_nus.yaml new file mode 100644 index 0000000000000000000000000000000000000000..255c1861589e185c2bcdb3f1d9f679ed26837be0 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_nus.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: nus_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. 
Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_nus_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_nya.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_nya.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bc7a48abf7f86cdea8f07d54a3a166ffe9550f06 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_nya.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: nya_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_nya_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_plt.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_plt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..880c3d03ef5be9db726024243158f283c2013861 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_plt.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: plt_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_plt_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_por.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_por.yaml new file mode 100644 index 0000000000000000000000000000000000000000..16a258365d56b100c44fd269da27413fd3bffa83 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_por.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: por_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. 
Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_por_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_run.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_run.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a97737edf4b9ab31a53749d109359c8acb3d3f4f --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_run.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: run_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_run_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_sag.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_sag.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8c6897795ec414f37037eb2b79e6ffb6e3124ed7 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_sag.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sag_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_sag_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_sna.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..da13a6ecf2b11650068be21b5b28ff470aba9002 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_sna.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sna_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. 
Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_sna_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_som.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_som.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b6c35f3cb4a25a3ddecda9d4b1dc8528fce64d1f --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_som.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: som_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_som_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_sot.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1385e058deaf19b0cdf272a768f260997d7cae92 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_sot.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sot_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_sot_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_ssw.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_ssw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d678c12422ff05f87849f934daf402454fa3415e --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_ssw.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ssw_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. 
Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_ssw_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_swa.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7492cfa329f48c959f6255ffbb879d952fcbe200 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_swa.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: swh_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_swa_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_taq.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_taq.yaml new file mode 100644 index 0000000000000000000000000000000000000000..783be833f8c77c16a82948f4055162941723849f --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_taq.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: taq_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_taq_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_tir.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_tir.yaml new file mode 100644 index 0000000000000000000000000000000000000000..931ede568a3faa337637be06bccfd9ca136d8bc2 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_tir.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tir_Ethi +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. 
Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_tir_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_tso.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_tso.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fc4c0f1a3278259574fc84fd09be60422174b871 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_tso.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tso_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_tso_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_tum.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_tum.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c099dc6fd73bdad8c8e9e4d306cee8d9dec243fa --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_tum.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tum_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_tum_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_twi.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..00501281a217c4d68834e8a1cd6dec9463e87268 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_twi.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: twi_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. 
Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_twi_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_tzm.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_tzm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3915fa18f2c7963e0b1a0f4f10ca1da87f765141 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_tzm.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: tzm_Tfng +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_tzm_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_umb.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_umb.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a7f1cc79736bfba50fb8ab03c8749a77523feec4 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_umb.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: umb_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_umb_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_wol.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fc2440248af154acd2aecdfc6d341230d4bfa67a --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_wol.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: wol_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. 
Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_wol_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_xho.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e075b84c190d949ad8e177b06445e0136f0445d0 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_xho.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: xho_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_xho_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_yor.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..41ef062098ebc4b54b7aec5da59851d490924e6c --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_yor.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: yor_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_yor_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/sib_zul.yaml b/lm_eval/tasks/afrobench/sib/prompt_4/sib_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7fc2f85efc9f9799446133ac107b6b0d66cfb38b --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/sib_zul.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: zul_Latn +doc_to_text: "Label the following text as science/technology, travel, politics, sports,\ + \ health, entertainment, or geography. 
Provide only the category as your response.\ + \ \n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_zul_prompt_4 diff --git a/lm_eval/tasks/afrobench/sib/prompt_4/utils.py b/lm_eval/tasks/afrobench/sib/prompt_4/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_4/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib b/lm_eval/tasks/afrobench/sib/prompt_5/sib new file mode 100644 index 0000000000000000000000000000000000000000..812df7f614a9c8146b6da3137f4c2e97049b07f2 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib @@ -0,0 +1,43 @@ +tag: + - sib_tasks + - sib_prompt_5 + - afrobench_TC_tasks +dataset_path: Davlan/sib200 +dataset_name: null +output_type: multiple_choice +validation_split: validation +test_split: test +fewshot_split: validation +doc_to_target: category +doc_to_choice: + - "science/technology" + - "travel" + - "politics" + - "sports" + - "health" + - "entertainment" + - "geography" +should_decontaminate: true +doc_to_decontamination_query: text +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_aeb.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_aeb.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3c88c0a28bdd40981ba847762e3cc08b36e66690 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_aeb.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: aeb_Arab +doc_to_text: "You are tasked 
with performing topic classification on the following\ + \ Tunisian Arabic text. For each input, classify the topic as science/technology,\ + \ travel, politics, sports, health, entertainment, or geography. Use the following\ + \ guidelines: \n\n science/technology: The text discusses scientific discoveries,\ + \ technological advancements, or related topics. \ntravel: The text describes travel\ + \ experiences, destinations, or related topics. \npolitics: The text covers political\ + \ events, policies, or related topics. \nsports: The text talks about sports events,\ + \ athletes, or related topics. \nhealth: The text addresses health issues, medical\ + \ advancements, or related topics. \nentertainment: The text pertains to movies,\ + \ music, celebrities, or related topics. \ngeography: The text involves geographical\ + \ information, locations, or related topics. \n\nIf the text contains multiple topics,\ + \ choose the dominant topic. For ambiguous or unclear topics, select the category\ + \ that best reflects the overall content. Please provide a single classification\ + \ for each input.\n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_aeb_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_afr.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_afr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0d585478be65a678629dd3718c9e44f25a42b5e6 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_afr.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: afr_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Afrikaans text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. 
\ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_afr_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_aka.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_aka.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4644bfa3c9c923d544f86f13b273c5f754b236f7 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_aka.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: aka_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Akan text. For each input, classify the topic as science/technology, travel, politics,\ + \ sports, health, entertainment, or geography. Use the following guidelines: \n\n\ + \ science/technology: The text discusses scientific discoveries, technological advancements,\ + \ or related topics. \ntravel: The text describes travel experiences, destinations,\ + \ or related topics. \npolitics: The text covers political events, policies, or\ + \ related topics. \nsports: The text talks about sports events, athletes, or related\ + \ topics. \nhealth: The text addresses health issues, medical advancements, or related\ + \ topics. \nentertainment: The text pertains to movies, music, celebrities, or related\ + \ topics. 
\ngeography: The text involves geographical information, locations, or\ + \ related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_aka_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_amh.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_amh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f2b5e6f9223f1b20d1d02b5f27635a0684388744 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_amh.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: amh_Ethi +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Amharic text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_amh_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_ary.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_ary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..348c849d219b06501167c51182458f1946f51439 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_ary.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: ary_Arab +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Moroccan Arabic text. For each input, classify the topic as science/technology,\ + \ travel, politics, sports, health, entertainment, or geography. Use the following\ + \ guidelines: \n\n science/technology: The text discusses scientific discoveries,\ + \ technological advancements, or related topics. \ntravel: The text describes travel\ + \ experiences, destinations, or related topics. \npolitics: The text covers political\ + \ events, policies, or related topics. \nsports: The text talks about sports events,\ + \ athletes, or related topics. \nhealth: The text addresses health issues, medical\ + \ advancements, or related topics. \nentertainment: The text pertains to movies,\ + \ music, celebrities, or related topics. \ngeography: The text involves geographical\ + \ information, locations, or related topics. \n\nIf the text contains multiple topics,\ + \ choose the dominant topic. For ambiguous or unclear topics, select the category\ + \ that best reflects the overall content. 
Please provide a single classification\ + \ for each input.\n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_ary_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_arz.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_arz.yaml new file mode 100644 index 0000000000000000000000000000000000000000..100570428142a0f35ec728558251e96ec484ccb5 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_arz.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: arz_Arab +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Egyptian Arabic text. For each input, classify the topic as science/technology,\ + \ travel, politics, sports, health, entertainment, or geography. Use the following\ + \ guidelines: \n\n science/technology: The text discusses scientific discoveries,\ + \ technological advancements, or related topics. \ntravel: The text describes travel\ + \ experiences, destinations, or related topics. \npolitics: The text covers political\ + \ events, policies, or related topics. \nsports: The text talks about sports events,\ + \ athletes, or related topics. \nhealth: The text addresses health issues, medical\ + \ advancements, or related topics. \nentertainment: The text pertains to movies,\ + \ music, celebrities, or related topics. \ngeography: The text involves geographical\ + \ information, locations, or related topics. \n\nIf the text contains multiple topics,\ + \ choose the dominant topic. For ambiguous or unclear topics, select the category\ + \ that best reflects the overall content. 
Please provide a single classification\ + \ for each input.\n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_arz_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_bam.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_bam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bdc655003fc959e8219ae681b64b82dd137853d1 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_bam.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: bam_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Bambara text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_bam_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_bem.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_bem.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d42ea873b83e69d3d3d621a9ba1fafd7a88a4ab3 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_bem.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: bem_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Bemba text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_bem_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_cjk.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_cjk.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9623b8c52bb39ab7dfa0f743d5482169a462b7ab --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_cjk.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: cjk_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Chokwe text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_cjk_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_dik.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_dik.yaml new file mode 100644 index 0000000000000000000000000000000000000000..83e76e963fe2962b472b8312d35a79c4e14d2b55 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_dik.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: dik_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Southwestern Dinka text. For each input, classify the topic as science/technology,\ + \ travel, politics, sports, health, entertainment, or geography. Use the following\ + \ guidelines: \n\n science/technology: The text discusses scientific discoveries,\ + \ technological advancements, or related topics. \ntravel: The text describes travel\ + \ experiences, destinations, or related topics. \npolitics: The text covers political\ + \ events, policies, or related topics. \nsports: The text talks about sports events,\ + \ athletes, or related topics. \nhealth: The text addresses health issues, medical\ + \ advancements, or related topics. \nentertainment: The text pertains to movies,\ + \ music, celebrities, or related topics. \ngeography: The text involves geographical\ + \ information, locations, or related topics. \n\nIf the text contains multiple topics,\ + \ choose the dominant topic. For ambiguous or unclear topics, select the category\ + \ that best reflects the overall content. 
Please provide a single classification\ + \ for each input.\n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_dik_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_dyu.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_dyu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8ab215e89f959e6edf4bd07d1729f0424e85e0a8 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_dyu.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: dyu_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Dyula text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_dyu_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_eng.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_eng.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a17a006d21d32ce9901010cbdcd94aad3af933f4 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_eng.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: eng_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ English text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_eng_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_ewe.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_ewe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..195876998160addea6184dbc4d3375192068aec5 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_ewe.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: ewe_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Ewe text. For each input, classify the topic as science/technology, travel, politics,\ + \ sports, health, entertainment, or geography. Use the following guidelines: \n\n\ + \ science/technology: The text discusses scientific discoveries, technological advancements,\ + \ or related topics. \ntravel: The text describes travel experiences, destinations,\ + \ or related topics. \npolitics: The text covers political events, policies, or\ + \ related topics. \nsports: The text talks about sports events, athletes, or related\ + \ topics. \nhealth: The text addresses health issues, medical advancements, or related\ + \ topics. \nentertainment: The text pertains to movies, music, celebrities, or related\ + \ topics. \ngeography: The text involves geographical information, locations, or\ + \ related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_ewe_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_fon.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_fon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..61980b5110a424ed7391b29dbedf7f7828563f03 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_fon.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: fon_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Fon text. For each input, classify the topic as science/technology, travel, politics,\ + \ sports, health, entertainment, or geography. Use the following guidelines: \n\n\ + \ science/technology: The text discusses scientific discoveries, technological advancements,\ + \ or related topics. \ntravel: The text describes travel experiences, destinations,\ + \ or related topics. \npolitics: The text covers political events, policies, or\ + \ related topics. \nsports: The text talks about sports events, athletes, or related\ + \ topics. \nhealth: The text addresses health issues, medical advancements, or related\ + \ topics. \nentertainment: The text pertains to movies, music, celebrities, or related\ + \ topics. \ngeography: The text involves geographical information, locations, or\ + \ related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_fon_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_fra.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_fra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..29573054bcfa08ddf225f925cb6131b6d4909163 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_fra.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: fra_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ French text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_fra_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_fuv.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_fuv.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3b48f9f4e09e7042dfdec6ffa224300ee824b580 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_fuv.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: fuv_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Nigerian Fulfulde text. For each input, classify the topic as science/technology,\ + \ travel, politics, sports, health, entertainment, or geography. Use the following\ + \ guidelines: \n\n science/technology: The text discusses scientific discoveries,\ + \ technological advancements, or related topics. \ntravel: The text describes travel\ + \ experiences, destinations, or related topics. \npolitics: The text covers political\ + \ events, policies, or related topics. \nsports: The text talks about sports events,\ + \ athletes, or related topics. \nhealth: The text addresses health issues, medical\ + \ advancements, or related topics. \nentertainment: The text pertains to movies,\ + \ music, celebrities, or related topics. \ngeography: The text involves geographical\ + \ information, locations, or related topics. \n\nIf the text contains multiple topics,\ + \ choose the dominant topic. For ambiguous or unclear topics, select the category\ + \ that best reflects the overall content. 
Please provide a single classification\ + \ for each input.\n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_fuv_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_gaz.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_gaz.yaml new file mode 100644 index 0000000000000000000000000000000000000000..37e2a4f97793217ecbd9f8c551d585901366cd34 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_gaz.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: gaz_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ West Central Oromo text. For each input, classify the topic as science/technology,\ + \ travel, politics, sports, health, entertainment, or geography. Use the following\ + \ guidelines: \n\n science/technology: The text discusses scientific discoveries,\ + \ technological advancements, or related topics. \ntravel: The text describes travel\ + \ experiences, destinations, or related topics. \npolitics: The text covers political\ + \ events, policies, or related topics. \nsports: The text talks about sports events,\ + \ athletes, or related topics. \nhealth: The text addresses health issues, medical\ + \ advancements, or related topics. \nentertainment: The text pertains to movies,\ + \ music, celebrities, or related topics. \ngeography: The text involves geographical\ + \ information, locations, or related topics. \n\nIf the text contains multiple topics,\ + \ choose the dominant topic. For ambiguous or unclear topics, select the category\ + \ that best reflects the overall content. 
Please provide a single classification\ + \ for each input.\n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_gaz_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_hau.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_hau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..24ce0970e9923725e2255abe09dc6e1629c9d23f --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_hau.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: hau_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Hausa text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_hau_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_ibo.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_ibo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a39ee75cb90a18ddb12bbedc54fd82a4c4c45ded --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_ibo.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: ibo_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Igbo text. For each input, classify the topic as science/technology, travel, politics,\ + \ sports, health, entertainment, or geography. Use the following guidelines: \n\n\ + \ science/technology: The text discusses scientific discoveries, technological advancements,\ + \ or related topics. \ntravel: The text describes travel experiences, destinations,\ + \ or related topics. \npolitics: The text covers political events, policies, or\ + \ related topics. \nsports: The text talks about sports events, athletes, or related\ + \ topics. \nhealth: The text addresses health issues, medical advancements, or related\ + \ topics. \nentertainment: The text pertains to movies, music, celebrities, or related\ + \ topics. \ngeography: The text involves geographical information, locations, or\ + \ related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_ibo_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_kab.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_kab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d29da033388668302aca70c3511d52377e8797d9 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_kab.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: kab_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Kabyle text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_kab_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_kam.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_kam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1e55d1218586efb9f8d0b9bad3e9c2c76727a74b --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_kam.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: kam_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Kamba text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_kam_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_kbp.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_kbp.yaml new file mode 100644 index 0000000000000000000000000000000000000000..210baea8685a19c4e6bee6bcd141f2f0cb2a101a --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_kbp.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: kbp_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Kabiye text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_kbp_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_kea.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_kea.yaml new file mode 100644 index 0000000000000000000000000000000000000000..34a6813c8eff67f57adff7f43c982006b431ccf6 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_kea.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: kea_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Kabuverdianu text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_kea_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_kik.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_kik.yaml new file mode 100644 index 0000000000000000000000000000000000000000..55fdcb00e3f003422315e8de2ef64c8aa9e0abbc --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_kik.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: kik_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Kikuyu text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_kik_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_kin.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_kin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6567d52bf1f47beaba19141ab2f95b8168298290 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_kin.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: kin_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Kinyarwanda text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_kin_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_kmb.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_kmb.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4ae05cd06aef47942e20bbf72092cd23a5b4fb2f --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_kmb.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: kmb_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Kimbundu text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_kmb_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_knc.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_knc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9870bd64740561b60f69af908c01ab221585d3fc --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_knc.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: knc_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Central Kanuri text. For each input, classify the topic as science/technology,\ + \ travel, politics, sports, health, entertainment, or geography. Use the following\ + \ guidelines: \n\n science/technology: The text discusses scientific discoveries,\ + \ technological advancements, or related topics. \ntravel: The text describes travel\ + \ experiences, destinations, or related topics. \npolitics: The text covers political\ + \ events, policies, or related topics. \nsports: The text talks about sports events,\ + \ athletes, or related topics. \nhealth: The text addresses health issues, medical\ + \ advancements, or related topics. \nentertainment: The text pertains to movies,\ + \ music, celebrities, or related topics. \ngeography: The text involves geographical\ + \ information, locations, or related topics. \n\nIf the text contains multiple topics,\ + \ choose the dominant topic. For ambiguous or unclear topics, select the category\ + \ that best reflects the overall content. 
Please provide a single classification\ + \ for each input.\n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_knc_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_kon.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_kon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..afcab8b8dd34cfa63b1653223c05170789dddc10 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_kon.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: kon_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Kikongo text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_kon_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_lin.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_lin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c1611fd77106ae1eeef87ff5fcce60221ef8039 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_lin.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: lin_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Lingala text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_lin_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_lua.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_lua.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b3b2b9edcb68cf91b5c63273e07bedc936a7ffe1 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_lua.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: lua_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Luba-Kasai text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_lua_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_lug.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_lug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f8ca880ace46c5220acdcf2fb5d47bf37e2791aa --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_lug.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: lug_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Luganda text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_lug_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_luo.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_luo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b942d69d0cf5a367213aca5cc437b26015c83ae3 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_luo.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: luo_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Luo text. For each input, classify the topic as science/technology, travel, politics,\ + \ sports, health, entertainment, or geography. Use the following guidelines: \n\n\ + \ science/technology: The text discusses scientific discoveries, technological advancements,\ + \ or related topics. \ntravel: The text describes travel experiences, destinations,\ + \ or related topics. \npolitics: The text covers political events, policies, or\ + \ related topics. \nsports: The text talks about sports events, athletes, or related\ + \ topics. \nhealth: The text addresses health issues, medical advancements, or related\ + \ topics. \nentertainment: The text pertains to movies, music, celebrities, or related\ + \ topics. \ngeography: The text involves geographical information, locations, or\ + \ related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_luo_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_mos.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_mos.yaml new file mode 100644 index 0000000000000000000000000000000000000000..daccd62e9345db0c0625c7a26df993fe4f528411 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_mos.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: mos_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Mossi text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_mos_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_nso.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_nso.yaml new file mode 100644 index 0000000000000000000000000000000000000000..09936e3c333c1cb2c285fc056d6da25246bcfefe --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_nso.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: nso_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Northern Sotho text. For each input, classify the topic as science/technology,\ + \ travel, politics, sports, health, entertainment, or geography. Use the following\ + \ guidelines: \n\n science/technology: The text discusses scientific discoveries,\ + \ technological advancements, or related topics. \ntravel: The text describes travel\ + \ experiences, destinations, or related topics. \npolitics: The text covers political\ + \ events, policies, or related topics. \nsports: The text talks about sports events,\ + \ athletes, or related topics. \nhealth: The text addresses health issues, medical\ + \ advancements, or related topics. \nentertainment: The text pertains to movies,\ + \ music, celebrities, or related topics. \ngeography: The text involves geographical\ + \ information, locations, or related topics. \n\nIf the text contains multiple topics,\ + \ choose the dominant topic. For ambiguous or unclear topics, select the category\ + \ that best reflects the overall content. 
Please provide a single classification\ + \ for each input.\n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_nso_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_nus.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_nus.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a5f8e101910ba130d16708ce63388b4808286236 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_nus.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: nus_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Nuer text. For each input, classify the topic as science/technology, travel, politics,\ + \ sports, health, entertainment, or geography. Use the following guidelines: \n\n\ + \ science/technology: The text discusses scientific discoveries, technological advancements,\ + \ or related topics. \ntravel: The text describes travel experiences, destinations,\ + \ or related topics. \npolitics: The text covers political events, policies, or\ + \ related topics. \nsports: The text talks about sports events, athletes, or related\ + \ topics. \nhealth: The text addresses health issues, medical advancements, or related\ + \ topics. \nentertainment: The text pertains to movies, music, celebrities, or related\ + \ topics. \ngeography: The text involves geographical information, locations, or\ + \ related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_nus_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_nya.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_nya.yaml new file mode 100644 index 0000000000000000000000000000000000000000..65737777ba6914aa1a735a260ee7ce7e3bfa9754 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_nya.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: nya_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Nyanga text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_nya_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_plt.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_plt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..24f6ea33e114f92a89b2b08581c3e2d93985362f --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_plt.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: plt_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Plateau Malagasy text. For each input, classify the topic as science/technology,\ + \ travel, politics, sports, health, entertainment, or geography. Use the following\ + \ guidelines: \n\n science/technology: The text discusses scientific discoveries,\ + \ technological advancements, or related topics. \ntravel: The text describes travel\ + \ experiences, destinations, or related topics. \npolitics: The text covers political\ + \ events, policies, or related topics. \nsports: The text talks about sports events,\ + \ athletes, or related topics. \nhealth: The text addresses health issues, medical\ + \ advancements, or related topics. \nentertainment: The text pertains to movies,\ + \ music, celebrities, or related topics. \ngeography: The text involves geographical\ + \ information, locations, or related topics. \n\nIf the text contains multiple topics,\ + \ choose the dominant topic. For ambiguous or unclear topics, select the category\ + \ that best reflects the overall content. 
Please provide a single classification\ + \ for each input.\n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_plt_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_por.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_por.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d98ee118637f21a2ff1ffa30cd84099327965cbc --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_por.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: por_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Portuguese text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_por_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_run.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_run.yaml new file mode 100644 index 0000000000000000000000000000000000000000..01820da52cbd3b30dfe1c65a052b47ce1af0c7c0 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_run.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: run_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Rundi text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_run_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_sag.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_sag.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fdabdcb63ca35cc8fd419558911099c7d8f14877 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_sag.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: sag_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Sango text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_sag_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_sna.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_sna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3d66f53a7736d27346e47675426abfd4b63b6388 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_sna.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: sna_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Shona text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_sna_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_som.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_som.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c0c34f97d20b7c7e4b7be5b7225bf6a91baec3e0 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_som.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: som_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Somali text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_som_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_sot.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_sot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..81ab5c3f7e66f457d59edef55eb79c693c18913d --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_sot.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: sot_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Southern Sotho text. For each input, classify the topic as science/technology,\ + \ travel, politics, sports, health, entertainment, or geography. Use the following\ + \ guidelines: \n\n science/technology: The text discusses scientific discoveries,\ + \ technological advancements, or related topics. \ntravel: The text describes travel\ + \ experiences, destinations, or related topics. \npolitics: The text covers political\ + \ events, policies, or related topics. \nsports: The text talks about sports events,\ + \ athletes, or related topics. \nhealth: The text addresses health issues, medical\ + \ advancements, or related topics. \nentertainment: The text pertains to movies,\ + \ music, celebrities, or related topics. \ngeography: The text involves geographical\ + \ information, locations, or related topics. \n\nIf the text contains multiple topics,\ + \ choose the dominant topic. For ambiguous or unclear topics, select the category\ + \ that best reflects the overall content. 
Please provide a single classification\ + \ for each input.\n\ntext: {{text}} \\category: \n\n" +include: sib +task: sib_sot_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_ssw.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_ssw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f662d2ab44cebe8ec184c7864207b9ceafa95f58 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_ssw.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: ssw_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Swazi text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_ssw_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_swa.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_swa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ee47ca51598f6455804e6e6cad3fb1ca1cacc4d6 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_swa.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: swh_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Swahili text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_swa_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_taq.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_taq.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3fa1380df4b219fae78d9af94069a3becc256832 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_taq.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: taq_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Tamasheq text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_taq_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_tir.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_tir.yaml new file mode 100644 index 0000000000000000000000000000000000000000..20ec0638837c561c45b0267d47a2a7481a3e9ec7 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_tir.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: tir_Ethi +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Tigrinya text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_tir_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_tso.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_tso.yaml new file mode 100644 index 0000000000000000000000000000000000000000..44b3b867a796111bbcfe2d295ff5c04435878208 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_tso.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: tso_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Tsonga text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_tso_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_tum.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_tum.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bb15fb71e821e69e255b36183d2273a75292fd60 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_tum.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: tum_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Tumbuka text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_tum_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_twi.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_twi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..44bca6194bc19417067ce10659958d0c5993ad87 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_twi.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: twi_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Twi text. For each input, classify the topic as science/technology, travel, politics,\ + \ sports, health, entertainment, or geography. Use the following guidelines: \n\n\ + \ science/technology: The text discusses scientific discoveries, technological advancements,\ + \ or related topics. \ntravel: The text describes travel experiences, destinations,\ + \ or related topics. \npolitics: The text covers political events, policies, or\ + \ related topics. \nsports: The text talks about sports events, athletes, or related\ + \ topics. \nhealth: The text addresses health issues, medical advancements, or related\ + \ topics. \nentertainment: The text pertains to movies, music, celebrities, or related\ + \ topics. \ngeography: The text involves geographical information, locations, or\ + \ related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_twi_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_tzm.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_tzm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5d1af77d17ccc2e287c4e598b0094383ec5e4b01 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_tzm.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: tzm_Tfng +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Tamazight text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_tzm_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_umb.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_umb.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a756680cfa29d6b4c83363192d734196989b2d45 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_umb.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: umb_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Umbundu text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_umb_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_wol.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_wol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8062b55d71066a62df167f96c8e6a72b67e51b60 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_wol.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: wol_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Wolof text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_wol_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_xho.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_xho.yaml new file mode 100644 index 0000000000000000000000000000000000000000..22c27b71a7e08c4f9c58039f5f52a10df324a878 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_xho.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: xho_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Xhosa text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_xho_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_yor.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_yor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..df51978255654580b37eca4e552e4561a6455e69 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_yor.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: yor_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Yoruba text. For each input, classify the topic as science/technology, travel,\ + \ politics, sports, health, entertainment, or geography. Use the following guidelines:\ + \ \n\n science/technology: The text discusses scientific discoveries, technological\ + \ advancements, or related topics. \ntravel: The text describes travel experiences,\ + \ destinations, or related topics. \npolitics: The text covers political events,\ + \ policies, or related topics. \nsports: The text talks about sports events, athletes,\ + \ or related topics. \nhealth: The text addresses health issues, medical advancements,\ + \ or related topics. \nentertainment: The text pertains to movies, music, celebrities,\ + \ or related topics. \ngeography: The text involves geographical information, locations,\ + \ or related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_yor_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/sib_zul.yaml b/lm_eval/tasks/afrobench/sib/prompt_5/sib_zul.yaml new file mode 100644 index 0000000000000000000000000000000000000000..03fb9af917049b8dac781d9aefac58ebd3fe4dba --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/sib_zul.yaml @@ -0,0 +1,18 @@ +# Generated by utils.py +dataset_name: zul_Latn +doc_to_text: "You are tasked with performing topic classification on the following\ + \ Zulu text. For each input, classify the topic as science/technology, travel, politics,\ + \ sports, health, entertainment, or geography. Use the following guidelines: \n\n\ + \ science/technology: The text discusses scientific discoveries, technological advancements,\ + \ or related topics. \ntravel: The text describes travel experiences, destinations,\ + \ or related topics. \npolitics: The text covers political events, policies, or\ + \ related topics. \nsports: The text talks about sports events, athletes, or related\ + \ topics. \nhealth: The text addresses health issues, medical advancements, or related\ + \ topics. \nentertainment: The text pertains to movies, music, celebrities, or related\ + \ topics. \ngeography: The text involves geographical information, locations, or\ + \ related topics. \n\nIf the text contains multiple topics, choose the dominant\ + \ topic. For ambiguous or unclear topics, select the category that best reflects\ + \ the overall content. 
Please provide a single classification for each input.\n\n\ + text: {{text}} \\category: \n\n" +include: sib +task: sib_zul_prompt_5 diff --git a/lm_eval/tasks/afrobench/sib/prompt_5/utils.py b/lm_eval/tasks/afrobench/sib/prompt_5/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/prompt_5/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/sib/sib.yaml b/lm_eval/tasks/afrobench/sib/sib.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d6935fee28978fd5f7efb02afd1a54dac363d111 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/sib.yaml @@ -0,0 +1,13 @@ +group: sib +task: + - sib_prompt_1 + - sib_prompt_2 + - sib_prompt_3 + - sib_prompt_4 + - sib_prompt_5 +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 1 diff --git a/lm_eval/tasks/afrobench/sib/utils.py b/lm_eval/tasks/afrobench/sib/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d99649e343fa4c491c77cb3167c89cc09907f579 --- /dev/null +++ b/lm_eval/tasks/afrobench/sib/utils.py @@ -0,0 +1,227 @@ +import argparse +import os + +import yaml + + +class FunctionTag: + def __init__(self, value): + self.value = value + + +def prompt_func(mode, lang): + prompt_map = { + "prompt_1": "Given the categories science/technology, travel, politics, sports, health, entertainment, or geography; what category does the text: '{{text}}' belong to: \n\n", + "prompt_2": f"Does this {lang} topic; " + "'{{text}}' belong to one of the following categories: science/technology, travel, politics, sports, health, entertainment, or geography? category only\n\n", + "prompt_3": f"You are an assistant able to classify topics in texts. 
\n\n" + f"Given the categories science/technology, travel, politics, sports, health, entertainment, or geography; what is " + f"the topic of the {lang} statement below? Return only the category. " + "\n\ntext: {{text}} \category:\n\n", + "prompt_4": "Label the following text as science/technology, travel, politics, sports, health, entertainment, or geography. Provide only the category as your " + "response. \n\ntext: {{text}} \category: \n\n", + "prompt_5": f"You are tasked with performing topic classification on the following {lang} text. " + f"For each input, classify the topic as science/technology, travel, politics, sports, health, entertainment, or geography. " + f"Use the following guidelines: \n\n " + f"science/technology: The text discusses scientific discoveries, technological advancements, or related topics. \n" + f"travel: The text describes travel experiences, destinations, or related topics. \n" + f"politics: The text covers political events, policies, or related topics. \n" + f"sports: The text talks about sports events, athletes, or related topics. \n" + f"health: The text addresses health issues, medical advancements, or related topics. \n" + f"entertainment: The text pertains to movies, music, celebrities, or related topics. \n" + f"geography: The text involves geographical information, locations, or related topics. \n\n" + f"If the text contains multiple topics, choose the dominant topic. " + f"For ambiguous or unclear topics, select the category that best reflects the overall content. " + "Please provide a single classification for each input.\n\ntext: {{text}} \category: \n\n", + } + return prompt_map[mode] + + +def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: + """ + Generate a yaml file for each language. + + :param output_dir: The directory to output the files to. + :param overwrite: Whether to overwrite files if they already exist. 
+ """ + err = [] + languages = { + "aeb": "Tunisian Arabic", + "afr": "Afrikaans", + "aka": "Akan", + "amh": "Amharic", + "ary": "Moroccan Arabic", + "arz": "Egyptian Arabic", + "bam": "Bambara", + "bem": "Bemba", + "cjk": "Chokwe", + "dik": "Southwestern Dinka", + "dyu": "Dyula", + "eng": "English", + "ewe": "Ewe", + "fon": "Fon", + "fra": "French", + "fuv": "Nigerian Fulfulde", + "gaz": "West Central Oromo", + "hau": "Hausa", + "ibo": "Igbo", + "kab": "Kabyle", + "kam": "Kamba", + "kmb": "Kimbundu", + "kbp": "Kabiye", + "kea": "Kabuverdianu", + "kik": "Kikuyu", + "kin": "Kinyarwanda", + "kon": "Kikongo", + "knc": "Central Kanuri", + "lua": "Luba-Kasai", + "lug": "Luganda", + "luo": "Luo", + "lin": "Lingala", + "mos": "Mossi", + "nus": "Nuer", + "nso": "Northern Sotho", + "nya": "Nyanga", + "plt": "Plateau Malagasy", + "por": "Portuguese", + "run": "Rundi", + "sag": "Sango", + "sna": "Shona", + "som": "Somali", + "sot": "Southern Sotho", + "ssw": "Swazi", + "swa": "Swahili", + "taq": "Tamasheq", + "tir": "Tigrinya", + "tum": "Tumbuka", + "tso": "Tsonga", + "twi": "Twi", + "tzm": "Tamazight", + "umb": "Umbundu", + "wol": "Wolof", + "xho": "Xhosa", + "yor": "Yoruba", + "zul": "Zulu", + } + + lang_2_dataset_lang_code = { + "aeb": "aeb_Arab", + "afr": "afr_Latn", + "aka": "aka_Latn", + "amh": "amh_Ethi", + "ary": "ary_Arab", + "arz": "arz_Arab", + "bam": "bam_Latn", + "bem": "bem_Latn", + "cjk": "cjk_Latn", + "dik": "dik_Latn", + "dyu": "dyu_Latn", + "eng": "eng_Latn", + "ewe": "ewe_Latn", + "fon": "fon_Latn", + "fra": "fra_Latn", + "fuv": "fuv_Latn", + "gaz": "gaz_Latn", + "hau": "hau_Latn", + "ibo": "ibo_Latn", + "kab": "kab_Latn", + "kam": "kam_Latn", + "kmb": "kmb_Latn", + "kbp": "kbp_Latn", + "kea": "kea_Latn", + "kik": "kik_Latn", + "kin": "kin_Latn", + "kon": "kon_Latn", + "knc": "knc_Latn", + "lua": "lua_Latn", + "lug": "lug_Latn", + "luo": "luo_Latn", + "lin": "lin_Latn", + "mos": "mos_Latn", + "nus": "nus_Latn", + "nso": "nso_Latn", + "nya": "nya_Latn", + 
"plt": "plt_Latn", + "por": "por_Latn", + "run": "run_Latn", + "sag": "sag_Latn", + "sna": "sna_Latn", + "som": "som_Latn", + "sot": "sot_Latn", + "ssw": "ssw_Latn", + "swa": "swh_Latn", + "taq": "taq_Latn", + "tir": "tir_Ethi", + "tum": "tum_Latn", + "tso": "tso_Latn", + "twi": "twi_Latn", + "tzm": "tzm_Tfng", + "umb": "umb_Latn", + "wol": "wol_Latn", + "xho": "xho_Latn", + "yor": "yor_Latn", + "zul": "zul_Latn", + } + + for lang in languages.keys(): + try: + file_name = f"sib_{lang}.yaml" + task_name = f"sib_{lang}_{mode}" + yaml_template = "sib" + yaml_details = { + "include": yaml_template, + "task": task_name, + "dataset_name": lang_2_dataset_lang_code[lang], + "doc_to_text": prompt_func(mode, languages[lang]), + } + file_path = os.path.join(output_dir, mode) + os.makedirs(file_path, exist_ok=True) + + with open( + f"{output_dir}/{mode}/{file_name}", + "w" if overwrite else "x", + encoding="utf8", + ) as f: + f.write("# Generated by utils.py\n") + yaml.dump( + yaml_details, + f, + allow_unicode=True, + ) + except FileExistsError: + err.append(file_name) + + if len(err) > 0: + raise FileExistsError( + "Files were not created because they already exist (use --overwrite flag):" + f" {', '.join(err)}" + ) + + +def main() -> None: + """Parse CLI args and generate language-specific yaml files.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--overwrite", + default=True, + action="store_true", + help="Overwrite files if they already exist", + ) + parser.add_argument( + "--output-dir", + default="./", + help="Directory to write yaml files to", + ) + parser.add_argument( + "--mode", + default="prompt_3", + choices=["prompt_1", "prompt_2", "prompt_3", "prompt_4", "prompt_5"], + help="Prompt number", + ) + args = parser.parse_args() + + gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite, mode=args.mode) + + +if __name__ == "__main__": + main() diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/README.md 
b/lm_eval/tasks/afrobench/uhura-arc-easy/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a0253f987e3723c309bcb5ce4c9a9ad2b3a166ec --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/README.md @@ -0,0 +1,25 @@ +# + +## Paper +Title: `Uhura: A Benchmark for Evaluating Scientific Question Answering and Truthfulness in Low-Resource African Languages` + +Paper Link: https://arxiv.org/abs/2412.00948 + +## Abstract +>Evaluations of Large Language Models (LLMs) on knowledge-intensive tasks and factual accuracy often focus on high-resource languages primarily because datasets for low-resource languages (LRLs) are scarce. In this paper, we present Uhura -- a new benchmark that focuses on two tasks in six typologically-diverse African languages, created via human translation of existing English benchmarks. The first dataset, Uhura-ARC-Easy, is composed of multiple-choice science questions. The second, Uhura-TruthfulQA, is a safety benchmark testing the truthfulness of models on topics including health, law, finance, and politics. We highlight the challenges creating benchmarks with highly technical content for LRLs and outline mitigation strategies. Our evaluation reveals a significant performance gap between proprietary models such as GPT-4o and o1-preview, and Claude models, and open-source models like Meta's LLaMA and Google's Gemma. Additionally, all models perform better in English than in African languages. These results indicate that LMs struggle with answering scientific questions and are more prone to generating false claims in low-resource African languages. Our findings underscore the necessity for continuous improvement of multilingual LM capabilities in LRL settings to ensure safe and reliable use in real-world contexts. We open-source the Uhura Benchmark and Uhura Platform to foster further research and development in NLP for LRLs. 
+ +HomePage: https://huggingface.co/datasets/masakhane/uhura-arc-easy + +### Citation + +``` +@misc{bayes2024uhurabenchmarkevaluatingscientific, + title={Uhura: A Benchmark for Evaluating Scientific Question Answering and Truthfulness in Low-Resource African Languages}, + author={Edward Bayes and Israel Abebe Azime and Jesujoba O. Alabi and Jonas Kgomo and Tyna Eloundou and Elizabeth Proehl and Kai Chen and Imaan Khadir and Naome A. Etori and Shamsuddeen Hassan Muhammad and Choice Mpanza and Igneciah Pocia Thete and Dietrich Klakow and David Ifeoluwa Adelani}, + year={2024}, + eprint={2412.00948}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2412.00948}, +} +``` diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_1/uhura-arc-easy b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_1/uhura-arc-easy new file mode 100644 index 0000000000000000000000000000000000000000..a7e37181359d9021a3dbb669c42a0e80e0b36c8f --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_1/uhura-arc-easy @@ -0,0 +1,39 @@ +tag: + - uhura_arc_easy_tasks + - uhura_arc_easy_prompt_1 +task: null +dataset_path: masakhane/uhura-arc-easy +dataset_name: null +output_type: multiple_choice +doc_to_target: "{{['A', 'B', 'C', 'D'].index(answerKey)}}" +doc_to_choice: + - A + - B + - C + - D +test_split: test +fewshot_split: validation +should_decontaminate: false +doc_to_decontamination_query: "Question: {{question}}\nAnswer:" +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_1/uhura-arc-easy_am.yaml 
b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_1/uhura-arc-easy_am.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f61efe4ea501ae6ee9c7553a05bdb7d8540c08f7 --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_1/uhura-arc-easy_am.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: am_multiple_choice +doc_to_text: "You are a virtual assistant that answers multiple-choice questions with\ + \ the correct option only.\n\nQuestion: {{question}}\n\nChoices:\n\n{% for i in\ + \ range(choices['text']|length) %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i] }}\n\ + {% endfor %}\nAnswer: " +include: uhura-arc-easy +task: uhura-arc-easy_am_prompt_1 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_1/uhura-arc-easy_en.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_1/uhura-arc-easy_en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f1b879e0221b72d2da47f4fe033f83f20526fef2 --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_1/uhura-arc-easy_en.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: en_multiple_choice +doc_to_text: "You are a virtual assistant that answers multiple-choice questions with\ + \ the correct option only.\n\nQuestion: {{question}}\n\nChoices:\n\n{% for i in\ + \ range(choices['text']|length) %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i] }}\n\ + {% endfor %}\nAnswer: " +include: uhura-arc-easy +task: uhura-arc-easy_en_prompt_1 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_1/uhura-arc-easy_ha.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_1/uhura-arc-easy_ha.yaml new file mode 100644 index 0000000000000000000000000000000000000000..986ac5074660ef3c9756e1112e8aa5f34eafefe2 --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_1/uhura-arc-easy_ha.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: ha_multiple_choice +doc_to_text: "You are a virtual assistant that answers multiple-choice questions with\ + \ the correct 
option only.\n\nQuestion: {{question}}\n\nChoices:\n\n{% for i in\ + \ range(choices['text']|length) %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i] }}\n\ + {% endfor %}\nAnswer: " +include: uhura-arc-easy +task: uhura-arc-easy_ha_prompt_1 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_1/uhura-arc-easy_nso.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_1/uhura-arc-easy_nso.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ead6d97d67ec6f76e5378899e979413b6f6bb41b --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_1/uhura-arc-easy_nso.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: nso_multiple_choice +doc_to_text: "You are a virtual assistant that answers multiple-choice questions with\ + \ the correct option only.\n\nQuestion: {{question}}\n\nChoices:\n\n{% for i in\ + \ range(choices['text']|length) %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i] }}\n\ + {% endfor %}\nAnswer: " +fewshot_split: train +include: uhura-arc-easy +task: uhura-arc-easy_nso_prompt_1 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_1/uhura-arc-easy_sw.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_1/uhura-arc-easy_sw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2e07bb234736d67e91cbcf798d6992fda2f438ec --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_1/uhura-arc-easy_sw.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: sw_multiple_choice +doc_to_text: "You are a virtual assistant that answers multiple-choice questions with\ + \ the correct option only.\n\nQuestion: {{question}}\n\nChoices:\n\n{% for i in\ + \ range(choices['text']|length) %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i] }}\n\ + {% endfor %}\nAnswer: " +include: uhura-arc-easy +task: uhura-arc-easy_sw_prompt_1 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_1/uhura-arc-easy_yo.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_1/uhura-arc-easy_yo.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..f96113e4a5d3712fec89aab27fb309c0b85551b3 --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_1/uhura-arc-easy_yo.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: yo_multiple_choice +doc_to_text: "You are a virtual assistant that answers multiple-choice questions with\ + \ the correct option only.\n\nQuestion: {{question}}\n\nChoices:\n\n{% for i in\ + \ range(choices['text']|length) %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i] }}\n\ + {% endfor %}\nAnswer: " +include: uhura-arc-easy +task: uhura-arc-easy_yo_prompt_1 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_1/uhura-arc-easy_zu.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_1/uhura-arc-easy_zu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..41c965a071018685e0539ae8ee0f18389d4a0d01 --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_1/uhura-arc-easy_zu.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: zu_multiple_choice +doc_to_text: "You are a virtual assistant that answers multiple-choice questions with\ + \ the correct option only.\n\nQuestion: {{question}}\n\nChoices:\n\n{% for i in\ + \ range(choices['text']|length) %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i] }}\n\ + {% endfor %}\nAnswer: " +fewshot_split: train +include: uhura-arc-easy +task: uhura-arc-easy_zu_prompt_1 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_1/utils.py b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_1/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_1/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_2/uhura-arc-easy b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_2/uhura-arc-easy new file mode 100644 index 
0000000000000000000000000000000000000000..295d9c8e907f841f74f8e9b7253d52c6eee2b224 --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_2/uhura-arc-easy @@ -0,0 +1,38 @@ +tag: + - uhura_arc_easy_tasks + - uhura_arc_easy_prompt_2 +dataset_path: masakhane/uhura-arc-easy +dataset_name: null +output_type: multiple_choice +doc_to_target: "{{['A', 'B', 'C', 'D'].index(answerKey)}}" +doc_to_choice: + - A + - B + - C + - D +test_split: test +fewshot_split: validation +should_decontaminate: false +doc_to_decontamination_query: "Question: {{question}}\nAnswer:" +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_2/uhura-arc-easy_am.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_2/uhura-arc-easy_am.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2596bd487078859572e018d19f6f39c5e32f3dc5 --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_2/uhura-arc-easy_am.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: am_multiple_choice +doc_to_text: "Choose the correct option that answers the question below:\n\nQuestion:\ + \ {{question}}\n\nChoices:\n\n{% for i in range(choices['text']|length) %}\t{{ 'ABCD'[i]\ + \ }}: {{ choices['text'][i] }}\n{% endfor %}\nAnswer: " +include: uhura-arc-easy +task: uhura-arc-easy_am_prompt_2 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_2/uhura-arc-easy_en.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_2/uhura-arc-easy_en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f3edfc10ea1ada6252d938e57aed3a1f03ade802 --- 
/dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_2/uhura-arc-easy_en.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: en_multiple_choice +doc_to_text: "Choose the correct option that answers the question below:\n\nQuestion:\ + \ {{question}}\n\nChoices:\n\n{% for i in range(choices['text']|length) %}\t{{ 'ABCD'[i]\ + \ }}: {{ choices['text'][i] }}\n{% endfor %}\nAnswer: " +include: uhura-arc-easy +task: uhura-arc-easy_en_prompt_2 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_2/uhura-arc-easy_ha.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_2/uhura-arc-easy_ha.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d857b2e44c5fb9a7da2ac1e8dfbb426978799073 --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_2/uhura-arc-easy_ha.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ha_multiple_choice +doc_to_text: "Choose the correct option that answers the question below:\n\nQuestion:\ + \ {{question}}\n\nChoices:\n\n{% for i in range(choices['text']|length) %}\t{{ 'ABCD'[i]\ + \ }}: {{ choices['text'][i] }}\n{% endfor %}\nAnswer: " +include: uhura-arc-easy +task: uhura-arc-easy_ha_prompt_2 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_2/uhura-arc-easy_nso.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_2/uhura-arc-easy_nso.yaml new file mode 100644 index 0000000000000000000000000000000000000000..93fbfe587dc9dfb45c08aaaeb3c6c3528d766110 --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_2/uhura-arc-easy_nso.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: nso_multiple_choice +doc_to_text: "Choose the correct option that answers the question below:\n\nQuestion:\ + \ {{question}}\n\nChoices:\n\n{% for i in range(choices['text']|length) %}\t{{ 'ABCD'[i]\ + \ }}: {{ choices['text'][i] }}\n{% endfor %}\nAnswer: " +fewshot_split: train +include: uhura-arc-easy +task: uhura-arc-easy_nso_prompt_2 diff --git 
a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_2/uhura-arc-easy_sw.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_2/uhura-arc-easy_sw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5fc929f54de32f0010844d2ed816cd3b634184f --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_2/uhura-arc-easy_sw.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sw_multiple_choice +doc_to_text: "Choose the correct option that answers the question below:\n\nQuestion:\ + \ {{question}}\n\nChoices:\n\n{% for i in range(choices['text']|length) %}\t{{ 'ABCD'[i]\ + \ }}: {{ choices['text'][i] }}\n{% endfor %}\nAnswer: " +include: uhura-arc-easy +task: uhura-arc-easy_sw_prompt_2 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_2/uhura-arc-easy_yo.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_2/uhura-arc-easy_yo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..67b09752a4491d345d61bbbba5219cc1f5001544 --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_2/uhura-arc-easy_yo.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: yo_multiple_choice +doc_to_text: "Choose the correct option that answers the question below:\n\nQuestion:\ + \ {{question}}\n\nChoices:\n\n{% for i in range(choices['text']|length) %}\t{{ 'ABCD'[i]\ + \ }}: {{ choices['text'][i] }}\n{% endfor %}\nAnswer: " +include: uhura-arc-easy +task: uhura-arc-easy_yo_prompt_2 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_2/uhura-arc-easy_zu.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_2/uhura-arc-easy_zu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4b261b51fcc11770a22eaf0bd8285b2028d416b9 --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_2/uhura-arc-easy_zu.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: zu_multiple_choice +doc_to_text: "Choose the correct option that answers the question below:\n\nQuestion:\ + \ 
{{question}}\n\nChoices:\n\n{% for i in range(choices['text']|length) %}\t{{ 'ABCD'[i]\ + \ }}: {{ choices['text'][i] }}\n{% endfor %}\nAnswer: " +fewshot_split: train +include: uhura-arc-easy +task: uhura-arc-easy_zu_prompt_2 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_2/utils.py b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_2/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_2/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_3/uhura-arc-easy b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_3/uhura-arc-easy new file mode 100644 index 0000000000000000000000000000000000000000..23e2c37396c75ce09fcd608e69d5ce42173df1ca --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_3/uhura-arc-easy @@ -0,0 +1,38 @@ +tag: + - uhura_arc_easy_tasks + - uhura_arc_easy_prompt_3 +dataset_path: masakhane/uhura-arc-easy +dataset_name: null +output_type: multiple_choice +doc_to_target: "{{['A', 'B', 'C', 'D'].index(answerKey)}}" +doc_to_choice: + - A + - B + - C + - D +test_split: test +fewshot_split: validation +should_decontaminate: false +doc_to_decontamination_query: "Question: {{question}}\nAnswer:" +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_3/uhura-arc-easy_am.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_3/uhura-arc-easy_am.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..42716a7cdc9b1ac266efbec32ebe4bebe6bf578e --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_3/uhura-arc-easy_am.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: am_multiple_choice +doc_to_text: "Answer the following multiple-choice question by picking 'A', 'B', 'C',\ + \ or 'D'.\n\nQuestion: {{question}}\n\nOptions:\n\n{% for i in range(choices['text']|length)\ + \ %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i] }}\n{% endfor %}\nAnswer: " +include: uhura-arc-easy +task: uhura-arc-easy_am_prompt_3 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_3/uhura-arc-easy_en.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_3/uhura-arc-easy_en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a89312e09a10b9db5a2a9a2a0914980c9ef686a5 --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_3/uhura-arc-easy_en.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: en_multiple_choice +doc_to_text: "Answer the following multiple-choice question by picking 'A', 'B', 'C',\ + \ or 'D'.\n\nQuestion: {{question}}\n\nOptions:\n\n{% for i in range(choices['text']|length)\ + \ %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i] }}\n{% endfor %}\nAnswer: " +include: uhura-arc-easy +task: uhura-arc-easy_en_prompt_3 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_3/uhura-arc-easy_ha.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_3/uhura-arc-easy_ha.yaml new file mode 100644 index 0000000000000000000000000000000000000000..de511a8af7dc52180b56d52a1c7d57955cbd6eb4 --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_3/uhura-arc-easy_ha.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ha_multiple_choice +doc_to_text: "Answer the following multiple-choice question by picking 'A', 'B', 'C',\ + \ or 'D'.\n\nQuestion: {{question}}\n\nOptions:\n\n{% for i in range(choices['text']|length)\ + \ %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i] }}\n{% endfor 
%}\nAnswer: " +include: uhura-arc-easy +task: uhura-arc-easy_ha_prompt_3 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_3/uhura-arc-easy_nso.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_3/uhura-arc-easy_nso.yaml new file mode 100644 index 0000000000000000000000000000000000000000..358d084cea2131f7e94f82ec733234371f2b8446 --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_3/uhura-arc-easy_nso.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: nso_multiple_choice +doc_to_text: "Answer the following multiple-choice question by picking 'A', 'B', 'C',\ + \ or 'D'.\n\nQuestion: {{question}}\n\nOptions:\n\n{% for i in range(choices['text']|length)\ + \ %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i] }}\n{% endfor %}\nAnswer: " +fewshot_split: train +include: uhura-arc-easy +task: uhura-arc-easy_nso_prompt_3 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_3/uhura-arc-easy_sw.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_3/uhura-arc-easy_sw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d4a8785d622caed597dbb0eefbd4290f2636f866 --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_3/uhura-arc-easy_sw.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sw_multiple_choice +doc_to_text: "Answer the following multiple-choice question by picking 'A', 'B', 'C',\ + \ or 'D'.\n\nQuestion: {{question}}\n\nOptions:\n\n{% for i in range(choices['text']|length)\ + \ %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i] }}\n{% endfor %}\nAnswer: " +include: uhura-arc-easy +task: uhura-arc-easy_sw_prompt_3 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_3/uhura-arc-easy_yo.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_3/uhura-arc-easy_yo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e9416362827513ac4b95cf843a83ec0d0efbc45e --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_3/uhura-arc-easy_yo.yaml @@ -0,0 +1,7 @@ +# Generated by 
utils.py +dataset_name: yo_multiple_choice +doc_to_text: "Answer the following multiple-choice question by picking 'A', 'B', 'C',\ + \ or 'D'.\n\nQuestion: {{question}}\n\nOptions:\n\n{% for i in range(choices['text']|length)\ + \ %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i] }}\n{% endfor %}\nAnswer: " +include: uhura-arc-easy +task: uhura-arc-easy_yo_prompt_3 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_3/uhura-arc-easy_zu.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_3/uhura-arc-easy_zu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6a44b8c0e6ebae36db00a6847aacb3263c84fb7b --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_3/uhura-arc-easy_zu.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: zu_multiple_choice +doc_to_text: "Answer the following multiple-choice question by picking 'A', 'B', 'C',\ + \ or 'D'.\n\nQuestion: {{question}}\n\nOptions:\n\n{% for i in range(choices['text']|length)\ + \ %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i] }}\n{% endfor %}\nAnswer: " +fewshot_split: train +include: uhura-arc-easy +task: uhura-arc-easy_zu_prompt_3 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_3/utils.py b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_3/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_3/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_4/uhura-arc-easy b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_4/uhura-arc-easy new file mode 100644 index 0000000000000000000000000000000000000000..e697f4c7363aee6c39b0d927ba9d1b575f4063d5 --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_4/uhura-arc-easy @@ -0,0 +1,38 @@ +tag: + - uhura_arc_easy_tasks + - uhura_arc_easy_prompt_4 +dataset_path: masakhane/uhura-arc-easy +dataset_name: null +output_type: 
multiple_choice +doc_to_target: "{{['A', 'B', 'C', 'D'].index(answerKey)}}" +doc_to_choice: + - A + - B + - C + - D +test_split: test +fewshot_split: validation +should_decontaminate: false +doc_to_decontamination_query: "Question: {{question}}\nAnswer:" +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_4/uhura-arc-easy_am.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_4/uhura-arc-easy_am.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4eaa02f59b217a8ce13f41fca67f9491aab917aa --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_4/uhura-arc-easy_am.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: am_multiple_choice +doc_to_text: "Question: {{question}}\n\nOptions:\n\n{% for i in range(choices['text']|length)\ + \ %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i] }}\n{% endfor %}\nAnswer: " +include: uhura-arc-easy +task: uhura-arc-easy_am_prompt_4 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_4/uhura-arc-easy_en.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_4/uhura-arc-easy_en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..461e6f9e7516e2382380e391f3d2d713bf494ef9 --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_4/uhura-arc-easy_en.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: en_multiple_choice +doc_to_text: "Question: {{question}}\n\nOptions:\n\n{% for i in range(choices['text']|length)\ + \ %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i] }}\n{% endfor %}\nAnswer: " +include: uhura-arc-easy +task: 
uhura-arc-easy_en_prompt_4 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_4/uhura-arc-easy_ha.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_4/uhura-arc-easy_ha.yaml new file mode 100644 index 0000000000000000000000000000000000000000..435ea73bc7639669ac53445f0ea9adf72edbd347 --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_4/uhura-arc-easy_ha.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: ha_multiple_choice +doc_to_text: "Question: {{question}}\n\nOptions:\n\n{% for i in range(choices['text']|length)\ + \ %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i] }}\n{% endfor %}\nAnswer: " +include: uhura-arc-easy +task: uhura-arc-easy_ha_prompt_4 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_4/uhura-arc-easy_nso.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_4/uhura-arc-easy_nso.yaml new file mode 100644 index 0000000000000000000000000000000000000000..09112d5af3adaf7251a39032aabf60af73779088 --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_4/uhura-arc-easy_nso.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: nso_multiple_choice +doc_to_text: "Question: {{question}}\n\nOptions:\n\n{% for i in range(choices['text']|length)\ + \ %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i] }}\n{% endfor %}\nAnswer: " +fewshot_split: train +include: uhura-arc-easy +task: uhura-arc-easy_nso_prompt_4 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_4/uhura-arc-easy_sw.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_4/uhura-arc-easy_sw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..264770eeda75acdf9088ac24e24645a9e5638c25 --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_4/uhura-arc-easy_sw.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: sw_multiple_choice +doc_to_text: "Question: {{question}}\n\nOptions:\n\n{% for i in range(choices['text']|length)\ + \ %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i] }}\n{% endfor %}\nAnswer: " +include: 
uhura-arc-easy +task: uhura-arc-easy_sw_prompt_4 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_4/uhura-arc-easy_yo.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_4/uhura-arc-easy_yo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..10af53de81d71c82f02f1da80bdc9b5dc114bfed --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_4/uhura-arc-easy_yo.yaml @@ -0,0 +1,6 @@ +# Generated by utils.py +dataset_name: yo_multiple_choice +doc_to_text: "Question: {{question}}\n\nOptions:\n\n{% for i in range(choices['text']|length)\ + \ %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i] }}\n{% endfor %}\nAnswer: " +include: uhura-arc-easy +task: uhura-arc-easy_yo_prompt_4 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_4/uhura-arc-easy_zu.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_4/uhura-arc-easy_zu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..239b1648a6e2c5d431a83ef0a94c16c6db90cc1b --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_4/uhura-arc-easy_zu.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: zu_multiple_choice +doc_to_text: "Question: {{question}}\n\nOptions:\n\n{% for i in range(choices['text']|length)\ + \ %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i] }}\n{% endfor %}\nAnswer: " +fewshot_split: train +include: uhura-arc-easy +task: uhura-arc-easy_zu_prompt_4 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_4/utils.py b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_4/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_4/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_5/uhura-arc-easy b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_5/uhura-arc-easy new file mode 100644 index 
0000000000000000000000000000000000000000..3f5ac554027a87a6fe5eeda14887a46f5af5ef2f --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_5/uhura-arc-easy @@ -0,0 +1,38 @@ +tag: + - uhura_arc_easy_tasks + - uhura_arc_easy_prompt_5 +dataset_path: masakhane/uhura-arc-easy +dataset_name: null +output_type: multiple_choice +doc_to_target: "{{['A', 'B', 'C', 'D'].index(answerKey)}}" +doc_to_choice: + - A + - B + - C + - D +test_split: test +fewshot_split: validation +should_decontaminate: false +doc_to_decontamination_query: "Question: {{question}}\nAnswer:" +metric_list: + - metric: f1 + aggregation: !function utils.weighted_f1_score + # aggregation: mean + average: weighted + hf_evaluate: true + higher_is_better: True + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" + - metric: acc + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + regexes_to_ignore: + - "," + - "\\$" +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_5/uhura-arc-easy_am.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_5/uhura-arc-easy_am.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f7f0231017eb5553893708f939b5fb23d0f60e1e --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_5/uhura-arc-easy_am.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: am_multiple_choice +doc_to_text: "Which of the following options answers this question: {{question}}\n\ + \n{% for i in range(choices['text']|length) %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i]\ + \ }}\n{% endfor %}\nAnswer: " +include: uhura-arc-easy +task: uhura-arc-easy_am_prompt_5 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_5/uhura-arc-easy_en.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_5/uhura-arc-easy_en.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5aea6abac239580735b805c2582be3976a9986d4 --- /dev/null +++ 
b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_5/uhura-arc-easy_en.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: en_multiple_choice +doc_to_text: "Which of the following options answers this question: {{question}}\n\ + \n{% for i in range(choices['text']|length) %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i]\ + \ }}\n{% endfor %}\nAnswer: " +include: uhura-arc-easy +task: uhura-arc-easy_en_prompt_5 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_5/uhura-arc-easy_ha.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_5/uhura-arc-easy_ha.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6293bda284e9ca224c3be7a65e1686f3c97210d8 --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_5/uhura-arc-easy_ha.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: ha_multiple_choice +doc_to_text: "Which of the following options answers this question: {{question}}\n\ + \n{% for i in range(choices['text']|length) %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i]\ + \ }}\n{% endfor %}\nAnswer: " +include: uhura-arc-easy +task: uhura-arc-easy_ha_prompt_5 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_5/uhura-arc-easy_nso.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_5/uhura-arc-easy_nso.yaml new file mode 100644 index 0000000000000000000000000000000000000000..80aff7064e48444477970baf5ccedf930a560a34 --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_5/uhura-arc-easy_nso.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: nso_multiple_choice +doc_to_text: "Which of the following options answers this question: {{question}}\n\ + \n{% for i in range(choices['text']|length) %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i]\ + \ }}\n{% endfor %}\nAnswer: " +fewshot_split: train +include: uhura-arc-easy +task: uhura-arc-easy_nso_prompt_5 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_5/uhura-arc-easy_sw.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_5/uhura-arc-easy_sw.yaml new 
file mode 100644 index 0000000000000000000000000000000000000000..a5bc7d2e5600b46a8660c83e76c69b9a24d0f398 --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_5/uhura-arc-easy_sw.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: sw_multiple_choice +doc_to_text: "Which of the following options answers this question: {{question}}\n\ + \n{% for i in range(choices['text']|length) %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i]\ + \ }}\n{% endfor %}\nAnswer: " +include: uhura-arc-easy +task: uhura-arc-easy_sw_prompt_5 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_5/uhura-arc-easy_yo.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_5/uhura-arc-easy_yo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a267e987218945fc09217266defb4b0775fd777f --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_5/uhura-arc-easy_yo.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: yo_multiple_choice +doc_to_text: "Which of the following options answers this question: {{question}}\n\ + \n{% for i in range(choices['text']|length) %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i]\ + \ }}\n{% endfor %}\nAnswer: " +include: uhura-arc-easy +task: uhura-arc-easy_yo_prompt_5 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_5/uhura-arc-easy_zu.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_5/uhura-arc-easy_zu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..69ce4a396af9f0bd6c96071319ef51ac3c1a81cd --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_5/uhura-arc-easy_zu.yaml @@ -0,0 +1,8 @@ +# Generated by utils.py +dataset_name: zu_multiple_choice +doc_to_text: "Which of the following options answers this question: {{question}}\n\ + \n{% for i in range(choices['text']|length) %}\t{{ 'ABCD'[i] }}: {{ choices['text'][i]\ + \ }}\n{% endfor %}\nAnswer: " +fewshot_split: train +include: uhura-arc-easy +task: uhura-arc-easy_zu_prompt_5 diff --git 
a/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_5/utils.py b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_5/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e735e2deb1f9c53152c072615aebe8ba3acb90b --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/prompt_5/utils.py @@ -0,0 +1 @@ +from lm_eval.utils import weighted_f1_score diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/uhura.yaml b/lm_eval/tasks/afrobench/uhura-arc-easy/uhura.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e2e2fea5fb49838103490f6f16321da45022cc7c --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/uhura.yaml @@ -0,0 +1,13 @@ +group: uhura_arc_easy +task: + - uhura_arc_easy_prompt_1 + - uhura_arc_easy_prompt_2 + - uhura_arc_easy_prompt_3 + - uhura_arc_easy_prompt_4 + - uhura_arc_easy_prompt_5 +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 1 diff --git a/lm_eval/tasks/afrobench/uhura-arc-easy/utils.py b/lm_eval/tasks/afrobench/uhura-arc-easy/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1216618cbff12e6f4a21ff532d7da16abbef1bde --- /dev/null +++ b/lm_eval/tasks/afrobench/uhura-arc-easy/utils.py @@ -0,0 +1,129 @@ +import argparse +import os + +import pycountry +import yaml + + +def get_language_from_code(code: str) -> str: + language_tuple = pycountry.languages.get(**{f"alpha_{len(code)}": code}) + return language_tuple.name + + +def prompt_func(mode): + prompt_map = { + "prompt_1": "You are a virtual assistant that answers multiple-choice questions with the correct option only.\n\n" + "Question: {{question}}\n\n" + "Choices:\n\n" + "{% for i in range(choices['text']|length) %}" + "\t{{ 'ABCD'[i] }}: {{ choices['text'][i] }}\n" + "{% endfor %}\n" + "Answer: ", + "prompt_2": "Choose the correct option that answers the question below:\n\n" + "Question: {{question}}\n\n" + "Choices:\n\n" + "{% for i in range(choices['text']|length) %}" + 
"\t{{ 'ABCD'[i] }}: {{ choices['text'][i] }}\n" + "{% endfor %}\n" + "Answer: ", + "prompt_3": "Answer the following multiple-choice question by picking 'A', 'B', 'C', or 'D'.\n\n" + "Question: {{question}}\n\n" + "Options:\n\n" + "{% for i in range(choices['text']|length) %}" + "\t{{ 'ABCD'[i] }}: {{ choices['text'][i] }}\n" + "{% endfor %}\n" + "Answer: ", + "prompt_4": "Question: {{question}}\n\n" + "Options:\n\n" + "{% for i in range(choices['text']|length) %}" + "\t{{ 'ABCD'[i] }}: {{ choices['text'][i] }}\n" + "{% endfor %}\n" + "Answer: ", + "prompt_5": "Which of the following options answers this question: {{question}}\n\n" + "{% for i in range(choices['text']|length) %}" + "\t{{ 'ABCD'[i] }}: {{ choices['text'][i] }}\n" + "{% endfor %}\n" + "Answer: ", + } + return prompt_map[mode] + + +def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: + """ + Generate a yaml file for each language. + + :param output_dir: The directory to output the files to. + :param overwrite: Whether to overwrite files if they already exist. 
+ """ + err = [] + languages = {"am", "en", "ha", "nso", "sw", "yo", "zu"} + + for lang in languages: + try: + file_name = f"uhura-arc-easy_{lang}.yaml" + task_name = f"uhura-arc-easy_{lang}_{mode}" + yaml_template = "uhura-arc-easy" + yaml_details = { + "include": yaml_template, + "task": task_name, + "dataset_name": f"{lang}_multiple_choice{'_unmatched' if lang == 'nso' else ''}", + "doc_to_text": prompt_func(mode), + } + if lang in ("nso", "zu"): + yaml_details["fewshot_split"] = "train" + + file_path = os.path.join(output_dir, mode) + os.makedirs(file_path, exist_ok=True) + + with open( + f"{output_dir}/{mode}/{file_name}", + "w" if overwrite else "x", + encoding="utf8", + ) as f: + f.write("# Generated by utils.py\n") + yaml.dump( + yaml_details, + f, + allow_unicode=True, + ) + except FileExistsError: + err.append(file_name) + + if len(err) > 0: + raise FileExistsError( + "Files were not created because they already exist (use --overwrite flag):" + f" {', '.join(err)}" + ) + + +def main() -> None: + """Parse CLI args and generate language-specific yaml files.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--overwrite", + default=True, + action="store_true", + help="Overwrite files if they already exist", + ) + parser.add_argument( + "--output-dir", + default="./", + help="Directory to write yaml files to", + ) + + PROMPT_CHOICES = ["prompt_1", "prompt_2", "prompt_3", "prompt_4", "prompt_5"] + parser.add_argument( + "--mode", + nargs="*", + default=PROMPT_CHOICES, + choices=PROMPT_CHOICES, + help="Prompt number(s)", + ) + args = parser.parse_args() + + for mode in args.mode: + gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite, mode=mode) + + +if __name__ == "__main__": + main() diff --git a/lm_eval/tasks/afrobench/xlsum/README.md b/lm_eval/tasks/afrobench/xlsum/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d9a47076e564de38fb4a7eb2cbd1df8a3b0290d2 --- /dev/null +++ 
b/lm_eval/tasks/afrobench/xlsum/README.md @@ -0,0 +1,34 @@ +# + +## Paper +Title: `XL-Sum: Large-Scale Multilingual Abstractive Summarization for 44 Languages` + +Paper Link: https://aclanthology.org/2021.findings-acl.413/ + +## Abstract +>Contemporary works on abstractive text summarization have focused primarily on high-resource languages like English, mostly due to the limited availability of datasets for low/mid-resource ones. In this work, we present XL-Sum, a comprehensive and diverse dataset comprising 1 million professionally annotated article-summary pairs from BBC, extracted using a set of carefully designed heuristics. The dataset covers 44 languages ranging from low to high-resource, for many of which no public dataset is currently available. XL-Sum is highly abstractive, concise, and of high quality, as indicated by human and intrinsic evaluation. We fine-tune mT5, a state-of-the-art pretrained multilingual model, with XL-Sum and experiment on multilingual and low-resource summarization tasks. XL-Sum induces competitive results compared to the ones obtained using similar monolingual datasets: we show higher than 11 ROUGE-2 scores on 10 languages we benchmark on, with some of them exceeding 15, as obtained by multilingual training. Additionally, training on low-resource languages individually also provides competitive performance. To the best of our knowledge, XL-Sum is the largest abstractive summarization dataset in terms of the number of samples collected from a single source and the number of languages covered. We are releasing our dataset and models to encourage future research on multilingual abstractive summarization. + +HomePage: https://github.com/csebuetnlp/xl-sum + +### Citation + +``` +@inproceedings{hasan-etal-2021-xl, + title = "{XL}-Sum: Large-Scale Multilingual Abstractive Summarization for 44 Languages", + author = "Hasan, Tahmid and + Bhattacharjee, Abhik and + Islam, Md. 
Saiful and + Mubasshir, Kazi and + Li, Yuan-Fang and + Kang, Yong-Bin and + Rahman, M. Sohel and + Shahriyar, Rifat", + booktitle = "Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021", + month = aug, + year = "2021", + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2021.findings-acl.413", + pages = "4693--4703", +} +``` diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_1/utils.py b/lm_eval/tasks/afrobench/xlsum/prompt_1/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..85db4d4f4cef061e526c970ece194317e576de06 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_1/utils.py @@ -0,0 +1,18 @@ +import evaluate + + +def rougeL(items): + """ + # passthrough for efficiency + """ + return items + + +def rougeL_agg(items): + """ + Higher is better + """ + refs = list(zip(*items))[0] + preds = list(zip(*items))[1] + rouge_scorer = evaluate.load("rouge") + return rouge_scorer.compute(predictions=preds, references=refs)["rougeL"] diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum b/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum new file mode 100644 index 0000000000000000000000000000000000000000..f6b0421edd9ba2b3f1c2eac1dbfaf6f51e5cfba5 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum @@ -0,0 +1,22 @@ +tag: + - xlsum_tasks + - xlsum_prompt_1 +task: null +dataset_path: csebuetnlp/xlsum +dataset_name: null +dataset_kwargs: + trust_remote_code: true +output_type: generate_until +generation_kwargs: + until: + - "" +validation_split: validation +fewshot_split: validation +test_split: test +should_decontaminate: false +metric_list: + - metric: !function utils.rougeL + higher_is_better: true + aggregation: !function utils.rougeL_agg +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_amharic.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_amharic.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..8ab68805aa658c3c15d8367f48115f40e2581aac --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_amharic.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: amharic +doc_to_target: '{{summary}}' +doc_to_text: 'Provide a summary of the document written in Amharic. Ensure that you + provide the summary in Amharic and nothing else. + + Document in Amharic: {{''text''}}\nSummary: ' +include: xlsum +task: xlsum_amharic_prompt_1 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_arabic.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_arabic.yaml new file mode 100644 index 0000000000000000000000000000000000000000..af7df7d90f01b274c1d54076256d7e3a510627b4 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_arabic.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: arabic +doc_to_target: '{{summary}}' +doc_to_text: 'Provide a summary of the document written in Arabic. Ensure that you + provide the summary in Arabic and nothing else. + + Document in Arabic: {{''text''}}\nSummary: ' +include: xlsum +task: xlsum_arabic_prompt_1 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_hausa.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_hausa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..37f6b3e518835365e7b59fb550c15e286c85f63a --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_hausa.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: hausa +doc_to_target: '{{summary}}' +doc_to_text: 'Provide a summary of the document written in Hausa. Ensure that you + provide the summary in Hausa and nothing else. 
+ + Document in Hausa: {{''text''}}\nSummary: ' +include: xlsum +task: xlsum_hausa_prompt_1 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_igbo.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_igbo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..04644b5d7bdd8595c5beb02240fe521162dcf3fd --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_igbo.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: igbo +doc_to_target: '{{summary}}' +doc_to_text: 'Provide a summary of the document written in Igbo. Ensure that you provide + the summary in Igbo and nothing else. + + Document in Igbo: {{''text''}}\nSummary: ' +include: xlsum +task: xlsum_igbo_prompt_1 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_kirundi.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_kirundi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7c434296f5598cb995c40568ab69141f29571d57 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_kirundi.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: kirundi +doc_to_target: '{{summary}}' +doc_to_text: 'Provide a summary of the document written in Kirundi. Ensure that you + provide the summary in Kirundi and nothing else. + + Document in Kirundi: {{''text''}}\nSummary: ' +include: xlsum +task: xlsum_kirundi_prompt_1 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_oromo.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_oromo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..78fb14eca4344c17ed3300954193764568be40d4 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_oromo.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: oromo +doc_to_target: '{{summary}}' +doc_to_text: 'Provide a summary of the document written in Oromo. Ensure that you + provide the summary in Oromo and nothing else. 
+ + Document in Oromo: {{''text''}}\nSummary: ' +include: xlsum +task: xlsum_oromo_prompt_1 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_pidgin.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_pidgin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..68f2c17f560ee888ea1ee958c9ba2392d6f47dfc --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_pidgin.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: pidgin +doc_to_target: '{{summary}}' +doc_to_text: 'Provide a summary of the document written in Nigerian pidgin. Ensure + that you provide the summary in Nigerian pidgin and nothing else. + + Document in Nigerian pidgin: {{''text''}}\nSummary: ' +include: xlsum +task: xlsum_pidgin_prompt_1 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_somali.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_somali.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d699dc1905796945e89f3659060202f7314ed776 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_somali.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: somali +doc_to_target: '{{summary}}' +doc_to_text: 'Provide a summary of the document written in Somali. Ensure that you + provide the summary in Somali and nothing else. + + Document in Somali: {{''text''}}\nSummary: ' +include: xlsum +task: xlsum_somali_prompt_1 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_swahili.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_swahili.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6a951c11b8c7ee59f1dfcd3eb44eaada2bd0652a --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_swahili.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: swahili +doc_to_target: '{{summary}}' +doc_to_text: 'Provide a summary of the document written in Swahili. Ensure that you + provide the summary in Swahili and nothing else. 
+ + Document in Swahili: {{''text''}}\nSummary: ' +include: xlsum +task: xlsum_swahili_prompt_1 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_telugu.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_telugu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..82a60171a5e2a42e1eb5d43aaf8a034e77b4a798 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_telugu.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: telugu +doc_to_target: '{{summary}}' +doc_to_text: 'Provide a summary of the document written in Telugu. Ensure that you + provide the summary in Telugu and nothing else. + + Document in Telugu: {{''text''}}\nSummary: ' +include: xlsum +task: xlsum_telugu_prompt_1 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_tigrinya.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_tigrinya.yaml new file mode 100644 index 0000000000000000000000000000000000000000..31630982a134b934311c00164c42dc9fabf22cc7 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_tigrinya.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: tigrinya +doc_to_target: '{{summary}}' +doc_to_text: 'Provide a summary of the document written in Tigrinya. Ensure that you + provide the summary in Tigrinya and nothing else. + + Document in Tigrinya: {{''text''}}\nSummary: ' +include: xlsum +task: xlsum_tigrinya_prompt_1 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_yoruba.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_yoruba.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9c14a9113e293c057d028e27cd09ed1f6812c1e8 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum_yoruba.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: yoruba +doc_to_target: '{{summary}}' +doc_to_text: 'Provide a summary of the document written in Yoruba. Ensure that you + provide the summary in Yoruba and nothing else. 
+ + Document in Yoruba: {{''text''}}\nSummary: ' +include: xlsum +task: xlsum_yoruba_prompt_1 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_2/utils.py b/lm_eval/tasks/afrobench/xlsum/prompt_2/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..85db4d4f4cef061e526c970ece194317e576de06 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_2/utils.py @@ -0,0 +1,18 @@ +import evaluate + + +def rougeL(items): + """ + # passthrough for efficiency + """ + return items + + +def rougeL_agg(items): + """ + Higher is better + """ + refs = list(zip(*items))[0] + preds = list(zip(*items))[1] + rouge_scorer = evaluate.load("rouge") + return rouge_scorer.compute(predictions=preds, references=refs)["rougeL"] diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum b/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum new file mode 100644 index 0000000000000000000000000000000000000000..e572c00c6ae1c0f8f84f1030c5903325ca1f0ae4 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum @@ -0,0 +1,22 @@ +tag: + - xlsum_tasks + - xlsum_prompt_2 +task: null +dataset_path: csebuetnlp/xlsum +dataset_name: null +dataset_kwargs: + trust_remote_code: true +output_type: generate_until +generation_kwargs: + until: + - "" +validation_split: validation +fewshot_split: validation +test_split: test +should_decontaminate: false +metric_list: + - metric: !function utils.rougeL + higher_is_better: true + aggregation: !function utils.rougeL_agg +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_amharic.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_amharic.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0f2275c657b54df708f62526fdc12b0381f197eb --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_amharic.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: amharic +doc_to_target: '{{summary}}' +doc_to_text: 'Summarize the document below in triple backticks and return only the + summary 
and nothing else. + + ```{{''text''}}```\n' +include: xlsum +task: xlsum_amharic_prompt_2 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_arabic.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_arabic.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a4f772c31610175417ee97105ae4d99f526f0c41 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_arabic.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: arabic +doc_to_target: '{{summary}}' +doc_to_text: 'Summarize the document below in triple backticks and return only the + summary and nothing else. + + ```{{''text''}}```\n' +include: xlsum +task: xlsum_arabic_prompt_2 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_hausa.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_hausa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7485672cb48c86d1df444391064b04754836fa77 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_hausa.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: hausa +doc_to_target: '{{summary}}' +doc_to_text: 'Summarize the document below in triple backticks and return only the + summary and nothing else. + + ```{{''text''}}```\n' +include: xlsum +task: xlsum_hausa_prompt_2 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_igbo.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_igbo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2cf7fafe394e049aaa3382068d6da8cac70cf705 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_igbo.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: igbo +doc_to_target: '{{summary}}' +doc_to_text: 'Summarize the document below in triple backticks and return only the + summary and nothing else. 
+ + ```{{''text''}}```\n' +include: xlsum +task: xlsum_igbo_prompt_2 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_kirundi.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_kirundi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..63021d7d4f9ff0d252c82665fd8262bd6bb5c327 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_kirundi.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: kirundi +doc_to_target: '{{summary}}' +doc_to_text: 'Summarize the document below in triple backticks and return only the + summary and nothing else. + + ```{{''text''}}```\n' +include: xlsum +task: xlsum_kirundi_prompt_2 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_oromo.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_oromo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b637b10d0428d614fcd4c06bdb1fb2383057ef77 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_oromo.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: oromo +doc_to_target: '{{summary}}' +doc_to_text: 'Summarize the document below in triple backticks and return only the + summary and nothing else. + + ```{{''text''}}```\n' +include: xlsum +task: xlsum_oromo_prompt_2 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_pidgin.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_pidgin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6c13d93d5c1ff95ac76c1b87f4c301c97a771f52 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_pidgin.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: pidgin +doc_to_target: '{{summary}}' +doc_to_text: 'Summarize the document below in triple backticks and return only the + summary and nothing else. 
+ + ```{{''text''}}```\n' +include: xlsum +task: xlsum_pidgin_prompt_2 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_somali.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_somali.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b7245ddc193a133701fd8f71cd6b52cd34899594 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_somali.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: somali +doc_to_target: '{{summary}}' +doc_to_text: 'Summarize the document below in triple backticks and return only the + summary and nothing else. + + ```{{''text''}}```\n' +include: xlsum +task: xlsum_somali_prompt_2 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_swahili.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_swahili.yaml new file mode 100644 index 0000000000000000000000000000000000000000..65f176fba40e37003a9cfd8813957760cdac2aa1 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_swahili.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: swahili +doc_to_target: '{{summary}}' +doc_to_text: 'Summarize the document below in triple backticks and return only the + summary and nothing else. + + ```{{''text''}}```\n' +include: xlsum +task: xlsum_swahili_prompt_2 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_telugu.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_telugu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0ecbdde5c90806b2684fce1373c3fad94ef5c65e --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_telugu.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: telugu +doc_to_target: '{{summary}}' +doc_to_text: 'Summarize the document below in triple backticks and return only the + summary and nothing else. 
+ + ```{{''text''}}```\n' +include: xlsum +task: xlsum_telugu_prompt_2 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_tigrinya.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_tigrinya.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d46e2fb573f5cdd3bb9459c7eb5b95150cae5ec8 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_tigrinya.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: tigrinya +doc_to_target: '{{summary}}' +doc_to_text: 'Summarize the document below in triple backticks and return only the + summary and nothing else. + + ```{{''text''}}```\n' +include: xlsum +task: xlsum_tigrinya_prompt_2 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_yoruba.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_yoruba.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7ea0ef503444a8237ce5fd693f6ebec082a8a6cd --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum_yoruba.yaml @@ -0,0 +1,9 @@ +# Generated by utils.py +dataset_name: yoruba +doc_to_target: '{{summary}}' +doc_to_text: 'Summarize the document below in triple backticks and return only the + summary and nothing else. 
+ + ```{{''text''}}```\n' +include: xlsum +task: xlsum_yoruba_prompt_2 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_3/utils.py b/lm_eval/tasks/afrobench/xlsum/prompt_3/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..85db4d4f4cef061e526c970ece194317e576de06 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_3/utils.py @@ -0,0 +1,18 @@ +import evaluate + + +def rougeL(items): + """ + # passthrough for efficiency + """ + return items + + +def rougeL_agg(items): + """ + Higher is better + """ + refs = list(zip(*items))[0] + preds = list(zip(*items))[1] + rouge_scorer = evaluate.load("rouge") + return rouge_scorer.compute(predictions=preds, references=refs)["rougeL"] diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum b/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum new file mode 100644 index 0000000000000000000000000000000000000000..08842ef8eb627dfb12387ae7ef2e232d2f3c40d3 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum @@ -0,0 +1,22 @@ +tag: + - xlsum_tasks + - xlsum_prompt_3 +task: null +dataset_path: csebuetnlp/xlsum +dataset_name: null +dataset_kwargs: + trust_remote_code: true +output_type: generate_until +generation_kwargs: + until: + - "" +validation_split: validation +fewshot_split: validation +test_split: test +should_decontaminate: false +metric_list: + - metric: !function utils.rougeL + higher_is_better: true + aggregation: !function utils.rougeL_agg +metadata: + version: 1.0 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_amharic.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_amharic.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6fc85e7ceb43f474754081088a77b3979b785334 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_amharic.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: amharic +doc_to_target: '{{summary}}' +doc_to_text: 'You are an advanced Summarizer, a specialized assistant designed to + summarize documents in Amharic. 
Your main goal is to ensure summaries are concise + and informative. Ensure you return the summary only and nothing else. + + Document: {{''text''}}\nSummary: ' +include: xlsum +task: xlsum_amharic_prompt_3 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_arabic.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_arabic.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d4f2b1f5c09bcf2dc76fb4807e09a1cc52b6ce82 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_arabic.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: arabic +doc_to_target: '{{summary}}' +doc_to_text: 'You are an advanced Summarizer, a specialized assistant designed to + summarize documents in Arabic. Your main goal is to ensure summaries are concise + and informative. Ensure you return the summary only and nothing else. + + Document: {{''text''}}\nSummary: ' +include: xlsum +task: xlsum_arabic_prompt_3 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_hausa.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_hausa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e1a0603749b196a9cda3f995fb16ea2814513142 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_hausa.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: hausa +doc_to_target: '{{summary}}' +doc_to_text: 'You are an advanced Summarizer, a specialized assistant designed to + summarize documents in Hausa. Your main goal is to ensure summaries are concise + and informative. Ensure you return the summary only and nothing else. 
+ + Document: {{''text''}}\nSummary: ' +include: xlsum +task: xlsum_hausa_prompt_3 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_igbo.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_igbo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6b23f8f395679740acd523757311a7803407c3cd --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_igbo.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: igbo +doc_to_target: '{{summary}}' +doc_to_text: 'You are an advanced Summarizer, a specialized assistant designed to + summarize documents in Igbo. Your main goal is to ensure summaries are concise and + informative. Ensure you return the summary only and nothing else. + + Document: {{''text''}}\nSummary: ' +include: xlsum +task: xlsum_igbo_prompt_3 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_kirundi.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_kirundi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8f40b2a7ff68847a5bae0649451460d22f24ae2a --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_kirundi.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: kirundi +doc_to_target: '{{summary}}' +doc_to_text: 'You are an advanced Summarizer, a specialized assistant designed to + summarize documents in Kirundi. Your main goal is to ensure summaries are concise + and informative. Ensure you return the summary only and nothing else. 
+ + Document: {{''text''}}\nSummary: ' +include: xlsum +task: xlsum_kirundi_prompt_3 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_oromo.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_oromo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bbc912851b05e78829a2835e14e28831469d302d --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_oromo.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: oromo +doc_to_target: '{{summary}}' +doc_to_text: 'You are an advanced Summarizer, a specialized assistant designed to + summarize documents in Oromo. Your main goal is to ensure summaries are concise + and informative. Ensure you return the summary only and nothing else. + + Document: {{''text''}}\nSummary: ' +include: xlsum +task: xlsum_oromo_prompt_3 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_pidgin.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_pidgin.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8149e441e9869a94312fe54001cd93fd6d720eaa --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_pidgin.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: pidgin +doc_to_target: '{{summary}}' +doc_to_text: 'You are an advanced Summarizer, a specialized assistant designed to + summarize documents in Nigerian pidgin. Your main goal is to ensure summaries are + concise and informative. Ensure you return the summary only and nothing else. 
+ + Document: {{''text''}}\nSummary: ' +include: xlsum +task: xlsum_pidgin_prompt_3 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_somali.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_somali.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a2936da11bdca6fb8fd9ba6f288968ee0c1843a4 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_somali.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: somali +doc_to_target: '{{summary}}' +doc_to_text: 'You are an advanced Summarizer, a specialized assistant designed to + summarize documents in Somali. Your main goal is to ensure summaries are concise + and informative. Ensure you return the summary only and nothing else. + + Document: {{''text''}}\nSummary: ' +include: xlsum +task: xlsum_somali_prompt_3 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_swahili.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_swahili.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6f90f4cfaa99d4291d36e9e9aec715e32925cb55 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_swahili.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: swahili +doc_to_target: '{{summary}}' +doc_to_text: 'You are an advanced Summarizer, a specialized assistant designed to + summarize documents in Swahili. Your main goal is to ensure summaries are concise + and informative. Ensure you return the summary only and nothing else. 
+ + Document: {{''text''}}\nSummary: ' +include: xlsum +task: xlsum_swahili_prompt_3 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_telugu.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_telugu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..67d116dce2105d9121c5f13e268333005d9a91dc --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_telugu.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: telugu +doc_to_target: '{{summary}}' +doc_to_text: 'You are an advanced Summarizer, a specialized assistant designed to + summarize documents in Telugu. Your main goal is to ensure summaries are concise + and informative. Ensure you return the summary only and nothing else. + + Document: {{''text''}}\nSummary: ' +include: xlsum +task: xlsum_telugu_prompt_3 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_tigrinya.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_tigrinya.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5b20d6e3bfdb66590f7004ff0f187e2a6537db84 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_tigrinya.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: tigrinya +doc_to_target: '{{summary}}' +doc_to_text: 'You are an advanced Summarizer, a specialized assistant designed to + summarize documents in Tigrinya. Your main goal is to ensure summaries are concise + and informative. Ensure you return the summary only and nothing else. 
+ + Document: {{''text''}}\nSummary: ' +include: xlsum +task: xlsum_tigrinya_prompt_3 diff --git a/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_yoruba.yaml b/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_yoruba.yaml new file mode 100644 index 0000000000000000000000000000000000000000..353be14cda6713964f73436b2085d6ae63fcdc57 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum_yoruba.yaml @@ -0,0 +1,10 @@ +# Generated by utils.py +dataset_name: yoruba +doc_to_target: '{{summary}}' +doc_to_text: 'You are an advanced Summarizer, a specialized assistant designed to + summarize documents in Yoruba. Your main goal is to ensure summaries are concise + and informative. Ensure you return the summary only and nothing else. + + Document: {{''text''}}\nSummary: ' +include: xlsum +task: xlsum_yoruba_prompt_3 diff --git a/lm_eval/tasks/afrobench/xlsum/utils.py b/lm_eval/tasks/afrobench/xlsum/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8df1e12e8b4aa683bd71c2fb23d90ff7667de5b2 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/utils.py @@ -0,0 +1,118 @@ +import argparse +import os + +import yaml + + +def prompt_func(mode, lang): + if lang == "pidgin": + lang = "Nigerian Pidgin" + + prompt_map = { + "prompt_1": f"Provide a summary of the document written in {lang.capitalize()}. Ensure that you provide the summary in {lang.capitalize()} and nothing else.\n" + f"Document in {lang.capitalize()}: " + r"{{'text'}}\n" + "Summary: ", + "prompt_2": "Summarize the document below in triple backticks and return only the summary and nothing else.\n" + + r"```{{'text'}}```\n", + "prompt_3": f"You are an advanced Summarizer, a specialized assistant designed to summarize documents in {lang.capitalize()}. " + f"Your main goal is to ensure summaries are concise and informative. 
Ensure you return the summary only and nothing else.\n" + f"Document: " + r"{{'text'}}\n" + "Summary: ", + "prompt_4": f"Summarize this {lang.capitalize()} document:\n" + r"{{'text'}}\n" + "Summary: ", + "prompt_5": f"{lang.capitalize()} document: " + r"{{'text'}}\n" + "Summary: ", + } + return prompt_map[mode] + + +def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: + """ + Generate a yaml file for each language. + + :param output_dir: The directory to output the files to. + :param overwrite: Whether to overwrite files if they already exist. + """ + err = [] + XLSUM_LANGUAGES = ( + "amharic", + "arabic", + "hausa", + "igbo", + "kirundi", + "oromo", + "pidgin", + "somali", + "swahili", + "telugu", + "tigrinya", + "yoruba", + ) + + for lang in XLSUM_LANGUAGES: + try: + file_name = f"xlsum_{lang}.yaml" + task_name = f"xlsum_{lang}_{mode}" + yaml_template = "xlsum" + yaml_details = { + "include": yaml_template, + "task": task_name, + "dataset_name": lang, + "doc_to_text": prompt_func(mode, lang), + "doc_to_target": "{{summary}}", + } + file_path = os.path.join(output_dir, mode) + os.makedirs(file_path, exist_ok=True) + + with open( + f"{output_dir}/{mode}/{file_name}", + "w" if overwrite else "x", + encoding="utf8", + ) as f: + f.write("# Generated by utils.py\n") + yaml.dump( + yaml_details, + f, + allow_unicode=True, + ) + except FileExistsError: + err.append(file_name) + + if len(err) > 0: + raise FileExistsError( + "Files were not created because they already exist (use --overwrite flag):" + f" {', '.join(err)}" + ) + + +def main() -> None: + """Parse CLI args and generate language-specific yaml files.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--overwrite", + default=False, + action="store_true", + help="Overwrite files if they already exist", + ) + parser.add_argument( + "--output-dir", + default="./", + help="Directory to write yaml files to", + ) + + PROMPT_CHOICES = ["prompt_1", "prompt_2", "prompt_3", "prompt_4", 
"prompt_5"] + parser.add_argument( + "--mode", + nargs="*", + default=PROMPT_CHOICES, + choices=PROMPT_CHOICES, + help="Prompt number(s)", + ) + args = parser.parse_args() + + for mode in args.mode: + gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite, mode=mode) + + +if __name__ == "__main__": + main() diff --git a/lm_eval/tasks/afrobench/xlsum/xlsum.yaml b/lm_eval/tasks/afrobench/xlsum/xlsum.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8d87717597c59eb333f712d69eb854e971146915 --- /dev/null +++ b/lm_eval/tasks/afrobench/xlsum/xlsum.yaml @@ -0,0 +1,11 @@ +group: xlsum +task: + - xlsum_prompt_1 + - xlsum_prompt_2 + - xlsum_prompt_3 +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true +metadata: + version: 2 diff --git a/lm_eval/tasks/arab_culture/README.md b/lm_eval/tasks/arab_culture/README.md new file mode 100644 index 0000000000000000000000000000000000000000..cf6f8cf7932ed12fdc844eecd985cb12209b34d6 --- /dev/null +++ b/lm_eval/tasks/arab_culture/README.md @@ -0,0 +1,70 @@ +# Arab Culture + +### Paper + +Title: Commonsense Reasoning in Arab Culture + + +Abstract: https://arxiv.org/abs/2502.12788 + +Despite progress in Arabic large language models, such as Jais and AceGPT, their evaluation on commonsense reasoning has largely relied on machine-translated datasets, which lack cultural depth and may introduce Anglocentric biases. Commonsense reasoning is shaped by geographical and cultural contexts, and existing English datasets fail to capture the diversity of the Arab world. To address this, we introduce \datasetname, a commonsense reasoning dataset in Modern Standard Arabic (MSA), covering cultures of 13 countries across the Gulf, Levant, North Africa, and the Nile Valley. The dataset was built from scratch by engaging native speakers to write and validate culturally relevant questions for their respective countries. 
\datasetname spans 12 daily life domains with 54 fine-grained subtopics, reflecting various aspects of social norms, traditions, and everyday experiences. Zero-shot evaluations show that open-weight language models with up to 32B parameters struggle to comprehend diverse Arab cultures, with performance varying across regions. These findings highlight the need for more culturally aware models and datasets tailored to the Arabic-speaking world. + +Homepage: https://github.com/fajri91/ArabicCulture + + +### Citation + +``` +@misc{sadallah2025commonsensereasoningarabculture, + title={Commonsense Reasoning in Arab Culture}, + author={Abdelrahman Sadallah and Junior Cedric Tonga and Khalid Almubarak and Saeed Almheiri and Farah Atif and Chatrine Qwaider and Karima Kadaoui and Sara Shatnawi and Yaser Alesh and Fajri Koto}, + year={2025}, + eprint={2502.12788}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2502.12788}, +} +``` + +### There are two variants of this task: `arab_culture`, and `arab_culture_completion` + +- The `arab_culture` is the normal MCQ evaluation type, which appends the answers to the question, and then measures the likelihood of the different choice markers (A,B,C or "أ","ب","ج"). For more info, follow the MMLU style [template](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/mmlu/default/_default_template_yaml#L7-L8) +- The `arab_culture_completion` does the evaluation in a sentence completion manner, by appending each answer to the question separately and choosing the answer with the highest likelihood. See [this](https://github.com/EleutherAI/lm-evaluation-harness/blob/1f9bc88fe61f6bfa36f74e91ce3d59ab5685e4f1/lm_eval/tasks/arc/arc_easy.yaml#L10-L12) for more information + +### Groups and Tasks + +#### Groups + +* `arab_culture`: evaluates all ArabCulture tasks. + +* `arab_culture_gulf`: evaluates Gulf countries ArabCulture tasks. 
+* `arab_culture_levant`: evaluates Levant countries ArabCulture tasks. +* `arab_culture_nile_valley`: evaluates Nile Valley countries ArabCulture tasks. +* `arab_culture_north_africa`: evaluates North Africa ArabCulture tasks. + +### Evaluation modes +This benchmark allows for different evaluation settings by allowing extra context to be added for the model: + +We have three settings: +* without any information +``` +COUNTRY=False +REGION=False +``` +* with only region information +``` +COUNTRY=False +REGION=True +``` +* with region and country information +``` +COUNTRY=True +REGION=True +``` + +**Please set these flags as environment variables.** + + +* We also allow for prompting in English, which we found to achieve higher results on most of the evaluated models (please refer to our paper). + +* To change the language of the prompt, define the `ARABIC` environment variable. diff --git a/lm_eval/tasks/arab_culture/_arab_culture.yaml b/lm_eval/tasks/arab_culture/_arab_culture.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8da809e673c13ac90476102a1e9a2cc07ee90816 --- /dev/null +++ b/lm_eval/tasks/arab_culture/_arab_culture.yaml @@ -0,0 +1,12 @@ +aggregate_metric_list: + metric: acc + weight_by_size: true +group: arab_culture +metadata: + description: Arab Culture tasks + version: 0 +task: +- arab_culture_gulf +- arab_culture_levant +- arab_culture_north_africa +- arab_culture_nile_valley diff --git a/lm_eval/tasks/arab_culture/_arab_culture_gulf.yaml b/lm_eval/tasks/arab_culture/_arab_culture_gulf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ca0ca4d89ad7c3cb95b83b50d3891ca26033397c --- /dev/null +++ b/lm_eval/tasks/arab_culture/_arab_culture_gulf.yaml @@ -0,0 +1,10 @@ +aggregate_metric_list: + metric: acc + weight_by_size: true +group: arab_culture_gulf +group_alias: Gulf +metadata: + description: arab Culture tasks + version: 0 +task: +- arab_culture_gulf_tasks diff --git 
a/lm_eval/tasks/arab_culture/_arab_culture_levant.yaml b/lm_eval/tasks/arab_culture/_arab_culture_levant.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b3344d372055efd1fafd71b221631c38589ff9d7 --- /dev/null +++ b/lm_eval/tasks/arab_culture/_arab_culture_levant.yaml @@ -0,0 +1,10 @@ +aggregate_metric_list: + metric: acc + weight_by_size: true +group: arab_culture_levant +group_alias: Levant +metadata: + description: arab Culture tasks + version: 0 +task: +- arab_culture_levant_tasks diff --git a/lm_eval/tasks/arab_culture/_arab_culture_nile_valley.yaml b/lm_eval/tasks/arab_culture/_arab_culture_nile_valley.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e858409a9b5d5473431fde3c55369ea3b70d2d32 --- /dev/null +++ b/lm_eval/tasks/arab_culture/_arab_culture_nile_valley.yaml @@ -0,0 +1,10 @@ +aggregate_metric_list: + metric: acc + weight_by_size: true +group: arab_culture_nile_valley +group_alias: Nile Valley +metadata: + description: arab Culture tasks + version: 0 +task: +- arab_culture_nile_valley_tasks diff --git a/lm_eval/tasks/arab_culture/_arab_culture_north_africa.yaml b/lm_eval/tasks/arab_culture/_arab_culture_north_africa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..30f31ffa797e29dc4de71990587054f797af3f92 --- /dev/null +++ b/lm_eval/tasks/arab_culture/_arab_culture_north_africa.yaml @@ -0,0 +1,10 @@ +aggregate_metric_list: + metric: acc + weight_by_size: true +group: arab_culture_north_africa +group_alias: North Africa +metadata: + description: arab Culture tasks + version: 0 +task: +- arab_culture_north_africa_tasks diff --git a/lm_eval/tasks/arab_culture/_default_arab_culture_mcq_template_yaml b/lm_eval/tasks/arab_culture/_default_arab_culture_mcq_template_yaml new file mode 100644 index 0000000000000000000000000000000000000000..30b78fe6ca6e58f3aca53bd07594a838a4e50aae --- /dev/null +++ b/lm_eval/tasks/arab_culture/_default_arab_culture_mcq_template_yaml @@ -0,0 +1,19 @@ 
+dataset_path: MBZUAI/ArabCulture +test_split: test +fewshot_split: test +fewshot_config: + sampler: first_n +output_type: multiple_choice +doc_to_text: !function utils_mcq.doc_to_text +doc_to_choice: !function utils_mcq.doc_to_choice +doc_to_target: !function utils_mcq.doc_to_target +target_delimiter: "" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 0.0 diff --git a/lm_eval/tasks/arab_culture/_generate_configs.py b/lm_eval/tasks/arab_culture/_generate_configs.py new file mode 100644 index 0000000000000000000000000000000000000000..81ee1c619d12909822fb19b6b8da319434400bf1 --- /dev/null +++ b/lm_eval/tasks/arab_culture/_generate_configs.py @@ -0,0 +1,122 @@ +""" +Take in a YAML, and output all "other" splits with this YAML +""" + +import argparse +import logging +import os + +import yaml +from tqdm import tqdm + + +eval_logger = logging.getLogger("lm-eval") + +countries = { + "KSA": "Gulf", + "UAE": "Gulf", + "Yemen": "Gulf", + "Lebanon": "Levant", + "Syria": "Levant", + "Palestine": "Levant", + "Jordan": "Levant", + "Tunisia": "North Africa", + "Algeria": "North Africa", + "Morocco": "North Africa", + "Libya": "North Africa", + "Egypt": "Nile Valley", + "Sudan": "Nile Valley", +} + +VERSION = 0 + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--base_yaml_path", default="_default_arab_culture_mcq_template_yaml" + ) + parser.add_argument("--save_prefix_path", default="arab_culture") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + + # get filename of base_yaml so we can `"include": ` it in our "other" YAMLs. 
+ base_yaml_name = os.path.split(args.base_yaml_path)[-1] + # with open(args.base_yaml_path, encoding="utf-8") as f: + # base_yaml = yaml.full_load(f) + + ALL_REGIONS = [] + for country, region in tqdm(countries.items()): + if region not in ALL_REGIONS: + ALL_REGIONS.append(region) + + # description = f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n" + + yaml_dict = { + "include": base_yaml_name, + "tag": f"arab_culture_{region.lower().replace(' ', '_')}_tasks", + "task": f"arab_culture_{country.lower().replace(' ', '_')}", + "task_alias": country, + "dataset_name": country, + # "description": description, + } + + file_save_path = ( + args.save_prefix_path + + f"_{country.lower().replace(' ', '_').replace('(', '').replace(')', '')}.yaml" + ) + eval_logger.info(f"Saving yaml for subset {country} to {file_save_path}") + with open(file_save_path, "w", encoding="utf-8") as yaml_file: + yaml.dump( + yaml_dict, + yaml_file, + allow_unicode=True, + default_style='"', + ) + + arab_culture_mcq_regions = [ + f"arab_culture_{region.lower().replace(' ', '_')}" for region in ALL_REGIONS + ] + + file_save_path = args.save_prefix_path + ".yaml" + + eval_logger.info(f"Saving benchmark config to {file_save_path}") + + for region in ALL_REGIONS: + file_save_path = ( + args.save_prefix_path + f"_{region.lower().replace(' ', '_')}.yaml" + ) + eval_logger.info(f"Saving yaml for subset {region} to {file_save_path}") + with open("_" + file_save_path, "w", encoding="utf-8") as yaml_file: + yaml.dump( + { + "group": f"arab_culture_{region.lower().replace(' ', '_')}", + "group_alias": region, + "task": [f"arab_culture_{region.lower().replace(' ', '_')}_tasks"], + "aggregate_metric_list": {"metric": "acc", "weight_by_size": True}, + "metadata": { + "description": "arab Culture tasks", + "version": VERSION, + }, + }, + yaml_file, + indent=4, + default_flow_style=False, + ) + + file_save_path = args.save_prefix_path + ".yaml" + with 
open("_" + file_save_path, "w", encoding="utf-8") as yaml_file: + yaml.dump( + { + "group": "arab_culture", + "task": arab_culture_mcq_regions, + "aggregate_metric_list": {"metric": "acc", "weight_by_size": True}, + "metadata": {"description": "Arab Culture tasks", "version": VERSION}, + }, + yaml_file, + indent=4, + default_flow_style=False, + ) diff --git a/lm_eval/tasks/arab_culture/arab_culture_algeria.yaml b/lm_eval/tasks/arab_culture/arab_culture_algeria.yaml new file mode 100644 index 0000000000000000000000000000000000000000..705606b81254a5c423eb5d2066189a3f6cddf92c --- /dev/null +++ b/lm_eval/tasks/arab_culture/arab_culture_algeria.yaml @@ -0,0 +1,5 @@ +"dataset_name": "Algeria" +"include": "_default_arab_culture_mcq_template_yaml" +"tag": "arab_culture_north_africa_tasks" +"task": "arab_culture_algeria" +"task_alias": "Algeria" diff --git a/lm_eval/tasks/arab_culture/arab_culture_egypt.yaml b/lm_eval/tasks/arab_culture/arab_culture_egypt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f71863147d6f8d465f49a90aee21c290ab2a82f8 --- /dev/null +++ b/lm_eval/tasks/arab_culture/arab_culture_egypt.yaml @@ -0,0 +1,5 @@ +"dataset_name": "Egypt" +"include": "_default_arab_culture_mcq_template_yaml" +"tag": "arab_culture_nile_valley_tasks" +"task": "arab_culture_egypt" +"task_alias": "Egypt" diff --git a/lm_eval/tasks/arab_culture/arab_culture_jordan.yaml b/lm_eval/tasks/arab_culture/arab_culture_jordan.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c6587d2988f170797a3cad1916ea5ee7d298bc0e --- /dev/null +++ b/lm_eval/tasks/arab_culture/arab_culture_jordan.yaml @@ -0,0 +1,5 @@ +"dataset_name": "Jordan" +"include": "_default_arab_culture_mcq_template_yaml" +"tag": "arab_culture_levant_tasks" +"task": "arab_culture_jordan" +"task_alias": "Jordan" diff --git a/lm_eval/tasks/arab_culture/arab_culture_ksa.yaml b/lm_eval/tasks/arab_culture/arab_culture_ksa.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..07d87cb80c76fccce2d48c950756822a5ebce46e --- /dev/null +++ b/lm_eval/tasks/arab_culture/arab_culture_ksa.yaml @@ -0,0 +1,5 @@ +"dataset_name": "KSA" +"include": "_default_arab_culture_mcq_template_yaml" +"tag": "arab_culture_gulf_tasks" +"task": "arab_culture_ksa" +"task_alias": "KSA" diff --git a/lm_eval/tasks/arab_culture/arab_culture_lebanon.yaml b/lm_eval/tasks/arab_culture/arab_culture_lebanon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..41c2b53cdf96bacc42141038e4917d1e1a9ff614 --- /dev/null +++ b/lm_eval/tasks/arab_culture/arab_culture_lebanon.yaml @@ -0,0 +1,5 @@ +"dataset_name": "Lebanon" +"include": "_default_arab_culture_mcq_template_yaml" +"tag": "arab_culture_levant_tasks" +"task": "arab_culture_lebanon" +"task_alias": "Lebanon" diff --git a/lm_eval/tasks/arab_culture/arab_culture_libya.yaml b/lm_eval/tasks/arab_culture/arab_culture_libya.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e82c5598241a12f58000066fbcfde0a6eb2fa2a2 --- /dev/null +++ b/lm_eval/tasks/arab_culture/arab_culture_libya.yaml @@ -0,0 +1,5 @@ +"dataset_name": "Libya" +"include": "_default_arab_culture_mcq_template_yaml" +"tag": "arab_culture_north_africa_tasks" +"task": "arab_culture_libya" +"task_alias": "Libya" diff --git a/lm_eval/tasks/arab_culture/arab_culture_morocco.yaml b/lm_eval/tasks/arab_culture/arab_culture_morocco.yaml new file mode 100644 index 0000000000000000000000000000000000000000..847a86f5e80e184515b0b8bafcecd3e9dd499590 --- /dev/null +++ b/lm_eval/tasks/arab_culture/arab_culture_morocco.yaml @@ -0,0 +1,5 @@ +"dataset_name": "Morocco" +"include": "_default_arab_culture_mcq_template_yaml" +"tag": "arab_culture_north_africa_tasks" +"task": "arab_culture_morocco" +"task_alias": "Morocco" diff --git a/lm_eval/tasks/arab_culture/arab_culture_palestine.yaml b/lm_eval/tasks/arab_culture/arab_culture_palestine.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..dcbe183bb7a58c2a79d7f4b26d04a52255d9967b --- /dev/null +++ b/lm_eval/tasks/arab_culture/arab_culture_palestine.yaml @@ -0,0 +1,5 @@ +"dataset_name": "Palestine" +"include": "_default_arab_culture_mcq_template_yaml" +"tag": "arab_culture_levant_tasks" +"task": "arab_culture_palestine" +"task_alias": "Palestine" diff --git a/lm_eval/tasks/arab_culture/arab_culture_sudan.yaml b/lm_eval/tasks/arab_culture/arab_culture_sudan.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9920d5ba119f94ff160b567a180cfb7af1f2cbdc --- /dev/null +++ b/lm_eval/tasks/arab_culture/arab_culture_sudan.yaml @@ -0,0 +1,5 @@ +"dataset_name": "Sudan" +"include": "_default_arab_culture_mcq_template_yaml" +"tag": "arab_culture_nile_valley_tasks" +"task": "arab_culture_sudan" +"task_alias": "Sudan" diff --git a/lm_eval/tasks/arab_culture/arab_culture_syria.yaml b/lm_eval/tasks/arab_culture/arab_culture_syria.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0ed6f7672346cd580c4e128c08cf27c3020d92e4 --- /dev/null +++ b/lm_eval/tasks/arab_culture/arab_culture_syria.yaml @@ -0,0 +1,5 @@ +"dataset_name": "Syria" +"include": "_default_arab_culture_mcq_template_yaml" +"tag": "arab_culture_levant_tasks" +"task": "arab_culture_syria" +"task_alias": "Syria" diff --git a/lm_eval/tasks/arab_culture/arab_culture_tunisia.yaml b/lm_eval/tasks/arab_culture/arab_culture_tunisia.yaml new file mode 100644 index 0000000000000000000000000000000000000000..de8d78a88e893e47b250e3f8f5c7d7b22611d26a --- /dev/null +++ b/lm_eval/tasks/arab_culture/arab_culture_tunisia.yaml @@ -0,0 +1,5 @@ +"dataset_name": "Tunisia" +"include": "_default_arab_culture_mcq_template_yaml" +"tag": "arab_culture_north_africa_tasks" +"task": "arab_culture_tunisia" +"task_alias": "Tunisia" diff --git a/lm_eval/tasks/arab_culture/arab_culture_uae.yaml b/lm_eval/tasks/arab_culture/arab_culture_uae.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..5bcbeb83b7ac5fbec45c97802ea32477a055ae75 --- /dev/null +++ b/lm_eval/tasks/arab_culture/arab_culture_uae.yaml @@ -0,0 +1,5 @@ +"dataset_name": "UAE" +"include": "_default_arab_culture_mcq_template_yaml" +"tag": "arab_culture_gulf_tasks" +"task": "arab_culture_uae" +"task_alias": "UAE" diff --git a/lm_eval/tasks/arab_culture/arab_culture_yemen.yaml b/lm_eval/tasks/arab_culture/arab_culture_yemen.yaml new file mode 100644 index 0000000000000000000000000000000000000000..caa32073c216e4eec6c3d1101d6b780146d3b91b --- /dev/null +++ b/lm_eval/tasks/arab_culture/arab_culture_yemen.yaml @@ -0,0 +1,5 @@ +"dataset_name": "Yemen" +"include": "_default_arab_culture_mcq_template_yaml" +"tag": "arab_culture_gulf_tasks" +"task": "arab_culture_yemen" +"task_alias": "Yemen" diff --git a/lm_eval/tasks/arab_culture/prompts.py b/lm_eval/tasks/arab_culture/prompts.py new file mode 100644 index 0000000000000000000000000000000000000000..983bd8b26f25f1db06e4b5314da6be45804dc65e --- /dev/null +++ b/lm_eval/tasks/arab_culture/prompts.py @@ -0,0 +1,73 @@ +REGION_COUNTRY_PROMPT_AR = """ +مهمتك هي اختيار الخيار الأنسب ثقافياً بناءً على السياق المقدم أدناه. + +الموقع: {country}, {region} +الجملة: {first_statement} + +يرجى مراعاة الفروق الثقافية للموقع المحدد واختيار الإجابة الأكثر ملاءمة من الخيارات المتاحة. + +الخيارات: +{choices} +""" + +REGION_PROMPT_AR = """ +مهمتك هي اختيار الخيار الأنسب ثقافياً بناءً على السياق المقدم أدناه. + +الموقع: {region} +الجملة: {first_statement} + +يرجى مراعاة الفروق الثقافية للموقع المحدد واختيار الإجابة الأكثر ملاءمة من الخيارات المتاحة. + +الخيارات: +{choices} +""" + +BASE_PROMPT_AR = """ +مهمتك هي اختيار الخيار الأنسب ثقافياً بناءً على السياق المقدم أدناه. + +الجملة: {first_statement} + +يرجى مراعاة الفروق الثقافية واختيار الإجابة الأكثر ملاءمة من الخيارات المتاحة. 
+ +الخيارات: +{choices} +""" + +REGION_COUNTRY_PROMPT = """ +You are tasked with selecting the most culturally appropriate option based on the context provided below. + +Location: {country}, {region} +Statement: {first_statement} + +Consider the cultural nuances of the specified location and choose the most suitable response from the options provided. + +Options: +{choices} +""" +REGION_PROMPT = """ +You are tasked with selecting the most culturally appropriate option based on the context provided below. + +Location: {region} +Statement: {first_statement} + +Consider the cultural nuances of the specified location and choose the most suitable response from the options provided. + +Options: +{choices} +""" +BASE_PROMPT = """ +You are tasked with selecting the most culturally appropriate option based on the context provided below. + +Statement: {first_statement} + +Consider the cultural nuances and choose the most suitable response from the options provided. + +Options: +{choices} +""" + + +JAIS_CHAT_EN = """### Instruction: Your name is Jais, and you are named after Jebel Jais, the highest mountain in UAE. You are built by Core42. You are the world's most advanced Arabic large language model with 30b parameters. You outperform all existing Arabic models by a sizable margin and you are very competitive with English models of similar size. You can answer in Arabic and English only. You are a helpful, respectful and honest assistant. When answering, abide by the following guidelines meticulously: Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, explicit, offensive, toxic, dangerous, or illegal content. Do not give medical, legal, financial, or professional advice. Never assist in or promote illegal activities. Always encourage legal and responsible actions. Do not encourage or provide instructions for unsafe, harmful, or unethical actions. Do not create or share misinformation or fake news. 
Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. Prioritize the well-being and the moral integrity of users. Avoid using toxic, derogatory, or offensive language. Maintain a respectful tone. Do not generate, promote, or engage in discussions about adult content. Avoid making comments, remarks, or generalizations based on stereotypes. Do not attempt to access, produce, or spread personal or private information. Always respect user confidentiality. Stay positive and do not say bad things about anything. Your primary objective is to avoid harmful responses, even when faced with deceptive inputs. Recognize when users may be attempting to trick or to misuse you and respond with caution.\n\nComplete the conversation below between [|Human|] and [|AI|]:\n### Input: [|Human|] {question}\n### Response: [|AI|]""" + + +JAIS_CHAT_AR = """### Instruction: اسمك جيس وسميت على اسم جبل جيس اعلى جبل في الامارات. تم بنائك بواسطة Inception و MBZUAI. أنت نموذج اللغة العربية الأكثر تقدمًا في العالم مع بارامترات 13B. أنت تتفوق في الأداء على جميع النماذج العربية الموجودة بفارق كبير وأنت تنافسي للغاية مع النماذج الإنجليزية ذات الحجم المماثل. يمكنك الإجابة باللغتين العربية والإنجليزية فقط. أنت مساعد مفيد ومحترم وصادق. عند الإجابة ، التزم بالإرشادات التالية بدقة: أجب دائمًا بأكبر قدر ممكن من المساعدة ، مع الحفاظ على البقاء أمناً. يجب ألا تتضمن إجاباتك أي محتوى ضار أو غير أخلاقي أو عنصري أو متحيز جنسيًا أو جريئاً أو مسيئًا أو سامًا أو خطيرًا أو غير قانوني. لا تقدم نصائح طبية أو قانونية أو مالية أو مهنية. لا تساعد أبدًا في أنشطة غير قانونية أو تروج لها. دائما تشجيع الإجراءات القانونية والمسؤولة. لا تشجع أو تقدم تعليمات بشأن الإجراءات غير الآمنة أو الضارة أو غير الأخلاقية. لا تنشئ أو تشارك معلومات مضللة أو أخبار كاذبة. 
يرجى التأكد من أن ردودك غير متحيزة اجتماعيًا وإيجابية بطبيعتها. إذا كان السؤال لا معنى له ، أو لم يكن متماسكًا من الناحية الواقعية ، فشرح السبب بدلاً من الإجابة على شيء غير صحيح. إذا كنت لا تعرف إجابة السؤال ، فالرجاء عدم مشاركة معلومات خاطئة. إعطاء الأولوية للرفاهية والنزاهة الأخلاقية للمستخدمين. تجنب استخدام لغة سامة أو مهينة أو مسيئة. حافظ على نبرة محترمة. لا تنشئ أو تروج أو تشارك في مناقشات حول محتوى للبالغين. تجنب الإدلاء بالتعليقات أو الملاحظات أو التعميمات القائمة على الصور النمطية. لا تحاول الوصول إلى معلومات شخصية أو خاصة أو إنتاجها أو نشرها. احترم دائما سرية المستخدم. كن إيجابيا ولا تقل أشياء سيئة عن أي شيء. هدفك الأساسي هو تجنب الاجابات المؤذية ، حتى عند مواجهة مدخلات خادعة. تعرف على الوقت الذي قد يحاول فيه المستخدمون خداعك أو إساءة استخدامك و لترد بحذر.\n\nأكمل المحادثة أدناه بين [|Human|] و [|AI|]:\n### Input: [|Human|] {question}\n### Response: [|AI|]""" diff --git a/lm_eval/tasks/arab_culture/utils_mcq.py b/lm_eval/tasks/arab_culture/utils_mcq.py new file mode 100644 index 0000000000000000000000000000000000000000..315c50f5e63db3b49ae5acbc5587f9e8ef1cf39d --- /dev/null +++ b/lm_eval/tasks/arab_culture/utils_mcq.py @@ -0,0 +1,112 @@ +import os + +from lm_eval.tasks.arab_culture.prompts import ( + BASE_PROMPT, + BASE_PROMPT_AR, + JAIS_CHAT_AR, + JAIS_CHAT_EN, + REGION_COUNTRY_PROMPT, + REGION_COUNTRY_PROMPT_AR, + REGION_PROMPT, + REGION_PROMPT_AR, +) + + +### get the conutry variable from environment + +### Set this to one to add the country and region information to the prompt +COUNTRY = True if os.getenv("COUNTRY", True) == "True" else False +### Set this to one to add the region information to the prompt +REGION = True if os.getenv("REGION", True) == "True" else False +### Set this to change between Arabic and English for the answer keys and the choices keys +ARABIC = True if os.getenv("ARABIC", True) == "True" else False +### Get the model name +MODEL_NAME = os.getenv("MODEL_NAME") +## Uncomment this to check if the environment variables are set 
correctly +# print(f'Task settings: COUNTRY: {COUNTRY}, REGION: {REGION}, ARABIC: {ARABIC}', MODEL_NAME: {MODEL_NAME}) + +en_ar_countries_regions = { + "Egypt": "مصر", + "Morocco": "المغرب", + "Algeria": "الجزائر", + "Libya": "ليبيا", + "Sudan": "السودان", + "Tunisia": "تونس", + "Jordan": "الأردن", + "Lebanon": "لبنان", + "Syria": "سوريا", + "Palestine": "فلسطين", + "Yemen": "اليمن", + "UAE": "الإمارات", + "KSA": "السعودية", + "Gulf": "الخليج", + "Levant": "الشام", + "North Africa": "شمال أفريقيا", + "Nile Valley": "وادي النيل", +} + + +def doc_to_text(doc): + country = "" if not doc["country"] else doc["country"] + region = "" if not doc["region"] else doc["region"] + first_statement = doc["first_statement"].strip() + + ## We don't have a setting for only information about the country without the region + if COUNTRY: + assert REGION, ( + "If you want to add the country information, you must also add the region information" + ) + + ## convert contry and region name to arabic if the language is arabic + if ARABIC: + country = en_ar_countries_regions[country] + region = en_ar_countries_regions[region] + + choices = doc["options"] + choices_str = "" + for i in range(3): + key = choices["arabic_keys"][i] if ARABIC else choices["english_keys"][i] + choice_str = key + ". 
" + choices["text"][i].strip() + "\n" + choices_str += choice_str + + if COUNTRY and REGION: + cur_prompt = REGION_COUNTRY_PROMPT_AR if ARABIC else REGION_COUNTRY_PROMPT + doc_text = cur_prompt.format( + country=country, + region=region, + first_statement=first_statement, + choices=choices_str, + ) + elif REGION: + cur_prompt = REGION_PROMPT_AR if ARABIC else REGION_PROMPT + doc_text = cur_prompt.format( + region=region, first_statement=first_statement, choices=choices_str + ) + else: + cur_prompt = BASE_PROMPT_AR if ARABIC else BASE_PROMPT + doc_text = cur_prompt.format( + first_statement=first_statement, choices=choices_str + ) + + ### apply jais chat template + if MODEL_NAME and "jais" in MODEL_NAME and "chat" in MODEL_NAME: + if ARABIC: + doc_text = JAIS_CHAT_AR.format(question=doc_text) + else: + doc_text = JAIS_CHAT_EN.format(question=doc_text) + + return doc_text + + +def doc_to_choice(doc): + return doc["options"]["arabic_keys"] if ARABIC else doc["options"]["english_keys"] + + +def doc_to_target(doc): + ans = ( + doc["answer_key"]["arabic_answer_key"] + if ARABIC + else doc["answer_key"]["english_answer_key"] + ) + ans = ans.strip() + return ans diff --git a/lm_eval/tasks/arab_culture_completion/README.md b/lm_eval/tasks/arab_culture_completion/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f8bc5a8c97da0d644460ad0bfc5597d337c28679 --- /dev/null +++ b/lm_eval/tasks/arab_culture_completion/README.md @@ -0,0 +1,70 @@ +# Arab Culture + +### Paper + +Title: Commonsense Reasoning in Arab Culture + + +Abstract: https://arxiv.org/abs/2502.12788 + +Despite progress in Arabic large language models, such as Jais and AceGPT, their evaluation on commonsense reasoning has largely relied on machine-translated datasets, which lack cultural depth and may introduce Anglocentric biases. Commonsense reasoning is shaped by geographical and cultural contexts, and existing English datasets fail to capture the diversity of the Arab world. 
To address this, we introduce \datasetname, a commonsense reasoning dataset in Modern Standard Arabic (MSA), covering cultures of 13 countries across the Gulf, Levant, North Africa, and the Nile Valley. The dataset was built from scratch by engaging native speakers to write and validate culturally relevant questions for their respective countries. \datasetname spans 12 daily life domains with 54 fine-grained subtopics, reflecting various aspects of social norms, traditions, and everyday experiences. Zero-shot evaluations show that open-weight language models with up to 32B parameters struggle to comprehend diverse Arab cultures, with performance varying across regions. These findings highlight the need for more culturally aware models and datasets tailored to the Arabic-speaking world. + +Homepage: https://github.com/fajri91/ArabicCulture + + +### Citation + +``` +@misc{sadallah2025commonsensereasoningarabculture, + title={Commonsense Reasoning in Arab Culture}, + author={Abdelrahman Sadallah and Junior Cedric Tonga and Khalid Almubarak and Saeed Almheiri and Farah Atif and Chatrine Qwaider and Karima Kadaoui and Sara Shatnawi and Yaser Alesh and Fajri Koto}, + year={2025}, + eprint={2502.12788}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2502.12788}, +} +``` + +### There are two variants of this task: `arab_culture`, and `arab_culture_completion` + +- The `arab_culture` is the normal MCQ evaluation type, which appends the answers to the question, and then measures the likelihood of the different choice markers (A,B,C or "أ","ب","ج"). For more info, follow the MMLU style [template](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/mmlu/default/_default_template_yaml#L7-L8) +- The `arab_culture_completion` does the evaluation in a sentence completion manner, by appending each answer to the question separately and choosing the answer with the higher likelihood.
See [this](https://github.com/EleutherAI/lm-evaluation-harness/blob/1f9bc88fe61f6bfa36f74e91ce3d59ab5685e4f1/lm_eval/tasks/arc/arc_easy.yaml#L10-L12) for more information + +### Groups and Tasks + +#### Groups + +* `arabculture`: evaluates all ArabCulture tasks. + +* `arab_culture_gulf`: evaluates Gulf countries ArabCulture tasks. +* `arab_culture_levant`: evaluates Levant countries ArabCulture tasks. +* `arab_culture_nile_valley`: evaluates Nile Valley countries ArabCulture tasks. +* `arab_culture_north_africa`: evaluates North Africa ArabCulture tasks. + +### Evaluation modes +This benchmark allows for different evaluation settings by allowing extra context to be added for the model: + +We have three settings: +* without any information +``` +COUNTRY=False +REGION=False +``` +* with only region information +``` +COUNTRY=False +REGION=True +``` +* with region and country information +``` +COUNTRY=True +REGION=True +``` + +**Please add these flags as environment variables.** + + +* We also allow for prompting in English, which we found to achieve higher results on most of the evaluated models (please refer to our paper). + +* To change the language of the prompt, define the `ARABIC` environment variable.
diff --git a/lm_eval/tasks/arab_culture_completion/_arab_culture_completion.yaml b/lm_eval/tasks/arab_culture_completion/_arab_culture_completion.yaml new file mode 100644 index 0000000000000000000000000000000000000000..814f366e8d106201b536a4bf85447fcaf09aebdc --- /dev/null +++ b/lm_eval/tasks/arab_culture_completion/_arab_culture_completion.yaml @@ -0,0 +1,12 @@ +aggregate_metric_list: + metric: acc + weight_by_size: true +group: arab_culture_completion +metadata: + description: Arab Culture tasks + version: 0 +task: +- arab_culture_completion_gulf +- arab_culture_completion_levant +- arab_culture_completion_north_africa +- arab_culture_completion_nile_valley diff --git a/lm_eval/tasks/arab_culture_completion/_arab_culture_completion_gulf.yaml b/lm_eval/tasks/arab_culture_completion/_arab_culture_completion_gulf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b342b428b85e9625d3a8045902a6f4355df98306 --- /dev/null +++ b/lm_eval/tasks/arab_culture_completion/_arab_culture_completion_gulf.yaml @@ -0,0 +1,10 @@ +aggregate_metric_list: + metric: acc + weight_by_size: true +group: arab_culture_completion_gulf +group_alias: Gulf +metadata: + description: arab Culture tasks + version: 0 +task: +- arab_culture_completion_gulf_tasks diff --git a/lm_eval/tasks/arab_culture_completion/_arab_culture_completion_levant.yaml b/lm_eval/tasks/arab_culture_completion/_arab_culture_completion_levant.yaml new file mode 100644 index 0000000000000000000000000000000000000000..199f68dc05f3125df329938937e6c697a237485e --- /dev/null +++ b/lm_eval/tasks/arab_culture_completion/_arab_culture_completion_levant.yaml @@ -0,0 +1,10 @@ +aggregate_metric_list: + metric: acc + weight_by_size: true +group: arab_culture_completion_levant +group_alias: Levant +metadata: + description: arab Culture tasks + version: 0 +task: +- arab_culture_completion_levant_tasks diff --git a/lm_eval/tasks/arab_culture_completion/_arab_culture_completion_nile_valley.yaml 
b/lm_eval/tasks/arab_culture_completion/_arab_culture_completion_nile_valley.yaml new file mode 100644 index 0000000000000000000000000000000000000000..284711eeb9af2fd94efd0f266a6d9b3a5764640d --- /dev/null +++ b/lm_eval/tasks/arab_culture_completion/_arab_culture_completion_nile_valley.yaml @@ -0,0 +1,10 @@ +aggregate_metric_list: + metric: acc + weight_by_size: true +group: arab_culture_completion_nile_valley +group_alias: Nile Valley +metadata: + description: arab Culture tasks + version: 0 +task: +- arab_culture_completion_nile_valley_tasks diff --git a/lm_eval/tasks/arab_culture_completion/_arab_culture_completion_north_africa.yaml b/lm_eval/tasks/arab_culture_completion/_arab_culture_completion_north_africa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..10c32d9fa17fa8a0da4a204bce77f48dd191a1c7 --- /dev/null +++ b/lm_eval/tasks/arab_culture_completion/_arab_culture_completion_north_africa.yaml @@ -0,0 +1,10 @@ +aggregate_metric_list: + metric: acc + weight_by_size: true +group: arab_culture_completion_north_africa +group_alias: North Africa +metadata: + description: arab Culture tasks + version: 0 +task: +- arab_culture_completion_north_africa_tasks diff --git a/lm_eval/tasks/arab_culture_completion/_default_arab_culture_completion_template_yaml b/lm_eval/tasks/arab_culture_completion/_default_arab_culture_completion_template_yaml new file mode 100644 index 0000000000000000000000000000000000000000..6d5961ac605a25e3bfe37e757f2f755ba6c361da --- /dev/null +++ b/lm_eval/tasks/arab_culture_completion/_default_arab_culture_completion_template_yaml @@ -0,0 +1,19 @@ +dataset_path: boda/arabic_cluture +test_split: test +fewshot_split: test +fewshot_config: + sampler: first_n +output_type: multiple_choice +doc_to_text: !function utils_completion.doc_to_text +doc_to_choice: !function utils_completion.doc_to_choice +doc_to_target: !function utils_completion.doc_to_target +target_delimiter: "" +metric_list: + - metric: acc + aggregation: mean + 
higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 0.0 diff --git a/lm_eval/tasks/arab_culture_completion/_generate_configs.py b/lm_eval/tasks/arab_culture_completion/_generate_configs.py new file mode 100644 index 0000000000000000000000000000000000000000..d5c530f53a64736237147728211d4ae72b880380 --- /dev/null +++ b/lm_eval/tasks/arab_culture_completion/_generate_configs.py @@ -0,0 +1,125 @@ +""" +Take in a YAML, and output all "other" splits with this YAML +""" + +import argparse +import logging +import os + +import yaml +from tqdm import tqdm + + +eval_logger = logging.getLogger("lm-eval") + +countries = { + "KSA": "Gulf", + "UAE": "Gulf", + "Yemen": "Gulf", + "Lebanon": "Levant", + "Syria": "Levant", + "Palestine": "Levant", + "Jordan": "Levant", + "Tunisia": "North Africa", + "Algeria": "North Africa", + "Morocco": "North Africa", + "Libya": "North Africa", + "Egypt": "Nile Valley", + "Sudan": "Nile Valley", +} + +VERSION = 0 + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--base_yaml_path", default="_default_arab_culture_completion_template_yaml" + ) + parser.add_argument("--save_prefix_path", default="arab_culture_completion") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + + # get filename of base_yaml so we can `"include": ` it in our "other" YAMLs. 
+ base_yaml_name = os.path.split(args.base_yaml_path)[-1] + # with open(args.base_yaml_path, encoding="utf-8") as f: + # base_yaml = yaml.full_load(f) + + ALL_REGIONS = [] + for country, region in tqdm(countries.items()): + if region not in ALL_REGIONS: + ALL_REGIONS.append(region) + + # description = f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n" + + yaml_dict = { + "include": base_yaml_name, + "tag": f"arab_culture_completion_{region.lower().replace(' ', '_')}_tasks", + "task": f"arab_culture_completion_{country.lower().replace(' ', '_')}", + "task_alias": country, + "dataset_name": country, + # "description": description, + } + + file_save_path = ( + args.save_prefix_path + + f"_{country.lower().replace(' ', '_').replace('(', '').replace(')', '')}.yaml" + ) + eval_logger.info(f"Saving yaml for subset {country} to {file_save_path}") + with open(file_save_path, "w", encoding="utf-8") as yaml_file: + yaml.dump( + yaml_dict, + yaml_file, + allow_unicode=True, + default_style='"', + ) + + arab_culture_completion_regions = [ + f"arab_culture_completion_{region.lower().replace(' ', '_')}" + for region in ALL_REGIONS + ] + + file_save_path = args.save_prefix_path + ".yaml" + + eval_logger.info(f"Saving benchmark config to {file_save_path}") + + for region in ALL_REGIONS: + file_save_path = ( + args.save_prefix_path + f"_{region.lower().replace(' ', '_')}.yaml" + ) + eval_logger.info(f"Saving yaml for subset {region} to {file_save_path}") + with open("_" + file_save_path, "w", encoding="utf-8") as yaml_file: + yaml.dump( + { + "group": f"arab_culture_completion_{region.lower().replace(' ', '_')}", + "group_alias": region, + "task": [ + f"arab_culture_completion_{region.lower().replace(' ', '_')}_tasks" + ], + "aggregate_metric_list": {"metric": "acc", "weight_by_size": True}, + "metadata": { + "description": "arab Culture tasks", + "version": VERSION, + }, + }, + yaml_file, + indent=4, + default_flow_style=False, + 
) + + file_save_path = args.save_prefix_path + ".yaml" + with open("_" + file_save_path, "w", encoding="utf-8") as yaml_file: + yaml.dump( + { + "group": "arab_culture_completion", + "task": arab_culture_completion_regions, + "aggregate_metric_list": {"metric": "acc", "weight_by_size": True}, + "metadata": {"description": "Arab Culture tasks", "version": VERSION}, + }, + yaml_file, + indent=4, + default_flow_style=False, + ) diff --git a/lm_eval/tasks/arab_culture_completion/arab_culture_completion_algeria.yaml b/lm_eval/tasks/arab_culture_completion/arab_culture_completion_algeria.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e3cb7d8bac370f66a9984155c3bf2a28f30201b0 --- /dev/null +++ b/lm_eval/tasks/arab_culture_completion/arab_culture_completion_algeria.yaml @@ -0,0 +1,5 @@ +"dataset_name": "Algeria" +"include": "_default_arab_culture_completion_template_yaml" +"tag": "arab_culture_completion_north_africa_tasks" +"task": "arab_culture_completion_algeria" +"task_alias": "Algeria" diff --git a/lm_eval/tasks/arab_culture_completion/arab_culture_completion_egypt.yaml b/lm_eval/tasks/arab_culture_completion/arab_culture_completion_egypt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f740d4b8b575705e740c4d0d3ffb079992e2b879 --- /dev/null +++ b/lm_eval/tasks/arab_culture_completion/arab_culture_completion_egypt.yaml @@ -0,0 +1,5 @@ +"dataset_name": "Egypt" +"include": "_default_arab_culture_completion_template_yaml" +"tag": "arab_culture_completion_nile_valley_tasks" +"task": "arab_culture_completion_egypt" +"task_alias": "Egypt" diff --git a/lm_eval/tasks/arab_culture_completion/arab_culture_completion_jordan.yaml b/lm_eval/tasks/arab_culture_completion/arab_culture_completion_jordan.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dec15211d4aabc3b45da990fb573ebb5e48b1546 --- /dev/null +++ b/lm_eval/tasks/arab_culture_completion/arab_culture_completion_jordan.yaml @@ -0,0 +1,5 @@ 
+"dataset_name": "Jordan" +"include": "_default_arab_culture_completion_template_yaml" +"tag": "arab_culture_completion_levant_tasks" +"task": "arab_culture_completion_jordan" +"task_alias": "Jordan" diff --git a/lm_eval/tasks/arab_culture_completion/arab_culture_completion_ksa.yaml b/lm_eval/tasks/arab_culture_completion/arab_culture_completion_ksa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ec1ea8900e9d82b771bed92de954172c269f899f --- /dev/null +++ b/lm_eval/tasks/arab_culture_completion/arab_culture_completion_ksa.yaml @@ -0,0 +1,5 @@ +"dataset_name": "KSA" +"include": "_default_arab_culture_completion_template_yaml" +"tag": "arab_culture_completion_gulf_tasks" +"task": "arab_culture_completion_ksa" +"task_alias": "KSA" diff --git a/lm_eval/tasks/arab_culture_completion/arab_culture_completion_lebanon.yaml b/lm_eval/tasks/arab_culture_completion/arab_culture_completion_lebanon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5f31061ffacc25a44778c63cb723c6cc329e99b4 --- /dev/null +++ b/lm_eval/tasks/arab_culture_completion/arab_culture_completion_lebanon.yaml @@ -0,0 +1,5 @@ +"dataset_name": "Lebanon" +"include": "_default_arab_culture_completion_template_yaml" +"tag": "arab_culture_completion_levant_tasks" +"task": "arab_culture_completion_lebanon" +"task_alias": "Lebanon" diff --git a/lm_eval/tasks/arab_culture_completion/arab_culture_completion_libya.yaml b/lm_eval/tasks/arab_culture_completion/arab_culture_completion_libya.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2541f87c0e2bb64f1659d1396d453ca86a40de37 --- /dev/null +++ b/lm_eval/tasks/arab_culture_completion/arab_culture_completion_libya.yaml @@ -0,0 +1,5 @@ +"dataset_name": "Libya" +"include": "_default_arab_culture_completion_template_yaml" +"tag": "arab_culture_completion_north_africa_tasks" +"task": "arab_culture_completion_libya" +"task_alias": "Libya" diff --git 
a/lm_eval/tasks/arab_culture_completion/arab_culture_completion_morocco.yaml b/lm_eval/tasks/arab_culture_completion/arab_culture_completion_morocco.yaml new file mode 100644 index 0000000000000000000000000000000000000000..86e1cc65d649e69caa27056b5b8892c932aa4a7f --- /dev/null +++ b/lm_eval/tasks/arab_culture_completion/arab_culture_completion_morocco.yaml @@ -0,0 +1,5 @@ +"dataset_name": "Morocco" +"include": "_default_arab_culture_completion_template_yaml" +"tag": "arab_culture_completion_north_africa_tasks" +"task": "arab_culture_completion_morocco" +"task_alias": "Morocco" diff --git a/lm_eval/tasks/arab_culture_completion/arab_culture_completion_palestine.yaml b/lm_eval/tasks/arab_culture_completion/arab_culture_completion_palestine.yaml new file mode 100644 index 0000000000000000000000000000000000000000..44731f7bbd365ac136a07cc8893502969bbada20 --- /dev/null +++ b/lm_eval/tasks/arab_culture_completion/arab_culture_completion_palestine.yaml @@ -0,0 +1,5 @@ +"dataset_name": "Palestine" +"include": "_default_arab_culture_completion_template_yaml" +"tag": "arab_culture_completion_levant_tasks" +"task": "arab_culture_completion_palestine" +"task_alias": "Palestine" diff --git a/lm_eval/tasks/arab_culture_completion/arab_culture_completion_sudan.yaml b/lm_eval/tasks/arab_culture_completion/arab_culture_completion_sudan.yaml new file mode 100644 index 0000000000000000000000000000000000000000..76282e98fa880d923c9985b37d01e5dd4fb5a7ee --- /dev/null +++ b/lm_eval/tasks/arab_culture_completion/arab_culture_completion_sudan.yaml @@ -0,0 +1,5 @@ +"dataset_name": "Sudan" +"include": "_default_arab_culture_completion_template_yaml" +"tag": "arab_culture_completion_nile_valley_tasks" +"task": "arab_culture_completion_sudan" +"task_alias": "Sudan" diff --git a/lm_eval/tasks/arab_culture_completion/arab_culture_completion_syria.yaml b/lm_eval/tasks/arab_culture_completion/arab_culture_completion_syria.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..33a07dd9a546540cc43b6ee22fc0d6b8676e3293 --- /dev/null +++ b/lm_eval/tasks/arab_culture_completion/arab_culture_completion_syria.yaml @@ -0,0 +1,5 @@ +"dataset_name": "Syria" +"include": "_default_arab_culture_completion_template_yaml" +"tag": "arab_culture_completion_levant_tasks" +"task": "arab_culture_completion_syria" +"task_alias": "Syria" diff --git a/lm_eval/tasks/arab_culture_completion/arab_culture_completion_tunisia.yaml b/lm_eval/tasks/arab_culture_completion/arab_culture_completion_tunisia.yaml new file mode 100644 index 0000000000000000000000000000000000000000..41c53529f12350f5cd169938f691d8b50cb9f795 --- /dev/null +++ b/lm_eval/tasks/arab_culture_completion/arab_culture_completion_tunisia.yaml @@ -0,0 +1,5 @@ +"dataset_name": "Tunisia" +"include": "_default_arab_culture_completion_template_yaml" +"tag": "arab_culture_completion_north_africa_tasks" +"task": "arab_culture_completion_tunisia" +"task_alias": "Tunisia" diff --git a/lm_eval/tasks/arab_culture_completion/arab_culture_completion_uae.yaml b/lm_eval/tasks/arab_culture_completion/arab_culture_completion_uae.yaml new file mode 100644 index 0000000000000000000000000000000000000000..479440534a43628bcefc7812e7ee8967cdad1bce --- /dev/null +++ b/lm_eval/tasks/arab_culture_completion/arab_culture_completion_uae.yaml @@ -0,0 +1,5 @@ +"dataset_name": "UAE" +"include": "_default_arab_culture_completion_template_yaml" +"tag": "arab_culture_completion_gulf_tasks" +"task": "arab_culture_completion_uae" +"task_alias": "UAE" diff --git a/lm_eval/tasks/arab_culture_completion/arab_culture_completion_yemen.yaml b/lm_eval/tasks/arab_culture_completion/arab_culture_completion_yemen.yaml new file mode 100644 index 0000000000000000000000000000000000000000..88dd6dd5804beefc749f66aa1f133dfb33a95894 --- /dev/null +++ b/lm_eval/tasks/arab_culture_completion/arab_culture_completion_yemen.yaml @@ -0,0 +1,5 @@ +"dataset_name": "Yemen" +"include": 
"_default_arab_culture_completion_template_yaml" +"tag": "arab_culture_completion_gulf_tasks" +"task": "arab_culture_completion_yemen" +"task_alias": "Yemen" diff --git a/lm_eval/tasks/arab_culture_completion/prompts.py b/lm_eval/tasks/arab_culture_completion/prompts.py new file mode 100644 index 0000000000000000000000000000000000000000..b042c056d94e95e854b83672abd36e7cf81ef4d5 --- /dev/null +++ b/lm_eval/tasks/arab_culture_completion/prompts.py @@ -0,0 +1,56 @@ +REGION_COUNTRY_PROMPT_AR = """ +مهمتك هي اختيار الخيار الأنسب ثقافياً بناءً على السياق المقدم أدناه. + +الموقع: {country}, {region} +الجملة: {first_statement} + +يرجى مراعاة الفروق الثقافية للموقع المحدد واختيار الإجابة الأكثر ملاءمة من الخيارات المتاحة. +""" + +REGION_PROMPT_AR = """ +مهمتك هي اختيار الخيار الأنسب ثقافياً بناءً على السياق المقدم أدناه. + +الموقع: {region} +الجملة: {first_statement} + +يرجى مراعاة الفروق الثقافية للموقع المحدد واختيار الإجابة الأكثر ملاءمة من الخيارات المتاحة. +""" + +BASE_PROMPT_AR = """ +مهمتك هي اختيار الخيار الأنسب ثقافياً بناءً على السياق المقدم أدناه. + +الجملة: {first_statement} + +يرجى مراعاة الفروق الثقافية واختيار الإجابة الأكثر ملاءمة من الخيارات المتاحة. +""" + + +REGION_COUNTRY_PROMPT = """ +You are tasked with selecting the most culturally appropriate option based on the context provided below. + +Location: {country}, {region} +Statement: {first_statement} + +Consider the cultural nuances of the specified location and choose the most suitable response from the options provided. +""" +REGION_PROMPT = """ +You are tasked with selecting the most culturally appropriate option based on the context provided below. + +Location: {region} +Statement: {first_statement} + +Consider the cultural nuances of the specified location and choose the most suitable response from the options provided. +""" +BASE_PROMPT = """ +You are tasked with selecting the most culturally appropriate option based on the context provided below. 
+ +Statement: {first_statement} + +Consider the cultural nuances and choose the most suitable response from the options provided. +""" + + +JAIS_CHAT_EN = """### Instruction: Your name is Jais, and you are named after Jebel Jais, the highest mountain in UAE. You are built by Core42. You are the world's most advanced Arabic large language model with 30b parameters. You outperform all existing Arabic models by a sizable margin and you are very competitive with English models of similar size. You can answer in Arabic and English only. You are a helpful, respectful and honest assistant. When answering, abide by the following guidelines meticulously: Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, explicit, offensive, toxic, dangerous, or illegal content. Do not give medical, legal, financial, or professional advice. Never assist in or promote illegal activities. Always encourage legal and responsible actions. Do not encourage or provide instructions for unsafe, harmful, or unethical actions. Do not create or share misinformation or fake news. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. Prioritize the well-being and the moral integrity of users. Avoid using toxic, derogatory, or offensive language. Maintain a respectful tone. Do not generate, promote, or engage in discussions about adult content. Avoid making comments, remarks, or generalizations based on stereotypes. Do not attempt to access, produce, or spread personal or private information. Always respect user confidentiality. Stay positive and do not say bad things about anything. Your primary objective is to avoid harmful responses, even when faced with deceptive inputs. 
Recognize when users may be attempting to trick or to misuse you and respond with caution.\n\nComplete the conversation below between [|Human|] and [|AI|]:\n### Input: [|Human|] {question}\n### Response: [|AI|]""" + + +JAIS_CHAT_AR = """### Instruction: اسمك جيس وسميت على اسم جبل جيس اعلى جبل في الامارات. تم بنائك بواسطة Inception و MBZUAI. أنت نموذج اللغة العربية الأكثر تقدمًا في العالم مع بارامترات 13B. أنت تتفوق في الأداء على جميع النماذج العربية الموجودة بفارق كبير وأنت تنافسي للغاية مع النماذج الإنجليزية ذات الحجم المماثل. يمكنك الإجابة باللغتين العربية والإنجليزية فقط. أنت مساعد مفيد ومحترم وصادق. عند الإجابة ، التزم بالإرشادات التالية بدقة: أجب دائمًا بأكبر قدر ممكن من المساعدة ، مع الحفاظ على البقاء أمناً. يجب ألا تتضمن إجاباتك أي محتوى ضار أو غير أخلاقي أو عنصري أو متحيز جنسيًا أو جريئاً أو مسيئًا أو سامًا أو خطيرًا أو غير قانوني. لا تقدم نصائح طبية أو قانونية أو مالية أو مهنية. لا تساعد أبدًا في أنشطة غير قانونية أو تروج لها. دائما تشجيع الإجراءات القانونية والمسؤولة. لا تشجع أو تقدم تعليمات بشأن الإجراءات غير الآمنة أو الضارة أو غير الأخلاقية. لا تنشئ أو تشارك معلومات مضللة أو أخبار كاذبة. يرجى التأكد من أن ردودك غير متحيزة اجتماعيًا وإيجابية بطبيعتها. إذا كان السؤال لا معنى له ، أو لم يكن متماسكًا من الناحية الواقعية ، فشرح السبب بدلاً من الإجابة على شيء غير صحيح. إذا كنت لا تعرف إجابة السؤال ، فالرجاء عدم مشاركة معلومات خاطئة. إعطاء الأولوية للرفاهية والنزاهة الأخلاقية للمستخدمين. تجنب استخدام لغة سامة أو مهينة أو مسيئة. حافظ على نبرة محترمة. لا تنشئ أو تروج أو تشارك في مناقشات حول محتوى للبالغين. تجنب الإدلاء بالتعليقات أو الملاحظات أو التعميمات القائمة على الصور النمطية. لا تحاول الوصول إلى معلومات شخصية أو خاصة أو إنتاجها أو نشرها. احترم دائما سرية المستخدم. كن إيجابيا ولا تقل أشياء سيئة عن أي شيء. هدفك الأساسي هو تجنب الاجابات المؤذية ، حتى عند مواجهة مدخلات خادعة. 
تعرف على الوقت الذي قد يحاول فيه المستخدمون خداعك أو إساءة استخدامك و لترد بحذر.\n\nأكمل المحادثة أدناه بين [|Human|] و [|AI|]:\n### Input: [|Human|] {question}\n### Response: [|AI|]""" diff --git a/lm_eval/tasks/arab_culture_completion/utils_completion.py b/lm_eval/tasks/arab_culture_completion/utils_completion.py new file mode 100644 index 0000000000000000000000000000000000000000..6c63995876952eaafdf86d82f87825b042700df2 --- /dev/null +++ b/lm_eval/tasks/arab_culture_completion/utils_completion.py @@ -0,0 +1,102 @@ +import os + +from lm_eval.tasks.arab_culture_completion.prompts import ( + BASE_PROMPT, + BASE_PROMPT_AR, + JAIS_CHAT_AR, + JAIS_CHAT_EN, + REGION_COUNTRY_PROMPT, + REGION_COUNTRY_PROMPT_AR, + REGION_PROMPT, + REGION_PROMPT_AR, +) + + +### get the conutry variable from environment + + +### Set this to one to add the country and region information to the prompt +COUNTRY = True if os.getenv("COUNTRY", True) == "True" else False +### Set this to one to add the region information to the prompt +REGION = True if os.getenv("REGION", True) == "True" else False +### Set this to change between Arabic and English for the answer keys and the choices keys +ARABIC = True if os.getenv("ARABIC", True) == "True" else False +### Get the model name +MODEL_NAME = os.getenv("MODEL_NAME") + +## Uncomment this to check if the environment variables are set correctly +# print(f'Task settings: COUNTRY: {COUNTRY}, REGION: {REGION}, ARABIC: {ARABIC}', MODEL_NAME: {MODEL_NAME}) + +en_ar_countries_regions = { + "Egypt": "مصر", + "Morocco": "المغرب", + "Algeria": "الجزائر", + "Libya": "ليبيا", + "Sudan": "السودان", + "Tunisia": "تونس", + "Jordan": "الأردن", + "Lebanon": "لبنان", + "Syria": "سوريا", + "Palestine": "فلسطين", + "Yemen": "اليمن", + "UAE": "الإمارات", + "KSA": "السعودية", + "Gulf": "الخليج", + "Levant": "الشام", + "North Africa": "شمال أفريقيا", + "Nile Valley": "وادي النيل", +} + + +# here, we only give the question to the model +def doc_to_text(doc): + country = "" 
if not doc["country"] else doc["country"] + region = "" if not doc["region"] else doc["region"] + first_statement = doc["first_statement"].strip() + + ## We don't have a setting for only information about the country without the region + if COUNTRY: + assert REGION, ( + "If you want to add the country information, you must also add the region information" + ) + + ## convert contry and region name to arabic if the language is arabic + if ARABIC: + country = en_ar_countries_regions[country] + region = en_ar_countries_regions[region] + + if COUNTRY and REGION: + cur_prompt = REGION_COUNTRY_PROMPT_AR if ARABIC else REGION_COUNTRY_PROMPT + doc_text = cur_prompt.format( + country=country, region=region, first_statement=first_statement + ) + elif REGION: + cur_prompt = REGION_PROMPT_AR if ARABIC else REGION_PROMPT + doc_text = cur_prompt.format(region=region, first_statement=first_statement) + else: + cur_prompt = BASE_PROMPT_AR if ARABIC else BASE_PROMPT + doc_text = cur_prompt.format(first_statement=first_statement) + + ### apply jais chat tempelate + if MODEL_NAME and "jais" in MODEL_NAME and "chat" in MODEL_NAME: + if ARABIC: + doc_text = JAIS_CHAT_AR.format(question=doc_text) + else: + doc_text = JAIS_CHAT_EN.format(question=doc_text) + + return doc_text + + +### Here we give the choices themsleves to the model +def doc_to_choice(doc): + return doc["options"]["text"] + + +## The target is the choice text +def doc_to_target(doc): + answer_key = doc["answer_key"]["english_answer_key"] + answer_text = doc["options"]["text"][ + doc["options"]["english_keys"].index(answer_key) + ] + answer_text = answer_text.strip() + return answer_text diff --git a/lm_eval/tasks/bbq/utils.py b/lm_eval/tasks/bbq/utils.py index 8ec6b01b6dda2e5223bc67cbdb4ddd748b3d5994..f00c7c6c00c3b9d24bf3501d4f2cd09b8dbcb70d 100644 --- a/lm_eval/tasks/bbq/utils.py +++ b/lm_eval/tasks/bbq/utils.py @@ -4,6 +4,9 @@ import datasets import numpy as np +if np.__version__ >= "2.0": + np.NaN = np.nan + # Possible 
unknown responses, copied from the HELM implementation UNKNOWN_RESPONSES = [ "Unknown", diff --git a/lm_eval/tasks/c4/README.md b/lm_eval/tasks/c4/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9ade56f60e32c50cc8733ed3073193aeaedaafd2 --- /dev/null +++ b/lm_eval/tasks/c4/README.md @@ -0,0 +1,51 @@ +# Colossal Clean Crawled Corpus(C4) + +### Paper + +[Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) + +A colossal, cleaned version of Common Crawl's web crawl corpus. Based on [Common Crawl dataset](https://commoncrawl.org). + +This is the processed version of Google's C4 dataset. + +[Homepage](https://huggingface.co/datasets/allenai/c4) + +### Citation + +```text +@misc{raffel2023exploringlimitstransferlearning, + title={Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer}, + author={Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu}, + year={2023}, + eprint={1910.10683}, + archivePrefix={arXiv}, + primaryClass={cs.LG}, + url={https://arxiv.org/abs/1910.10683}, +} +``` + +### Groups, Tags, and Tasks + +#### Groups + +* Not part of a group yet. + +#### Tasks + +* `c4`: measure perplexity on the C4 dataset, via rolling loglikelihoods. + +### Checklist + +For adding novel benchmarks/datasets to the library: + +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + +If other tasks on this dataset are already supported: + +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? 
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? + +### Changelog diff --git a/lm_eval/tasks/c4/c4.yaml b/lm_eval/tasks/c4/c4.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bdbd70c66e33e5a0ee51c8c59bc77bfda92468b8 --- /dev/null +++ b/lm_eval/tasks/c4/c4.yaml @@ -0,0 +1,24 @@ +task: c4 +dataset_path: allenai/c4 +dataset_name: en +output_type: loglikelihood_rolling +training_split: train +validation_split: validation +doc_to_text: "" +doc_to_target: !function preprocess_c4.c4_detokenizer +process_results: !function preprocess_c4.process_results +should_decontaminate: true +doc_to_decontamination_query: "{{page}}" +metric_list: + - metric: word_perplexity + - metric: byte_perplexity + - metric: bits_per_byte +metadata: + version: 0.0 +dataset_kwargs: + data_files: + train: en/c4-train.00000-of-01024.json.gz + validation: en/c4-validation.00000-of-00008.json.gz + # following the choice of https://arxiv.org/abs/2410.07461 + trust_remote_code: true + verification_mode: "no_checks" diff --git a/lm_eval/tasks/c4/preprocess_c4.py b/lm_eval/tasks/c4/preprocess_c4.py new file mode 100644 index 0000000000000000000000000000000000000000..5ab0d32f90bb1d2f0d7ab3231d925e8a46f67771 --- /dev/null +++ b/lm_eval/tasks/c4/preprocess_c4.py @@ -0,0 +1,48 @@ +import re + + +def c4_detokenizer(doc): + string = doc["text"] + # contractions + string = string.replace("s '", "s'") + string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) + # number separators + string = string.replace(" @-@ ", "-") + string = string.replace(" @,@ ", ",") + string = string.replace(" @.@ ", ".") + # punctuation + string = string.replace(" : ", ": ") + string = string.replace(" ; ", "; ") + string = string.replace(" . ", ". ") + string = string.replace(" ! ", "! ") + string = string.replace(" ? ", "? 
") + string = string.replace(" , ", ", ") + # double brackets + string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) + string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) + string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) + string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) + string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) + # miscellaneous + string = string.replace("= = = =", "====") + string = string.replace("= = =", "===") + string = string.replace("= =", "==") + string = string.replace(" " + chr(176) + " ", chr(176)) + string = string.replace(" \n", "\n") + string = string.replace("\n ", "\n") + string = string.replace(" N ", " 1 ") + string = string.replace(" 's", "'s") + + return string + + +def process_results(doc, results): + (loglikelihood,) = results + # IMPORTANT: wikitext counts number of words in *original doc before detokenization* + _words = len(re.split(r"\s+", doc["text"])) + _bytes = len(doc["text"].encode("utf-8")) + return { + "word_perplexity": (loglikelihood, _words), + "byte_perplexity": (loglikelihood, _bytes), + "bits_per_byte": (loglikelihood, _bytes), + } diff --git a/lm_eval/tasks/ceval/_generate_configs.py b/lm_eval/tasks/ceval/_generate_configs.py index 9050c75c0644f2dc8bfd800f2573b23c90988668..81cbcb1599e53ddd21260b579631730957f33eab 100644 --- a/lm_eval/tasks/ceval/_generate_configs.py +++ b/lm_eval/tasks/ceval/_generate_configs.py @@ -3,12 +3,14 @@ Take in a YAML, and output all other splits with this YAML """ import argparse +import logging import os import yaml from tqdm import tqdm -from lm_eval.utils import eval_logger + +eval_logger = logging.getLogger(__name__) SUBJECTS = { diff --git a/lm_eval/tasks/cmmlu/_generate_configs.py b/lm_eval/tasks/cmmlu/_generate_configs.py index f1b60e28b6b4654f1def5af0a45d59ba6711c2e9..74348ed40cc30d8c2d9c8c133a874f2ceb3f94da 100644 --- a/lm_eval/tasks/cmmlu/_generate_configs.py +++ b/lm_eval/tasks/cmmlu/_generate_configs.py @@ -3,12 +3,14 @@ Take in a YAML, and output 
all other splits with this YAML """ import argparse +import logging import os import yaml from tqdm import tqdm -from lm_eval.utils import eval_logger + +eval_logger = logging.getLogger(__name__) SUBJECTS = { diff --git a/lm_eval/tasks/csatqa/_generate_configs.py b/lm_eval/tasks/csatqa/_generate_configs.py index 567deeeac607c05a1a87dcae09bbd70f3fe71581..1ef34b8cf1e1a21ff511d6e1ef12d52da7781082 100644 --- a/lm_eval/tasks/csatqa/_generate_configs.py +++ b/lm_eval/tasks/csatqa/_generate_configs.py @@ -3,12 +3,14 @@ Take in a YAML, and output all other splits with this YAML """ import argparse +import logging import os import yaml from tqdm import tqdm -from lm_eval.logger import eval_logger + +eval_logger = logging.getLogger(__name__) SUBSETS = ["WR", "GR", "RCS", "RCSS", "RCH", "LI"] diff --git a/lm_eval/tasks/ifeval/instructions.py b/lm_eval/tasks/ifeval/instructions.py index 9a7bcce13b0f29b829f21dea14b8f7ce5baeaac1..8a23d9e79017a8817a81b184957844740d3be4fe 100644 --- a/lm_eval/tasks/ifeval/instructions.py +++ b/lm_eval/tasks/ifeval/instructions.py @@ -740,7 +740,7 @@ class RephraseChecker(Instruction): class KeywordChecker(Instruction): - """Check the exisitence of certain keywords.""" + """Check the existence of certain keywords.""" def build_description(self, *, keywords=None): """Build the instruction description. @@ -1161,7 +1161,7 @@ class RephraseParagraph(Instruction): Args: original_paragraph: A string presenting the original paragraph. The - rephrases response should have betweeb low-high words in common. + rephrases response should have between low-high words in common. low: An integer presenting the lower bound of similar words. high: An integer representing the upper bound of similar words. 
diff --git a/lm_eval/tasks/kbl/bar_exam/civil/kbl_bar_exam_em_civil_2025.yaml b/lm_eval/tasks/kbl/bar_exam/civil/kbl_bar_exam_em_civil_2025.yaml new file mode 100644 index 0000000000000000000000000000000000000000..be366cc364e4d6b093968c50de55cdc2b9192219 --- /dev/null +++ b/lm_eval/tasks/kbl/bar_exam/civil/kbl_bar_exam_em_civil_2025.yaml @@ -0,0 +1,3 @@ +task: kbl_bar_exam_em_civil_2025 +dataset_name: bar_exam_civil_2025 +include: _base_em_yaml diff --git a/lm_eval/tasks/llama3/README.md b/lm_eval/tasks/llama3/README.md index 1b0b762b97faf3193ed1824043040bf7ade5791d..1c84ec5e44008dfce81c44171be0600ba81ffdd5 100644 --- a/lm_eval/tasks/llama3/README.md +++ b/lm_eval/tasks/llama3/README.md @@ -39,7 +39,6 @@ BibTeX-formatted citation goes here * `mmlu_hi_llama`: `Hindi version of generation MMLU` * `mmlu_es_llama`: `Spanish version of generation MMLU` * `mmlu_de_llama`: `German version of generation MMLU` -* `arc_chalenge_chat`: `generation variant of ARC-Challenge using MMLU format` * `arc_challenge_llama`: `generation variant of ARC-Challenge following Meta pre-processing` * `gsm8k_llama`: `Chain-of-though variant of GSM8k` diff --git a/lm_eval/tasks/longbench/2wikimqa.yaml b/lm_eval/tasks/longbench/2wikimqa.yaml index d41333d7b5d09382e5b4d90269007cc3956a80f7..d1d1791b6716253c300bcbb4701128a9961a38ee 100644 --- a/lm_eval/tasks/longbench/2wikimqa.yaml +++ b/lm_eval/tasks/longbench/2wikimqa.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: 2wikimqa doc_to_text: 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_qa_f1_score generation_kwargs: max_gen_toks: 32 temperature: 1 do_sample: True until: [] metric_list: - - metric: !function metrics.qa_f1_score + - metric: "qa_f1_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/2wikimqa_e.yaml b/lm_eval/tasks/longbench/2wikimqa_e.yaml index 3aaf35d6f690d8e21cc5d774065a8e9dcdb8f8d5..e9b5bf195f621986ddf9de02c3fb46fe68d5d17e 100644 --- a/lm_eval/tasks/longbench/2wikimqa_e.yaml +++ b/lm_eval/tasks/longbench/2wikimqa_e.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: 2wikimqa_e doc_to_text: 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_qa_f1_score generation_kwargs: max_gen_toks: 32 temperature: 1 do_sample: True until: [] metric_list: - - metric: !function metrics.qa_f1_score + - metric: "qa_f1_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/README.md b/lm_eval/tasks/longbench/README.md index 8b679647ce3461aae47be0ef1426b3cbc6cdb9e6..bef2dfc13965fc9967b7d17b1c9840d2b7e47d46 100644 --- a/lm_eval/tasks/longbench/README.md +++ b/lm_eval/tasks/longbench/README.md @@ -1,10 +1,10 @@ -# Task-name +# LongBench ### Paper -Title: `LongBench v2: Towards Deeper Understanding and Reasoning on Realistic Long-context Multitasks` +Title: `LongBench: A Bilingual, Multitask Benchmark for Long Context Understanding` -Abstract: `This paper introduces LongBench v2, a benchmark designed to assess the ability of LLMs to handle long-context problems requiring deep understanding and reasoning across real-world multitasks. LongBench v2 consists of 503 challenging multiple-choice questions, with contexts ranging from 8k to 2M words, across six major task categories: single-document QA, multi-document QA, long in-context learning, long-dialogue history understanding, code repository understanding, and long structured data understanding.` +Abstract: `In this paper, we introduce LongBench, the first bilingual, multi-task benchmark for long context understanding, enabling a more rigorous evaluation of long context understanding. LongBench comprises 21 datasets across 6 task categories in both English and Chinese, with an average length of 6,711 words (English) and 13,386 characters (Chinese). These tasks cover key long-text application areas including single-doc QA, multi-doc QA, summarization, few-shot learning, synthetic tasks, and code completion. 
All datasets in LongBench are standardized into a unified format, allowing for effortless automatic evaluation of LLMs` Homepage: `https://github.com/THUDM/LongBench` @@ -12,12 +12,6 @@ Homepage: `https://github.com/THUDM/LongBench` ### Citation ``` -@article{bai2024longbench2, - title={LongBench v2: Towards Deeper Understanding and Reasoning on Realistic Long-context Multitasks}, - author={Yushi Bai and Shangqing Tu and Jiajie Zhang and Hao Peng and Xiaozhi Wang and Xin Lv and Shulin Cao and Jiazheng Xu and Lei Hou and Yuxiao Dong and Jie Tang and Juanzi Li}, - journal={arXiv preprint arXiv:2412.15204}, - year={2024} -} @inproceedings{bai2024longbench, title = "{L}ong{B}ench: A Bilingual, Multitask Benchmark for Long Context Understanding", author = "Bai, Yushi and Lv, Xin and Zhang, Jiajie and Lyu, Hongchang and @@ -32,6 +26,17 @@ Homepage: `https://github.com/THUDM/LongBench` pages = "3119--3137", } ``` +### Notes + +#### Tasks without Chat Template (with add_bos_token=True but model dependent) + +The original implementation suggests not to use `chat_template` for these tasks (for instruct models): +- longbench_lcc +- longbench_repobench-p +- longbench_samsum +- longbench_trec +- longbench_triviaqa + ### Groups, Tags, and Tasks @@ -96,3 +101,4 @@ If other tasks on this dataset are already supported: ### Changelog v2.: fix doc_to_target; add vcsum +v3: properly use all answers for metric calculation; trim whitespace from resps; fix stop sequences not parsing correctly. 
diff --git a/lm_eval/tasks/longbench/_generate_config.py b/lm_eval/tasks/longbench/_generate_config.py index 11c2c0f1c0b901f2d1f65c40ae724ba5389b6098..2f2026c0c759ab92e7fcbd74d56686a2a945d14b 100644 --- a/lm_eval/tasks/longbench/_generate_config.py +++ b/lm_eval/tasks/longbench/_generate_config.py @@ -142,7 +142,6 @@ def parse_args(): return parser.parse_args() -# Create template string template_str = """ tag: - {{ tag[0] }} @@ -152,11 +151,12 @@ test_split: {{ test_split }} dataset_name: {{ dataset_name }} doc_to_text: '{{ doc_to_text }}' doc_to_target: '{{ doc_to_target }}' +process_results: {{ process_results }} generation_kwargs: max_gen_toks: {{ generation_kwargs.max_gen_toks }} temperature: {{ generation_kwargs.temperature }} do_sample: {{ generation_kwargs.do_sample }} - until: {{ generation_kwargs.until }} + until: {% if has_newline %}["\\n"]{% else %}[]{% endif %} metric_list: - metric: {{ metric_list[0].metric }} aggregation: {{ metric_list[0].aggregation }} @@ -173,21 +173,17 @@ if __name__ == "__main__": for ds in DATASETS: df = ds[:-2] if ds.endswith("_e") else ds # from https://github.com/THUDM/LongBench/blob/2e00731f8d0bff23dc4325161044d0ed8af94c1e/LongBench/eval.py#L52C25-L52C29 - if df in ["trec", "triviaqa", "samsum", "lsht"] + [ - "trec_e", - "triviaqa_e", - "samsum_e", - "lsht_e", - ]: - until = ["\n"] - else: - until = [] + + # Now we just set a boolean flag to indicate whether we need a newline + has_newline = df in ["trec", "triviaqa", "samsum", "lsht"] + generation_kwargs = { "max_gen_toks": dataset2maxlen[df], "temperature": 1, "do_sample": True, - "until": until, + # We'll handle the until value directly in the template } + raw_doc_to_text = ( dataset2prompt[df] .replace("\n", "\\n") @@ -196,25 +192,25 @@ if __name__ == "__main__": ) metric_list = [ { - "metric": f"!function metrics.{dataset2metric[df]}", + "metric": f'"{dataset2metric[df]}"', "aggregation": "mean", "higher_is_better": True, } ] data = { - "tag": [ - "longbench_e" if 
ds.endswith("_e") else "longbench" - ], # Now properly as a list + "tag": ["longbench_e" if ds.endswith("_e") else "longbench"], "task": f"longbench_{ds}", "dataset_path": "THUDM/LongBench", "test_split": "test", "dataset_name": ds, "doc_to_text": raw_doc_to_text, - "doc_to_target": "{{answers[0]}}", + "doc_to_target": "{{answers}}", + "process_results": f"!function metrics.get_{dataset2metric[df]}", "generation_kwargs": generation_kwargs, + "has_newline": has_newline, # Add the flag to the template context "metric_list": metric_list, - "metadata": {"version": "2.0"}, + "metadata": {"version": "3.0"}, } # Render template diff --git a/lm_eval/tasks/longbench/dureader.yaml b/lm_eval/tasks/longbench/dureader.yaml index c2f404a7a1ab4f63a94bac07e74ce0d7ac0e04b1..e001f349e4b7750c1ba91281447161c247c7825b 100644 --- a/lm_eval/tasks/longbench/dureader.yaml +++ b/lm_eval/tasks/longbench/dureader.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: dureader doc_to_text: '请基于给定的文章回答下述问题。\n\n文章:{{context}}\n\n请基于上述文章回答下面的问题。\n\n问题:{{input}}\n回答:' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_rouge_zh_score generation_kwargs: max_gen_toks: 128 temperature: 1 do_sample: True until: [] metric_list: - - metric: !function metrics.rouge_zh_score + - metric: "rouge_zh_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/gov_report.yaml b/lm_eval/tasks/longbench/gov_report.yaml index 59920be8401fe4a6c75875dd35e84365a1e55847..76307371574948b03daa548142a4eb5fc5957c39 100644 --- a/lm_eval/tasks/longbench/gov_report.yaml +++ b/lm_eval/tasks/longbench/gov_report.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: gov_report doc_to_text: 'You are given a report by a government agency. 
Write a one-page summary of the report.\n\nReport:\n{{context}}\n\nNow, write a one-page summary of the report.\n\nSummary:' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_rouge_score generation_kwargs: max_gen_toks: 512 temperature: 1 do_sample: True until: [] metric_list: - - metric: !function metrics.rouge_score + - metric: "rouge_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/gov_report_e.yaml b/lm_eval/tasks/longbench/gov_report_e.yaml index 82617d3878d6905aff271edd32750ff50377e37a..94f013ba2e108503f3bb74fcfd81b48f604e3180 100644 --- a/lm_eval/tasks/longbench/gov_report_e.yaml +++ b/lm_eval/tasks/longbench/gov_report_e.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: gov_report_e doc_to_text: 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{{context}}\n\nNow, write a one-page summary of the report.\n\nSummary:' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_rouge_score generation_kwargs: max_gen_toks: 512 temperature: 1 do_sample: True until: [] metric_list: - - metric: !function metrics.rouge_score + - metric: "rouge_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/hotpotqa.yaml b/lm_eval/tasks/longbench/hotpotqa.yaml index 4545e98bc52bd6004e2253f85fecde2672c9c1b3..5c567a33b690616cebf39118b524122eddf8ed27 100644 --- a/lm_eval/tasks/longbench/hotpotqa.yaml +++ b/lm_eval/tasks/longbench/hotpotqa.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: hotpotqa doc_to_text: 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_qa_f1_score generation_kwargs: max_gen_toks: 32 temperature: 1 do_sample: True until: [] metric_list: - - metric: !function metrics.qa_f1_score + - metric: "qa_f1_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/hotpotqa_e.yaml b/lm_eval/tasks/longbench/hotpotqa_e.yaml index 1a28f7363342d0161a641fd13becf152deae2313..eff29cec394b59e402646d045f7d301006fddcfd 100644 --- a/lm_eval/tasks/longbench/hotpotqa_e.yaml +++ b/lm_eval/tasks/longbench/hotpotqa_e.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: hotpotqa_e doc_to_text: 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_qa_f1_score generation_kwargs: max_gen_toks: 32 temperature: 1 do_sample: True until: [] metric_list: - - metric: !function metrics.qa_f1_score + - metric: "qa_f1_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/lcc.yaml b/lm_eval/tasks/longbench/lcc.yaml index 058910bb61dd8e9023caf3b5e12cd56404b2c84e..2129267d8e47f66277b0e5916675fd5426c20946 100644 --- a/lm_eval/tasks/longbench/lcc.yaml +++ b/lm_eval/tasks/longbench/lcc.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: lcc doc_to_text: 'Please complete the code given below. 
\n{{context}}Next line of code:\n' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_code_sim_score generation_kwargs: max_gen_toks: 64 temperature: 1 do_sample: True until: [] metric_list: - - metric: !function metrics.code_sim_score + - metric: "code_sim_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/lcc_e.yaml b/lm_eval/tasks/longbench/lcc_e.yaml index 39bd75cd8add775f9e28c410ace5756428badc45..74e673a94a26a6f167cebf8698f6ee958243841d 100644 --- a/lm_eval/tasks/longbench/lcc_e.yaml +++ b/lm_eval/tasks/longbench/lcc_e.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: lcc_e doc_to_text: 'Please complete the code given below. \n{{context}}Next line of code:\n' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_code_sim_score generation_kwargs: max_gen_toks: 64 temperature: 1 do_sample: True until: [] metric_list: - - metric: !function metrics.code_sim_score + - metric: "code_sim_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/lsht.yaml b/lm_eval/tasks/longbench/lsht.yaml index c2d6c01096ecb6c6aafe49efe0ca7884b2b55b23..4343413b62882a2d2275a7ca29455bf149ace547 100644 --- a/lm_eval/tasks/longbench/lsht.yaml +++ b/lm_eval/tasks/longbench/lsht.yaml @@ -6,16 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: lsht doc_to_text: '请判断给定新闻的类别,下面是一些例子。\n\n{{context}}\n{{input}}' -doc_to_target: '{{answers[0]}}' -process_results: !function metrics.classification_score +doc_to_target: '{{answers}}' +process_results: !function metrics.get_classification_score generation_kwargs: max_gen_toks: 64 temperature: 1 do_sample: True - until: ['\n'] + until: ["\n"] metric_list: - metric: "classification_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + 
version: 3.0 diff --git a/lm_eval/tasks/longbench/metrics.py b/lm_eval/tasks/longbench/metrics.py index ac24b2f029783cecbfd5b81eb281c5ba8b38e1c2..79aab279b7f6eeea009ad22ab40e24c66b04fb78 100644 --- a/lm_eval/tasks/longbench/metrics.py +++ b/lm_eval/tasks/longbench/metrics.py @@ -23,6 +23,7 @@ import re import string from collections import Counter +from typing import Union try: import jieba @@ -33,7 +34,7 @@ except ImportError: 'Please install the required dependencies for this task with `pip install lm_eval["longbench"] or `pip install jieba fuzzywuzzy rouge`' ) -# taken from https://github.com/THUDM/LongBench +# taken and slightly modified from https://github.com/THUDM/LongBench def normalize_answer(s: str) -> str: @@ -72,8 +73,7 @@ def normalize_zh_answer(s: str) -> str: return white_space_fix(remove_punc(lower(s))) -def count_score(predictions: list[str], references: list[str], **kwargs) -> float: - prediction, ground_truth = predictions[0], references[0] +def count_score(prediction: str, ground_truth: str, **kwargs): numbers = re.findall(r"\d+", prediction) right_num = 0 for number in numbers: @@ -83,8 +83,16 @@ def count_score(predictions: list[str], references: list[str], **kwargs) -> floa return float(final_score) -def retrieval_score(predictions: list[str], references: list[str], **kwargs) -> float: - prediction, ground_truth = predictions[0], references[0] +def get_count_score(doc: dict, results: list[str], **kwargs): + output = 0.0 + prediction = results[0].strip() + for ground_truth in doc["answers"]: + score = count_score(prediction, ground_truth) + output = max(score, output) + return {"count_score": output} + + +def retrieval_score(prediction: str, ground_truth: str, **kwargs): pattern = r"Paragraph (\d+)" matches = re.findall(pattern, ground_truth) ground_truth_id = matches[0] @@ -97,10 +105,16 @@ def retrieval_score(predictions: list[str], references: list[str], **kwargs) -> return float(final_score) -def retrieval_zh_score( - predictions: 
list[str], references: list[str], **kwargs -) -> float: - prediction, ground_truth = predictions[0], references[0] +def get_retrieval_score(doc: dict, results: list[str], **kwargs): + output = 0.0 + prediction = results[0].strip() + for ground_truth in doc["answers"]: + score = retrieval_score(prediction, ground_truth) + output = max(score, output) + return {"retrieval_score": output} + + +def retrieval_zh_score(prediction: str, ground_truth: str, **kwargs): pattern = r"段落(\d+)" matches = re.findall(pattern, ground_truth) ground_truth_id = matches[0] @@ -113,8 +127,16 @@ def retrieval_zh_score( return float(final_score) -def code_sim_score(predictions: list[str], references: list[str], **kwargs) -> float: - prediction, ground_truth = predictions[0], references[0] +def get_retrieval_zh_score(doc: dict, results: list[str], **kwargs): + output = 0.0 + prediction = results[0].strip() + for ground_truth in doc["answers"]: + score = retrieval_zh_score(prediction, ground_truth) + output = max(score, output) + return {"retrieval_zh_score": output} + + +def code_sim_score(prediction: str, ground_truth: str, **kwargs): all_lines = prediction.lstrip("\n").split("\n") prediction = "" for line in all_lines: @@ -124,10 +146,18 @@ def code_sim_score(predictions: list[str], references: list[str], **kwargs) -> f return fuzz.ratio(prediction, ground_truth) / 100 -def classification_score(doc: dict, results: list[str], **kwargs) -> dict: - prediction, ground_truth = results[0], doc["answers"][0] +def get_code_sim_score(doc: dict, results: list[str], **kwargs): + output = 0.0 + prediction = results[0] ## important! do not strip the prediction! 
+ for ground_truth in doc["answers"]: + score = code_sim_score(prediction, ground_truth) + output = max(score, output) + return {"code_sim_score": output} + + +def classification_score(prediction: str, ground_truth: str, **kwargs): em_match_list = [] - all_classes = doc["all_classes"] + all_classes = kwargs["all_classes"] for class_name in all_classes: if class_name in prediction: em_match_list.append(class_name) @@ -138,35 +168,58 @@ def classification_score(doc: dict, results: list[str], **kwargs) -> dict: score = 1.0 / len(em_match_list) else: score = 0.0 - return {"classification_score": score} + return score + + +def get_classification_score(doc: dict, results: list[str]) -> dict: + output = 0.0 + prediction = results[0].strip() + for ground_truth in doc["answers"]: + score = classification_score( + prediction, ground_truth, all_classes=doc["all_classes"] + ) + output = max(score, output) + return {"classification_score": output} -def rouge_score(predictions: list[str], references: list[str], **kwargs) -> float: +def rouge_score(predictions: str, ground_truth: str, **kwargs) -> float: global rouge if "rouge" not in globals(): rouge = Rouge() - prediction, ground_truth = predictions[0], references[0] try: - scores = rouge.get_scores([prediction], [ground_truth], avg=True) + scores = rouge.get_scores([predictions], [ground_truth], avg=True) # ruff: noqa except: return 0.0 return scores["rouge-l"]["f"] -def rouge_zh_score(predictions: list[str], references: list[str], **kwargs) -> float: - prediction, ground_truth = predictions[0], references[0] +def get_rouge_score(doc: dict, results: list[str], **kwargs): + output = 0.0 + prediction = results[0].strip() + for ground_truth in doc["answers"]: + score = rouge_score(prediction, ground_truth) + output = max(score, output) + return {"rouge_score": output} + + +def rouge_zh_score(prediction: str, ground_truth: str, **kwargs): prediction = " ".join(list(jieba.cut(prediction, cut_all=False))) ground_truth = " 
".join(list(jieba.cut(ground_truth, cut_all=False))) - score = rouge_score([prediction], [ground_truth]) + score = rouge_score(prediction, ground_truth) return score -def f1_score(predictions: list[str], references: list[str], **kwargs) -> float: - try: - prediction, ground_truth = predictions[0], references[0] - except: - return 0.0 +def get_rouge_zh_score(doc, results, **kwargs): + output = 0.0 + prediction = results[0].strip() + for ground_truth in doc["answers"]: + score = rouge_zh_score(prediction, ground_truth) + output = max(score, output) + return {"rouge_zh_score": output} + + +def f1_score(prediction: Union[str, list], ground_truth: Union[str, list], **kwargs): common = Counter(prediction) & Counter(ground_truth) num_same = sum(common.values()) if num_same == 0: @@ -177,22 +230,25 @@ def f1_score(predictions: list[str], references: list[str], **kwargs) -> float: return f1 -def qa_f1_score(predictions: list[str], references: list[str], **kwargs) -> float: - prediction, ground_truth = predictions[0], references[0] +def get_f1_score(doc: dict, results: list[str], **kwargs): + output = 0.0 + prediction = results[0].strip() + for ground_truth in doc["answers"]: + score = f1_score(prediction, ground_truth) + output = max(score, output) + return {"f1_score": output} + + +def qa_f1_score(prediction: str, ground_truth: str, **kwargs): normalized_prediction = normalize_answer(prediction) normalized_ground_truth = normalize_answer(ground_truth) prediction_tokens = normalized_prediction.split() ground_truth_tokens = normalized_ground_truth.split() - try: - res = f1_score(prediction_tokens, ground_truth_tokens) - except: - return 0.0 - return res + return f1_score(prediction_tokens, ground_truth_tokens) -def qa_f1_zh_score(predictions: list[str], references: list[str], **kwargs) -> float: - prediction, ground_truth = predictions[0], references[0] +def qa_f1_zh_score(prediction: str, ground_truth: str, **kwargs): prediction_tokens = list(jieba.cut(prediction, 
cut_all=False)) ground_truth_tokens = list(jieba.cut(ground_truth, cut_all=False)) prediction_tokens = [normalize_zh_answer(token) for token in prediction_tokens] @@ -200,3 +256,21 @@ def qa_f1_zh_score(predictions: list[str], references: list[str], **kwargs) -> f prediction_tokens = [token for token in prediction_tokens if len(token) > 0] ground_truth_tokens = [token for token in ground_truth_tokens if len(token) > 0] return f1_score(prediction_tokens, ground_truth_tokens) + + +def get_qa_f1_score(doc: dict, results: list[str], **kwargs): + output = 0.0 + prediction = results[0].strip() + for ground_truth in doc["answers"]: + score = qa_f1_score(prediction, ground_truth) + output = max(score, output) + return {"qa_f1_score": output} + + +def get_qa_f1_zh_score(doc: dict, results: list[str], **kwargs): + output = 0.0 + prediction = results[0].strip() + for ground_truth in doc["answers"]: + score = qa_f1_zh_score(prediction, ground_truth) + output = max(score, output) + return {"qa_f1_zh_score": output} diff --git a/lm_eval/tasks/longbench/multi_news.yaml b/lm_eval/tasks/longbench/multi_news.yaml index 7674a2ceb29be37bc6fe9d6805b62a5e3a9dbf8c..e1ae3f8cdea6191929f30ff89f27356595d1a643 100644 --- a/lm_eval/tasks/longbench/multi_news.yaml +++ b/lm_eval/tasks/longbench/multi_news.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: multi_news doc_to_text: 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{{context}}\n\nNow, write a one-page summary of all the news.\n\nSummary:' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_rouge_score generation_kwargs: max_gen_toks: 512 temperature: 1 do_sample: True until: [] metric_list: - - metric: !function metrics.rouge_score + - metric: "rouge_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/multi_news_e.yaml b/lm_eval/tasks/longbench/multi_news_e.yaml index f50401cd516a9f128b4380a1c665da4cb9612f16..62f4405360bda431126e4d6004b0445e5705e695 100644 --- a/lm_eval/tasks/longbench/multi_news_e.yaml +++ b/lm_eval/tasks/longbench/multi_news_e.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: multi_news_e doc_to_text: 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{{context}}\n\nNow, write a one-page summary of all the news.\n\nSummary:' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_rouge_score generation_kwargs: max_gen_toks: 512 temperature: 1 do_sample: True until: [] metric_list: - - metric: !function metrics.rouge_score + - metric: "rouge_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/multifieldqa_en.yaml b/lm_eval/tasks/longbench/multifieldqa_en.yaml index 8bc1c7ffcd7e8e167cd87c77c3cc20367d9597bb..e82b7c7e002469fa680b6bb69a6dd92acd1b9173 100644 --- a/lm_eval/tasks/longbench/multifieldqa_en.yaml +++ b/lm_eval/tasks/longbench/multifieldqa_en.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: multifieldqa_en doc_to_text: 'Read the following text and answer briefly.\n\n{{context}}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:' -doc_to_target: 
'{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_qa_f1_score generation_kwargs: max_gen_toks: 64 temperature: 1 do_sample: True until: [] metric_list: - - metric: !function metrics.qa_f1_score + - metric: "qa_f1_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/multifieldqa_en_e.yaml b/lm_eval/tasks/longbench/multifieldqa_en_e.yaml index b6d86111b5572dac93d267a07f3420f7dc0cc49e..5f64e97e97cdb37d922a5721698fdfc1fe3ffc2d 100644 --- a/lm_eval/tasks/longbench/multifieldqa_en_e.yaml +++ b/lm_eval/tasks/longbench/multifieldqa_en_e.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: multifieldqa_en_e doc_to_text: 'Read the following text and answer briefly.\n\n{{context}}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_qa_f1_score generation_kwargs: max_gen_toks: 64 temperature: 1 do_sample: True until: [] metric_list: - - metric: !function metrics.qa_f1_score + - metric: "qa_f1_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/multifieldqa_zh.yaml b/lm_eval/tasks/longbench/multifieldqa_zh.yaml index 9ff6db654e1e2431ac2bcf22e9728a967e53ea6b..4a6eb9ed5ca4662fd55348dc43be7ba2170bb348 100644 --- a/lm_eval/tasks/longbench/multifieldqa_zh.yaml +++ b/lm_eval/tasks/longbench/multifieldqa_zh.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: multifieldqa_zh doc_to_text: '阅读以下文字并用中文简短回答:\n\n{{context}}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{{input}}\n回答:' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_qa_f1_zh_score generation_kwargs: max_gen_toks: 64 temperature: 
1 do_sample: True until: [] metric_list: - - metric: !function metrics.qa_f1_zh_score + - metric: "qa_f1_zh_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/musique.yaml b/lm_eval/tasks/longbench/musique.yaml index 1af8afa4b3fb3b0b7b0166cbf864e86ef5f633b7..89c3a4488035c2d546c737447a69e78c0f4d4027 100644 --- a/lm_eval/tasks/longbench/musique.yaml +++ b/lm_eval/tasks/longbench/musique.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: musique doc_to_text: 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_qa_f1_score generation_kwargs: max_gen_toks: 32 temperature: 1 do_sample: True until: [] metric_list: - - metric: !function metrics.qa_f1_score + - metric: "qa_f1_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/narrativeqa.yaml b/lm_eval/tasks/longbench/narrativeqa.yaml index 1a54077fe9e0ea6aaea327346009a61371a8b643..82b92fe29f74f7c65d3ccb2ea44b21d1ea56ba56 100644 --- a/lm_eval/tasks/longbench/narrativeqa.yaml +++ b/lm_eval/tasks/longbench/narrativeqa.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: narrativeqa doc_to_text: 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question asconcisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {{context}}\n\nNow, answer the question based on the story asconcisely as you can, using a single phrase if possible. 
Do not provide any explanation.\n\nQuestion: {{input}}\n\nAnswer:' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_qa_f1_score generation_kwargs: max_gen_toks: 128 temperature: 1 do_sample: True until: [] metric_list: - - metric: !function metrics.qa_f1_score + - metric: "qa_f1_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/passage_count.yaml b/lm_eval/tasks/longbench/passage_count.yaml index d0685cc0e8c99c8975f8c8a7044a10ba54555ff8..a3160eaad3b1b6bbb2e449ec4669aa64dc3c0619 100644 --- a/lm_eval/tasks/longbench/passage_count.yaml +++ b/lm_eval/tasks/longbench/passage_count.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: passage_count doc_to_text: 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{{context}}\n\nPlease enter the final count of unique paragraphs after removing duplicates. 
The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_count_score generation_kwargs: max_gen_toks: 32 temperature: 1 do_sample: True until: [] metric_list: - - metric: !function metrics.count_score + - metric: "count_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/passage_count_e.yaml b/lm_eval/tasks/longbench/passage_count_e.yaml index d15dd727c359385a9d6b02bfde290916b00838e2..602ab400292ebbc7c0de101296a5e8ba7484d15b 100644 --- a/lm_eval/tasks/longbench/passage_count_e.yaml +++ b/lm_eval/tasks/longbench/passage_count_e.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: passage_count_e doc_to_text: 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{{context}}\n\nPlease enter the final count of unique paragraphs after removing duplicates. 
The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_count_score generation_kwargs: max_gen_toks: 32 temperature: 1 do_sample: True until: [] metric_list: - - metric: !function metrics.count_score + - metric: "count_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/passage_retrieval_en.yaml b/lm_eval/tasks/longbench/passage_retrieval_en.yaml index a7e521b525c966638c0a62d972c909b5c784e61f..b4e69378be49d39fabc2cce1b2d4be20dc417421 100644 --- a/lm_eval/tasks/longbench/passage_retrieval_en.yaml +++ b/lm_eval/tasks/longbench/passage_retrieval_en.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: passage_retrieval_en doc_to_text: 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{{context}}\n\nThe following is an abstract.\n\n{{input}}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_retrieval_score generation_kwargs: max_gen_toks: 32 temperature: 1 do_sample: True until: [] metric_list: - - metric: !function metrics.retrieval_score + - metric: "retrieval_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/passage_retrieval_en_e.yaml b/lm_eval/tasks/longbench/passage_retrieval_en_e.yaml index 1ca0b6080ee6b3587b0740d80b5222aacc2bee5e..198115489dd7be1508e2d2b47d95d01ee24dba32 100644 --- a/lm_eval/tasks/longbench/passage_retrieval_en_e.yaml +++ b/lm_eval/tasks/longbench/passage_retrieval_en_e.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: passage_retrieval_en_e doc_to_text: 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{{context}}\n\nThe following is an abstract.\n\n{{input}}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_retrieval_score generation_kwargs: max_gen_toks: 32 temperature: 1 do_sample: True until: [] metric_list: - - metric: !function metrics.retrieval_score + - metric: "retrieval_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/passage_retrieval_zh.yaml b/lm_eval/tasks/longbench/passage_retrieval_zh.yaml index 2556cc2fd566f98c6389113bf450423fa4180f95..36bf8295ae1919c1983c376873f6e31ef2428cf8 100644 --- a/lm_eval/tasks/longbench/passage_retrieval_zh.yaml +++ b/lm_eval/tasks/longbench/passage_retrieval_zh.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: passage_retrieval_zh doc_to_text: '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{{context}}\n\n下面是一个摘要\n\n{{input}}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_retrieval_zh_score generation_kwargs: max_gen_toks: 32 temperature: 1 do_sample: True until: [] metric_list: - - metric: !function metrics.retrieval_zh_score + - metric: "retrieval_zh_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/qasper.yaml b/lm_eval/tasks/longbench/qasper.yaml index 21dd8c58ad7dcb636f2cb1909c1e8364ff061299..44b40590028cf1d4141cb452a18742d0fbd0cf98 100644 --- a/lm_eval/tasks/longbench/qasper.yaml +++ b/lm_eval/tasks/longbench/qasper.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: qasper doc_to_text: 'You are given a scientific article and a question. Answer the question as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write "unanswerable". 
If the question is a yes/no question, answer "yes", "no", or "unanswerable". Do not provide any explanation.\n\nArticle: {{context}}\n\n Answer the question based on the above article as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write "unanswerable". If the question is a yes/no question, answer "yes", "no", or "unanswerable". Do not provide any explanation.\n\nQuestion: {{input}}\n\nAnswer:' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_qa_f1_score generation_kwargs: max_gen_toks: 128 temperature: 1 do_sample: True until: [] metric_list: - - metric: !function metrics.qa_f1_score + - metric: "qa_f1_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/qasper_e.yaml b/lm_eval/tasks/longbench/qasper_e.yaml index 986101f0808df7dc34cb33f822a6769d3fd7b6d4..e3808433cd179d53fe0b76574ce42763b4b4b5f8 100644 --- a/lm_eval/tasks/longbench/qasper_e.yaml +++ b/lm_eval/tasks/longbench/qasper_e.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: qasper_e doc_to_text: 'You are given a scientific article and a question. Answer the question as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write "unanswerable". If the question is a yes/no question, answer "yes", "no", or "unanswerable". Do not provide any explanation.\n\nArticle: {{context}}\n\n Answer the question based on the above article as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write "unanswerable". If the question is a yes/no question, answer "yes", "no", or "unanswerable". 
Do not provide any explanation.\n\nQuestion: {{input}}\n\nAnswer:' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_qa_f1_score generation_kwargs: max_gen_toks: 128 temperature: 1 do_sample: True until: [] metric_list: - - metric: !function metrics.qa_f1_score + - metric: "qa_f1_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/qmsum.yaml b/lm_eval/tasks/longbench/qmsum.yaml index 9c1d225e141c5973b7858c52a68cd1483243ad98..8c922985ccce781d1b95c8c6c6e25d79f6aab16b 100644 --- a/lm_eval/tasks/longbench/qmsum.yaml +++ b/lm_eval/tasks/longbench/qmsum.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: qmsum doc_to_text: 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{{context}}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {{input}}\nAnswer:' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_rouge_score generation_kwargs: max_gen_toks: 512 temperature: 1 do_sample: True until: [] metric_list: - - metric: !function metrics.rouge_score + - metric: "rouge_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/repobench-p.yaml b/lm_eval/tasks/longbench/repobench-p.yaml index 1e1af77ea58545f073f2929ff0da4f15df72aa68..8413e1e68a689657fdc4df92bea49636400b5716 100644 --- a/lm_eval/tasks/longbench/repobench-p.yaml +++ b/lm_eval/tasks/longbench/repobench-p.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: repobench-p doc_to_text: 'Please complete the code given below. 
\n{{context}}{{input}}Next line of code:\n' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_code_sim_score generation_kwargs: max_gen_toks: 64 temperature: 1 do_sample: True until: [] metric_list: - - metric: !function metrics.code_sim_score + - metric: "code_sim_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/repobench-p_e.yaml b/lm_eval/tasks/longbench/repobench-p_e.yaml index ee71b137cf88f7a1eadf09dd6d2e4e385db643dd..2c0a55e0854bd28dfde86d566f7c4def1775635c 100644 --- a/lm_eval/tasks/longbench/repobench-p_e.yaml +++ b/lm_eval/tasks/longbench/repobench-p_e.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: repobench-p_e doc_to_text: 'Please complete the code given below. \n{{context}}{{input}}Next line of code:\n' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_code_sim_score generation_kwargs: max_gen_toks: 64 temperature: 1 do_sample: True until: [] metric_list: - - metric: !function metrics.code_sim_score + - metric: "code_sim_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/samsum.yaml b/lm_eval/tasks/longbench/samsum.yaml index 102e90620f86aa11edec977bd8de8f62861d01d0..1e94d274745a9bb6f0fb7d4f174dde171a0b6438 100644 --- a/lm_eval/tasks/longbench/samsum.yaml +++ b/lm_eval/tasks/longbench/samsum.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: samsum doc_to_text: 'Summarize the dialogue into a few short sentences. 
The following are some examples.\n\n{{context}}\n\n{{input}}' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_rouge_score generation_kwargs: max_gen_toks: 128 temperature: 1 do_sample: True - until: ['\n'] + until: ["\n"] metric_list: - - metric: !function metrics.rouge_score + - metric: "rouge_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/samsum_e.yaml b/lm_eval/tasks/longbench/samsum_e.yaml index 8d8864157984fa412250e3cc769605b3a1e90b37..9b3b1d5e3c9df352e522f3dba65c9753e73247fd 100644 --- a/lm_eval/tasks/longbench/samsum_e.yaml +++ b/lm_eval/tasks/longbench/samsum_e.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: samsum_e doc_to_text: 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{{context}}\n\n{{input}}' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_rouge_score generation_kwargs: max_gen_toks: 128 temperature: 1 do_sample: True - until: ['\n'] + until: ["\n"] metric_list: - - metric: !function metrics.rouge_score + - metric: "rouge_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/trec.yaml b/lm_eval/tasks/longbench/trec.yaml index 00de0c2a7d5f24643f6b36d572f34bd41b09feec..525a1f4db2cfb4b125f83ecd75c339b8d0c47173 100644 --- a/lm_eval/tasks/longbench/trec.yaml +++ b/lm_eval/tasks/longbench/trec.yaml @@ -6,16 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: trec doc_to_text: 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{{context}}\n{{input}}' -doc_to_target: '{{answers[0]}}' -process_results: !function metrics.classification_score +doc_to_target: '{{answers}}' +process_results: !function metrics.get_classification_score generation_kwargs: max_gen_toks: 64 temperature: 1 do_sample: True - until: ['\n'] + until: ["\n"] metric_list: - metric: "classification_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/trec_e.yaml b/lm_eval/tasks/longbench/trec_e.yaml index 87ffa4c09f4b8ee9e4eb8aed7c8fadb715b1f739..ff6595b91e780913636325c27c700a14723f6cd4 100644 --- a/lm_eval/tasks/longbench/trec_e.yaml +++ b/lm_eval/tasks/longbench/trec_e.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: trec_e doc_to_text: 'Please determine the type of the question below. Here are some examples of questions.\n\n{{context}}\n{{input}}' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_classification_score generation_kwargs: max_gen_toks: 64 temperature: 1 do_sample: True - until: ['\n'] + until: ["\n"] metric_list: - - metric: !function metrics.classification_score + - metric: "classification_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/triviaqa.yaml b/lm_eval/tasks/longbench/triviaqa.yaml index 501c63ab4e1bd96435589cf5b0b5143c67d72044..d54cbab729fdb7874507940809d981b4eaca0ec7 100644 --- a/lm_eval/tasks/longbench/triviaqa.yaml +++ b/lm_eval/tasks/longbench/triviaqa.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: triviaqa doc_to_text: 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{{context}}\n\n{{input}}' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_qa_f1_score generation_kwargs: max_gen_toks: 32 temperature: 1 do_sample: True - until: ['\n'] + until: ["\n"] metric_list: - - metric: !function metrics.qa_f1_score + - metric: "qa_f1_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/triviaqa_e.yaml b/lm_eval/tasks/longbench/triviaqa_e.yaml index b475efe5b0b4a58b38003d24604ba217a46caafc..ceac823fec264712db105fe4551f068e4b8fe16c 100644 --- a/lm_eval/tasks/longbench/triviaqa_e.yaml +++ b/lm_eval/tasks/longbench/triviaqa_e.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: triviaqa_e doc_to_text: 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{{context}}\n\n{{input}}' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_qa_f1_score generation_kwargs: max_gen_toks: 32 temperature: 1 do_sample: True - until: ['\n'] + until: ["\n"] metric_list: - - metric: !function metrics.qa_f1_score + - metric: "qa_f1_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/longbench/utils.py b/lm_eval/tasks/longbench/utils.py index a9fe0d7dacb82bf1cac484f79075894faf796567..98580e394e321cbddf01a89924eace96be728d0f 100644 --- a/lm_eval/tasks/longbench/utils.py +++ b/lm_eval/tasks/longbench/utils.py @@ -4,7 +4,7 @@ import os import numpy as np from metrics import ( - classification_score, + # classification_score, code_sim_score, count_score, qa_f1_score, @@ -29,10 +29,10 @@ dataset2metric = { "qmsum": rouge_score, "multi_news": rouge_score, "vcsum": rouge_zh_score, - "trec": classification_score, + # "trec": classification_score, "triviaqa": qa_f1_score, "samsum": 
rouge_score, - "lsht": classification_score, + # "lsht": classification_score, "passage_retrieval_en": retrieval_score, "passage_count": count_score, "passage_retrieval_zh": retrieval_zh_score, diff --git a/lm_eval/tasks/longbench/vcsum.yaml b/lm_eval/tasks/longbench/vcsum.yaml index c642954d1e1a8127d83509743f150ede1bbb6e42..ba590f5bcec1ebd1c3f1f5e8f448e3d3e8c7876a 100644 --- a/lm_eval/tasks/longbench/vcsum.yaml +++ b/lm_eval/tasks/longbench/vcsum.yaml @@ -6,15 +6,16 @@ dataset_path: THUDM/LongBench test_split: test dataset_name: vcsum doc_to_text: '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{{context}}\n\n会议总结:' -doc_to_target: '{{answers[0]}}' +doc_to_target: '{{answers}}' +process_results: !function metrics.get_rouge_zh_score generation_kwargs: max_gen_toks: 512 temperature: 1 do_sample: True until: [] metric_list: - - metric: !function metrics.rouge_zh_score + - metric: "rouge_zh_score" aggregation: mean higher_is_better: True metadata: - version: 2.0 + version: 3.0 diff --git a/lm_eval/tasks/mbpp/mbpp_instruct.yaml b/lm_eval/tasks/mbpp/mbpp_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f2948075a7b0e8299f8f8e411865394c0ce8fea6 --- /dev/null +++ b/lm_eval/tasks/mbpp/mbpp_instruct.yaml @@ -0,0 +1,29 @@ +task: mbpp_instruct +dataset_path: google-research-datasets/mbpp +dataset_name: full +unsafe_code: true +output_type: generate_until +test_split: test +doc_to_text: "You are an expert Python programmer, and here is your task:\n{{text}}\nYour code should pass these tests:\n{{test_list[0]}}\n{{test_list[1]}}\n{{test_list[2]}}" +doc_to_target: "{% if is_fewshot is defined %}{{code}}\n```{% else %}{{test_list[0]}}\n{{test_list[1]}}\n{{test_list[2]}}{% endif %}" +gen_prefix: "\n```python\n" +target_delimiter: "" +metric_list: + - metric: !function utils.pass_at_1 + aggregation: mean + higher_is_better: true +filter_list: + - name: "extract_code" + filter: + - function: "custom" + filter_fn: !function utils.build_predictions 
+generation_kwargs: + max_gen_toks: 256 + until: [] + do_sample: false +num_fewshot: 3 +fewshot_config: + sampler: first_n + samples: !function utils.list_fewshot_samples +metadata: + version: 1.0 diff --git a/lm_eval/tasks/mbpp/mbpp_plus_instruct.yaml b/lm_eval/tasks/mbpp/mbpp_plus_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7d12da0f83b34f21505b8d1830268f1989901aba --- /dev/null +++ b/lm_eval/tasks/mbpp/mbpp_plus_instruct.yaml @@ -0,0 +1,12 @@ +include: mbpp_instruct.yaml +task: mbpp_plus_instruct +dataset_path: evalplus/mbppplus +dataset_name: null +doc_to_text: "{{prompt if prompt is defined else text}} Your code should satisfy the following assertion:\n{{test_list[0]}}" +doc_to_target: "{{test_list[0]}}" +gen_prefix: "Here is a solution to this programming problem:\n```python\n" +num_fewshot: 0 +generation_kwargs: + max_gen_toks: 1024 + until: [] + do_sample: false diff --git a/lm_eval/tasks/mbpp/utils.py b/lm_eval/tasks/mbpp/utils.py index 2d94b51275be531b311a0b2b9285ea876fe7682a..c205a2320f5b88b21465b7667ea609004f0acb33 100644 --- a/lm_eval/tasks/mbpp/utils.py +++ b/lm_eval/tasks/mbpp/utils.py @@ -1,3 +1,6 @@ +import re +from typing import Union + import evaluate as hf_evaluate @@ -12,14 +15,39 @@ except Exception as e: raise e -def pass_at_1(references, predictions): +def pass_at_1( + references: Union[str, list[str]], predictions: Union[str, list[list[str]]] +) -> float: + if isinstance(references, str): + references = [references] + if isinstance(predictions[0], str): + predictions = [[p] for p in predictions] return pass_at_k.compute( references=references, - predictions=[predictions], + predictions=predictions, k=[1], )[0]["pass@1"] +def extract_code_blocks(text: str) -> str: + # Pattern to match ```...``` blocks + pattern = r"```(?:\w+)?\n?(.*?)\n?```" + # (+ ```) as we add the opening "```python" to the gen_prefix + matches = re.findall(pattern, r"```" + text, re.DOTALL) + # if no matches, try to match ```...``` 
blocks (after removing the language) + if not matches: + text_without_lang = re.sub(r"```python", "```", text) + matches = re.findall(pattern, text_without_lang, re.DOTALL) + if not matches: + return "" + else: + return matches[0] + + +def build_predictions(resps: list[list[str]], docs: list[dict]) -> list[list[str]]: + return [[extract_code_blocks(r) for r in resp] for resp in resps] + + def list_fewshot_samples(): return [ { diff --git a/lm_eval/tasks/mmlu/README.md b/lm_eval/tasks/mmlu/README.md index a3425d517654a6b93e03ee1bb681e07de18c4016..5924a1d2a8271cf40410faba8ba84b03728fb9c3 100644 --- a/lm_eval/tasks/mmlu/README.md +++ b/lm_eval/tasks/mmlu/README.md @@ -36,11 +36,11 @@ Note: The `Flan` variants are derived from [here](https://github.com/jasonwei20/ * `mmlu`: `Original multiple-choice MMLU benchmark` * `mmlu_continuation`: `MMLU but with continuation prompts` -* `mmlu_generation`: `MMLU generation` +* `mmlu_generative`: `MMLU generation` MMLU is the original benchmark as implemented by Hendrycks et al. with the choices in context and the answer letters (e.g `A`, `B`, `C`, `D`) in the continuation. `mmlu_continuation` is a cloze-style variant without the choices in context and the full answer choice in the continuation. -`mmlu_generation` is a generation variant, similar to the original but the LLM is asked to generate the correct answer letter. +`mmlu_generative` is a generation variant, similar to the original but the LLM is asked to generate the correct answer letter. 
#### Subgroups diff --git a/lm_eval/tasks/mmlu_pro/README.md b/lm_eval/tasks/mmlu_pro/README.md index 7ffa99a36cb3ec281506064ecd3fd55feec37be4..e28b0527aecda0cd833c5c40588b42d0682d404c 100644 --- a/lm_eval/tasks/mmlu_pro/README.md +++ b/lm_eval/tasks/mmlu_pro/README.md @@ -64,3 +64,5 @@ If other tasks on this dataset are already supported: * Added one newline to task description(s) as per [reference implementation](https://github.com/TIGER-AI-Lab/MMLU-Pro/blob/47b9891aacb8bd7cda29d5c5ba17b9434dd333bc/evaluate_from_local.py#L93) * (tasks, group) 2025-03-20 -- (version 2.0 --> version 2.1) * Changed default max_length from 2048 to 8192 and max_gen_toks from 256 to 2048. +* (tasks, group) 2025-05-20 -- (version 2.1 --> version 3) + * changed stop sequence from "Q:" to "Question:" PR #2945 diff --git a/lm_eval/tasks/mmlu_pro/_default_template_yaml b/lm_eval/tasks/mmlu_pro/_default_template_yaml index d678c04dbf40f5867b4ef0ee4ee72bbf0a04304c..d59d03a3bb09437644922c8345452539493a0ae9 100644 --- a/lm_eval/tasks/mmlu_pro/_default_template_yaml +++ b/lm_eval/tasks/mmlu_pro/_default_template_yaml @@ -17,9 +17,7 @@ filter_list: - function: "take_first" generation_kwargs: until: - - "" - - "Q:" - - "<|im_end|>" + - "Question:" max_gen_toks: 2048 do_sample: false temperature: 0.0 diff --git a/lm_eval/tasks/mmlu_pro/utils.py b/lm_eval/tasks/mmlu_pro/utils.py index 03117be5f165fd7edf40404bf9934b3753039f1d..ca254a29a271c2fdf27781b897d16af9a61afc66 100644 --- a/lm_eval/tasks/mmlu_pro/utils.py +++ b/lm_eval/tasks/mmlu_pro/utils.py @@ -1,24 +1,7 @@ from functools import partial -choices = [ - "A", - "B", - "C", - "D", - "E", - "F", - "G", - "H", - "I", - "J", - "K", - "L", - "M", - "N", - "O", - "P", -] +choices = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"] def format_cot_example(example, including_answer=True): @@ -27,8 +10,12 @@ def format_cot_example(example, including_answer=True): options = example["options"] prompt += question + "\n" prompt += "Options:\n" + for i, opt 
in enumerate(options): + if i >= len(choices): + break prompt += "{}. {}\n".format(choices[i], opt) + if including_answer: cot_content = example["cot_content"].replace( "A: Let's think step by step.", "Answer: Let's think step by step." @@ -36,6 +23,7 @@ def format_cot_example(example, including_answer=True): prompt += cot_content + "\n\n" else: prompt += "Answer: Let's think step by step." + return prompt diff --git a/lm_eval/tasks/noreval/README.md b/lm_eval/tasks/noreval/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b73385779c4c14ffcbc578a131218f875df53e08 --- /dev/null +++ b/lm_eval/tasks/noreval/README.md @@ -0,0 +1,84 @@ +# 🇳🇴 NorEval + +### Paper + +* Title: `NorEval: A Norwegian Language Understanding and Generation Evaluation Benchmark` +* Abstract: [arxiv.org/abs/2504.07749](https://arxiv.org/abs/2504.07749) +* Homepage: [github.com/ltgoslo/noreval](https://github.com/ltgoslo/noreval/tree/main) + +![noreval](noreval.jpg) + +**Overview of the NorEval design.** 😼 denotes datasets used in [NorBench](https://aclanthology.org/2023.nodalida-1.61/), [NLEBench](https://aclanthology.org/2024.emnlp-main.317/), [ScandEval](https://aclanthology.org/2023.nodalida-1.20/), and [SEB](https://proceedings.neurips.cc/paper_files/paper/2024/file/4746bb91bd073ec7eef930d5775122ba-Paper-Datasets_and_Benchmarks_Track.pdf); 🚀 represents datasets that have not been used in the existing Norwegian benchmarks; and 😎 denotes our novel datasets introduced as part of NorEval. EN=English; BM=Norwegian Bokmål; NN=Norwegian Nynorsk. + +🇳🇴 NorEval is a multi-task Norwegian language understanding and generation evaluation benchmark that combines 19 existing peer-reviewed datasets with five datasets created from scratch. 
NorEval covers nine diverse task categories: sentiment analysis, Norwegian language knowledge, Norwegian-specific \& world knowledge, machine reading comprehension, commonsense reasoning, machine translation, text summarization, instruction following, and truthfulness. Our main evaluation principles are: + +- 🌐 **Linguistic diversity**: support for both of the official written standards of Norwegian: Bokmål and Nynorsk (the minority variant). +- 📊 **Task diversity**: coverage of various least addressed tasks for Norwegian. In particular, only three out of 24 NorEval datasets are included in existing Norwegian benchmarks to date: [NorBench](https://aclanthology.org/2023.nodalida-1.61/), [NLEBench](https://aclanthology.org/2024.emnlp-main.317/), [ScandEval](https://aclanthology.org/2023.nodalida-1.20/), and [SEB](https://proceedings.neurips.cc/paper_files/paper/2024/file/4746bb91bd073ec7eef930d5775122ba-Paper-Datasets_and_Benchmarks_Track.pdf). +- 🧠 **Data quality**: focus on only peer-reviewed human-created datasets to ensure reliable evaluation in the context of the Norwegian language, culture, and values. +- 📏 **Prompt sensitivity**: evaluation across 100+ human-written prompts to account for the prompt sensitivity. +- 👩🏻‍🔬 **Standardized evaluation**: integration of NorEval into LM Evaluation Harness for flexible and reproducible evaluation. 
+ + +### Tasks + +|Name |Bokmål | Nynorsk |*k*-shot | Task type | Task category | +|:---|:---|:---|:---|:---|:---| +|[NoReC Sentence](https://huggingface.co/datasets/ltg/norec_sentence) |```norec_sentence``` | ❌ |✅ |Text classification| Sentiment analysis | +|[NoReC Document](https://huggingface.co/datasets/ltg/norec_document) |```norec_document``` | ❌ |✅ |Text classification| Sentiment analysis | +|[NCB](https://huggingface.co/datasets/hcfa/ncb) |```ncb```| ❌ | ❌ |Sentence ranking| Norwegian language knowledge | +|[NorIdiom](https://huggingface.co/datasets/Sprakbanken/Norwegian_idioms) |```noridiom_nob``` | ```noridiom_nno``` | ❌ |Sentence completion| Norwegian language knowledge | +|[Belebele](https://huggingface.co/datasets/facebook/belebele) |```norbelebele```| ❌|❌ |Multiple-choice question answering| Machine reading comprehension | +|[NRK-Quiz-QA](https://huggingface.co/datasets/ltg/nrk_quiz_qa) |```nrk_quiz_qa_nob```| ```nrk_quiz_qa_nno```| ❌ |Multiple-choice question answering| Norwegian-specific & world knowledge | +|[NorOpenBookQA](https://huggingface.co/datasets/ltg/noropenbookqa) |```noropenbookqa_nob```| ```noropenbookqa_nno``` |✅ |Multiple-choice question answering| Norwegian-specific & world knowledge | +|[NorCommonsenseQA](https://huggingface.co/datasets/ltg/norcommonsenseqa) |```norcommonsenseqa_nob```| ```norcommonsenseqa_nno``` |❌ |Multiple-choice question answering|Commonsense reasoning | +|[NorTruthfulQA Multiple choice](https://huggingface.co/datasets/ltg/nortruthfulqa_mc) |```nortruthfulqa_mc_nob```| ```nortruthfulqa_mc_nno``` |❌ |Multiple-choice question answering |Truthfulness | +|[NorQuAD](https://huggingface.co/datasets/ltg/norquad) |```norquad```| ❌ | ✅ |Generative question answering |Machine reading comprehension | +|[NorTruthfulQA Generation](https://huggingface.co/datasets/ltg/nortruthfulqa_gen) |```nortruthfulqa_gen_nob```| ```nortruthfulqa_gen_nno``` | ❌ | Generative question answering|Truthfulness | 
+|[ASK-GEC](https://huggingface.co/datasets/ltg/ask-gec) |```ask_gec```| ❌ |✅ |Sequence-to-sequence generation|Norwegian language knowledge | +|[NorSumm](https://huggingface.co/datasets/SamiaT/NorSumm) |```norsumm_nob``` | ```norsumm_nno``` |✅ |Sequence-to-sequence generation|Text summarization | +|[Tatoeba (English → Bokmål/Nynorsk)](https://huggingface.co/datasets/Helsinki-NLP/tatoeba_mt) | ```tatoeba_eng_nob```| ```tatoeba_eng_nno``` |✅ |Sequence-to-sequence generation|Machine translation | +|[Tatoeba (Bokmål/Nynorsk → English)](https://huggingface.co/datasets/Helsinki-NLP/tatoeba_mt) | ```tatoeba_nob_eng```| ```tatoeba_nno_eng``` |✅ |Sequence-to-sequence generation|Machine translation | +|[NorRewrite-Instruct](https://huggingface.co/datasets/ltg/norrewrite-instruct) |```norrewrite_instruct``` |❌ |❌ |Sequence-to-sequence generation|Instruction following| +|[NorSummarize-Instruct](https://huggingface.co/datasets/ltg/norsummarize-instruct) |```norsummarize_instruct``` |❌ |❌ |Sequence-to-sequence generation|Instruction following| + +
+Table description + +* **Name**: a dataset name with a HuggingFace link. +* **Bokmål**: the LM Evaluation Harness task name for the Norwegian Bokmål dataset. +* **Nynorsk**: the LM Evaluation Harness task name for the Norwegian Nynorsk dataset, if available. +* **k-shot**: the support for *k*-shot evaluation regimes with *k* > 0. We follow the original datasets' design and focus mainly on the zero-shot evaluation by default. + * ✅ means that the user can run the evaluation in both zero-shot and *k*-shot regimes. + * ❌ denotes that only the zero-shot evaluation regime is available due to the lack of the training or validation set to sample the demonstration examples from. Technically, *k*-shot evaluation on the test set is possible using sampling without replacement, given that the model is not proprietary and not accessed via an API. +* **Task type**: the task type. +* **Task category**: the task category. + +
+ +##### Comments on Belebele +Belebele for Norwegian Bokmål is already available in LM Evaluation Harness as `belebele_nob_Latn`. However, our version (`norbelebele`) supports five prompt templates written by Norwegian native speakers, which are different from the default prompt template used in Belebele. + + + +### Citation + +``` +@article{mikhailov2025noreval, + title={NorEval: A Norwegian Language Understanding and Generation Evaluation Benchmark}, + author={Mikhailov, Vladislav and Enstad, Tita and Samuel, David and Farseth{\aa}s, Hans Christian and Kutuzov, Andrey and Velldal, Erik and {\O}vrelid, Lilja}, + journal={arXiv preprint arXiv:2504.07749}, + year={2025} +} +``` + +### Checklist + +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? + * [ ] Yes, original implementation contributed by author of the benchmark + +If other tasks on this dataset are already supported: +* [x] Is the "Main" variant of this task clearly denoted? +* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/lm_eval/tasks/noreval/ask_gec/README.md b/lm_eval/tasks/noreval/ask_gec/README.md new file mode 100644 index 0000000000000000000000000000000000000000..35de80b36dd9adcfcc1b78f566849b936ea3ca7a --- /dev/null +++ b/lm_eval/tasks/noreval/ask_gec/README.md @@ -0,0 +1,28 @@ +### Evaluation example + +Here, we use the `--predict_only` argument and compute the performance metrics as described below. 
+ +**Step 1: Generate the predictions** + +```bash +lm_eval \ + --model hf \ + --model_args pretrained=AI-Sweden-Models/Llama-3-8B \ + --tasks ask_gec \ + --output results/ask_gec/0-shot/ \ + --log_samples \ + --show_config \ + --write_out \ + --predict_only \ + --batch_size auto \ + --num_fewshot 0 +``` + +**Step 2: Evaluate the predictions with ERRANT** + +* Please refer to the installation instructions [here](https://github.com/chrisjbryant/errant/tree/main). +* Run the following: + ```bash + python3 ask_gec/errant.py --fpath results/ask_gec/0-shot/AI-Sweden-Models__Llama-3-8B/samples_ask_gec_p0_2025-01-28T01-08-13.454441.jsonl --out_fdir results/ask_gec/0-shot/AI-Sweden-Models__Llama-3-8B/ + ``` +* The results will be saved as `results/ask_gec/0-shot/AI-Sweden-Models__Llama-3-8B/samples_ask_gec_p0_2025-01-28T01-08-13.454441_errant.json` diff --git a/lm_eval/tasks/noreval/ask_gec/_ask_gec_yaml b/lm_eval/tasks/noreval/ask_gec/_ask_gec_yaml new file mode 100644 index 0000000000000000000000000000000000000000..deffb7c954e41fb5ce883966673c1b4281d96689 --- /dev/null +++ b/lm_eval/tasks/noreval/ask_gec/_ask_gec_yaml @@ -0,0 +1,15 @@ +tag: ask_gec +dataset_path: ltg/ask-gec +output_type: generate_until +training_split: train +validation_split: validation +test_split: test +doc_to_target: correction +generation_kwargs: + until: + - "\n" + do_sample: false + num_beams: 1 + max_new_tokens: 256 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/noreval/ask_gec/ask_gec_p0.yaml b/lm_eval/tasks/noreval/ask_gec/ask_gec_p0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..383160bcf8b28778331868ca799237cca14e8d3b --- /dev/null +++ b/lm_eval/tasks/noreval/ask_gec/ask_gec_p0.yaml @@ -0,0 +1,3 @@ +task: ask_gec_p0 +doc_to_text: "Tekst: {{source}}\nKorreksjon:" +include: _ask_gec_yaml diff --git a/lm_eval/tasks/noreval/ask_gec/ask_gec_p1.yaml b/lm_eval/tasks/noreval/ask_gec/ask_gec_p1.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..24f176c39bc8efbb2e7c00b03ed816085bf3d294 --- /dev/null +++ b/lm_eval/tasks/noreval/ask_gec/ask_gec_p1.yaml @@ -0,0 +1,3 @@ +task: ask_gec_p1 +doc_to_text: "Tekst: {{source}}\nRettet versjon:" +include: _ask_gec_yaml diff --git a/lm_eval/tasks/noreval/ask_gec/ask_gec_p2.yaml b/lm_eval/tasks/noreval/ask_gec/ask_gec_p2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e8da20189e70610ecbd92eed60928ad84a662560 --- /dev/null +++ b/lm_eval/tasks/noreval/ask_gec/ask_gec_p2.yaml @@ -0,0 +1,3 @@ +task: ask_gec_p2 +doc_to_text: "Skriv om følgende tekst slik at den blir grammatisk korrekt: {{source}}\nKorreksjon:" +include: _ask_gec_yaml diff --git a/lm_eval/tasks/noreval/ask_gec/ask_gec_p3.yaml b/lm_eval/tasks/noreval/ask_gec/ask_gec_p3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..def64d44309c5a357098b663f3b536d683867f29 --- /dev/null +++ b/lm_eval/tasks/noreval/ask_gec/ask_gec_p3.yaml @@ -0,0 +1,3 @@ +task: ask_gec_p3 +doc_to_text: "Original versjon: {{source}}\nKorrekturlest og rettet versjon:" +include: _ask_gec_yaml diff --git a/lm_eval/tasks/noreval/ask_gec/ask_gec_p4.yaml b/lm_eval/tasks/noreval/ask_gec/ask_gec_p4.yaml new file mode 100644 index 0000000000000000000000000000000000000000..81c04c6c093f05a1356f2747c514883facb86735 --- /dev/null +++ b/lm_eval/tasks/noreval/ask_gec/ask_gec_p4.yaml @@ -0,0 +1,3 @@ +task: ask_gec_p4 +doc_to_text: "Rett opp grammatiske feil i denne teksten: {{source}}\nKorreksjon:" +include: _ask_gec_yaml diff --git a/lm_eval/tasks/noreval/ask_gec/errant.py b/lm_eval/tasks/noreval/ask_gec/errant.py new file mode 100644 index 0000000000000000000000000000000000000000..89721659acc8445b3500c84d2f50702de8010cb2 --- /dev/null +++ b/lm_eval/tasks/noreval/ask_gec/errant.py @@ -0,0 +1,106 @@ +import argparse +import json +import os +import subprocess + +import pandas as pd + + +def parse_args(): + """ + Parses arguments. 
+ Returns: + Arguments containing the names of the prediction file and the file directory to for saving the evaluation results. + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--fpath", + type=str, + help="path to a model output file in the lm-evaluation-harness format.", + ) + parser.add_argument( + "--out_fdir", + type=str, + help="path to an output directory for saving the results.", + ) + args = parser.parse_args() + return args + + +def read_examples(fpath: str): + """ + Reads examples from the prediction file. + Args: + fpath: A path to the prediction file. + Returns: + Lists of the sources, targets, and predictions. + """ + examples = pd.read_json(fpath, lines=True) + sources, targets, predictions = [], [], [] + for i, example in examples.iterrows(): + sources.append(example["doc"]["source"]) + targets.append(example["doc"]["correction"]) + predictions.append(example["resps"][0][0].replace("\n\n", "\n")) + return sources, targets, predictions + + +def save_results(fpath: str, obj: dict): + """ + Saves the evaluation results. + Args: + fpath: A path for the output file for saving the results. + obj: The evaluation results. + """ + with open(fpath, "w+", encoding="utf-8") as out: + json.dump(obj, out, indent=3) + + +def evaluate(fpath: str, out_fpath: str): + """ + Runs the evaluation based on the ERRANT performance metric. + Args: + fpath: A path to the prediction file. + out_Fpath: A path for the output file for saving the results. 
+ """ + tmp_name = fpath.replace(".jsonl", "").replace("/", "-") + os.makedirs("tmp", exist_ok=True) + sources, targets, predictions = read_examples(fpath=fpath) + with open(f"tmp/{tmp_name}_sources.txt", "w+") as f: + f.write("\n".join(sources)) + with open(f"tmp/{tmp_name}_targets.txt", "w+") as f: + f.write("\n".join(targets)) + with open(f"tmp/{tmp_name}_predictions.txt", "w+") as f: + f.write("\n".join(predictions)) + subprocess.run( + f"errant_parallel -orig tmp/{tmp_name}_sources.txt -cor tmp/{tmp_name}_targets.txt -out tmp/{tmp_name}_targets.m2 -lev -tok", + shell=True, + ) + subprocess.run( + f"errant_parallel -orig tmp/{tmp_name}_sources.txt -cor tmp/{tmp_name}_predictions.txt -out tmp/{tmp_name}_predictions.m2 -lev -tok", + shell=True, + ) + output = subprocess.check_output( + f"errant_compare -ref tmp/{tmp_name}_targets.m2 -hyp tmp/{tmp_name}_predictions.m2", + shell=True, + ) + f_05 = float(output.decode().strip().split("\n")[-2].split()[-1].strip()) + print(f"Prediction fpath: {fpath}\n\nERRANT: {f_05}", flush=True) + print(f"Saving to: {out_fpath}", flush=True) + save_results(obj={"errant": f_05}, fpath=out_fpath) + subprocess.run(f"rm tmp/{tmp_name}_*", shell=True) + + +def main(): + args = parse_args() + fpath = args.fpath + print(f"Out: {args.out_fdir}", flush=True) + out_fpath = fpath.replace(".jsonl", "_errant.json") + evaluate(fpath=fpath, out_fpath=out_fpath) + + +if __name__ == "__main__": + print( + "\nWARNING: make sure you have ERRANT installed to run the evaluation! 
Available here: https://github.com/chrisjbryant/errant\n\n", + flush=True, + ) + main() diff --git a/lm_eval/tasks/noreval/ncb/ncb.yaml b/lm_eval/tasks/noreval/ncb/ncb.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0150c25a3bc3be41d7d4a791a4d7372500a73272 --- /dev/null +++ b/lm_eval/tasks/noreval/ncb/ncb.yaml @@ -0,0 +1,13 @@ +task: ncb +dataset_path: hcfa/ncb +output_type: multiple_choice +test_split: train +doc_to_text: "" +doc_to_target: 0 +doc_to_choice: "{{[correct, wrong]}}" +num_fewshot: 0 +metric_list: + - metric: acc + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/noreval/norbelebele/_norbelebele_yaml b/lm_eval/tasks/noreval/norbelebele/_norbelebele_yaml new file mode 100644 index 0000000000000000000000000000000000000000..41f6b11356c211cc9bbf1c011b67403cfe049477 --- /dev/null +++ b/lm_eval/tasks/noreval/norbelebele/_norbelebele_yaml @@ -0,0 +1,18 @@ +tag: norbelebele +dataset_path: facebook/belebele +dataset_name: nob_Latn +test_split: test +fewshot_split: test +fewshot_config: + sampler: first_n +output_type: multiple_choice +doc_to_target: "{{['1', '2', '3', '4'].index(correct_answer_num)}}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/noreval/norbelebele/norbelebele_p0.yaml b/lm_eval/tasks/noreval/norbelebele/norbelebele_p0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a1037ef735ab7e94ecc95f57e1f7d6413a70b7ba --- /dev/null +++ b/lm_eval/tasks/noreval/norbelebele/norbelebele_p0.yaml @@ -0,0 +1,4 @@ +task: norbelebele_p0 +include: _norbelebele_yaml +doc_to_text: "Tekst: {{flores_passage}}\nSpørsmål: {{question}}\nA: {{mc_answer1}}\nB: {{mc_answer2}}\nC: {{mc_answer3}}\nD: {{mc_answer4}}\nSvar:" +doc_to_choice: ["A", "B", "C", "D"] diff --git a/lm_eval/tasks/noreval/norbelebele/norbelebele_p1.yaml 
b/lm_eval/tasks/noreval/norbelebele/norbelebele_p1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..43cc76c0d86321831546219be281cec32171d96c --- /dev/null +++ b/lm_eval/tasks/noreval/norbelebele/norbelebele_p1.yaml @@ -0,0 +1,4 @@ +task: norbelebele_p1 +include: _norbelebele_yaml +doc_to_text: "Bakgrunn: {{flores_passage}}\nSpørsmål:{{question}}\nSvaralternativer:\n- {{mc_answer1}}\n- {{mc_answer2}}\n- {{mc_answer3}}\n- {{mc_answer4}}\nRiktig svar:" +doc_to_choice: "{{[mc_answer1, mc_answer2, mc_answer3, mc_answer4]}}" diff --git a/lm_eval/tasks/noreval/norbelebele/norbelebele_p2.yaml b/lm_eval/tasks/noreval/norbelebele/norbelebele_p2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8ff96f0eb4012c000b1000d67c7218fa127a9640 --- /dev/null +++ b/lm_eval/tasks/noreval/norbelebele/norbelebele_p2.yaml @@ -0,0 +1,4 @@ +task: norbelebele_p2 +include: _norbelebele_yaml +doc_to_text: "{{question}}\nHvilket av følgende mulige svar er det riktige?\nA: {{mc_answer1}}\nB: {{mc_answer2}}\nC: {{mc_answer3}}\nD: {{mc_answer4}}\nSvar:" +doc_to_choice: ["A", "B", "C", "D"] diff --git a/lm_eval/tasks/noreval/norbelebele/norbelebele_p3.yaml b/lm_eval/tasks/noreval/norbelebele/norbelebele_p3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6fedf28ad18e97455ff4dbe107b67a94db0a9e83 --- /dev/null +++ b/lm_eval/tasks/noreval/norbelebele/norbelebele_p3.yaml @@ -0,0 +1,5 @@ +task: norbelebele_p3 +include: _norbelebele_yaml +doc_to_text: "Svar på følgende spørsmål: {{question}}\nSvaret skal baseres på følgende tekst:\n{{flores_passage}}\nVelg et svar fra denne listen:\n– {{mc_answer1}}\n– {{mc_answer2}},\n– {{mc_answer3}}\n– {{mc_answer4}}" +doc_to_choice: "{{[mc_answer1, mc_answer2, mc_answer3, mc_answer4]}}" +target_delimiter: "\n" diff --git a/lm_eval/tasks/noreval/norbelebele/norbelebele_p4.yaml b/lm_eval/tasks/noreval/norbelebele/norbelebele_p4.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..0285168e5544d00a53794b1de022b0e9217984f4 --- /dev/null +++ b/lm_eval/tasks/noreval/norbelebele/norbelebele_p4.yaml @@ -0,0 +1,4 @@ +task: norbelebele_p4 +include: _norbelebele_yaml +doc_to_text: "{{flores_passage}}\n\n{{question}}\n\nA: {{mc_answer1}}\nB: {{mc_answer2}}\nC: {{mc_answer3}}\nD: {{mc_answer4}}\n\nEr det riktige svaret A, B, C, eller D?" +doc_to_choice: ["A", "B", "C", "D"] diff --git a/lm_eval/tasks/noreval/norcommonsenseqa/_norcommonsenseqa_yaml b/lm_eval/tasks/noreval/norcommonsenseqa/_norcommonsenseqa_yaml new file mode 100644 index 0000000000000000000000000000000000000000..a17c01fea6d174df3dd2367b4e93ff9628664ad8 --- /dev/null +++ b/lm_eval/tasks/noreval/norcommonsenseqa/_norcommonsenseqa_yaml @@ -0,0 +1,15 @@ +dataset_path: ltg/norcommonsenseqa +output_type: multiple_choice +training_split: null +validation_split: null +test_split: train +doc_to_target: "{{choices.label.index(answer)}}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/noreval/norcommonsenseqa/nno/norcommonsenseqa_nno_p0.yaml b/lm_eval/tasks/noreval/norcommonsenseqa/nno/norcommonsenseqa_nno_p0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..06e59c2c2db31b4472ebfbfa3766196d5b6e73dc --- /dev/null +++ b/lm_eval/tasks/noreval/norcommonsenseqa/nno/norcommonsenseqa_nno_p0.yaml @@ -0,0 +1,6 @@ +tag: norcommonsenseqa_nno +dataset_name: nn +task: norcommonsenseqa_nno_p0 +include: ../_norcommonsenseqa_yaml +doc_to_text: "Spørsmål: {{question}}\n\nSvar:" +doc_to_choice: "{{choices.text}}" diff --git a/lm_eval/tasks/noreval/norcommonsenseqa/nno/norcommonsenseqa_nno_p1.yaml b/lm_eval/tasks/noreval/norcommonsenseqa/nno/norcommonsenseqa_nno_p1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..db7ac9d9526a46c97d97a29d8df039121091d2e5 --- /dev/null +++ 
b/lm_eval/tasks/noreval/norcommonsenseqa/nno/norcommonsenseqa_nno_p1.yaml @@ -0,0 +1,6 @@ +tag: norcommonsenseqa_nno +dataset_name: nn +task: norcommonsenseqa_nno_p1 +include: ../_norcommonsenseqa_yaml +doc_to_text: "{{question}}\nKva av følgande moglege svar er det rette?\nA: {{choices.text[0]}}\nB: {{choices.text[1]}}\nC: {{choices.text[2]}}\nD: {{choices.text[3]}}\nE: {{choices.text[4]}}\n\nSvar:" +doc_to_choice: "{{choices.label}}" diff --git a/lm_eval/tasks/noreval/norcommonsenseqa/nno/norcommonsenseqa_nno_p2.yaml b/lm_eval/tasks/noreval/norcommonsenseqa/nno/norcommonsenseqa_nno_p2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2477eca8ec64d9eef8442b7a5b17d2d93f663563 --- /dev/null +++ b/lm_eval/tasks/noreval/norcommonsenseqa/nno/norcommonsenseqa_nno_p2.yaml @@ -0,0 +1,6 @@ +tag: norcommonsenseqa_nno +dataset_name: nn +task: norcommonsenseqa_nno_p2 +include: ../_norcommonsenseqa_yaml +doc_to_text: "Gitt alternativa under, kva er svaret på følgande spørsmål: {{question}}\n\nAlternativ:\n- {{choices.text[0]}}\n- {{choices.text[1]}}\n- {{choices.text[2]}}\n- {{choices.text[3]}}\n- {{choices.text[4]}}\n\nSvar:" +doc_to_choice: "{{choices.label}}" diff --git a/lm_eval/tasks/noreval/norcommonsenseqa/nno/norcommonsenseqa_nno_p3.yaml b/lm_eval/tasks/noreval/norcommonsenseqa/nno/norcommonsenseqa_nno_p3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f70bd109758d9479d8363ba968165f5a1218b272 --- /dev/null +++ b/lm_eval/tasks/noreval/norcommonsenseqa/nno/norcommonsenseqa_nno_p3.yaml @@ -0,0 +1,6 @@ +tag: norcommonsenseqa_nno +dataset_name: nn +task: norcommonsenseqa_nno_p3 +include: ../_norcommonsenseqa_yaml +doc_to_text: "{{question}}\nVel rett svar blant desse alternativa:\n– {{choices.text[0]}}\n– {{choices.text[1]}}\n– {{choices.text[2]}}\n– {{choices.text[3]}}\n– {{choices.text[4]}}\n\nSvar:" +doc_to_choice: "{{choices.text}}" diff --git a/lm_eval/tasks/noreval/norcommonsenseqa/nno/norcommonsenseqa_nno_p4.yaml 
b/lm_eval/tasks/noreval/norcommonsenseqa/nno/norcommonsenseqa_nno_p4.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa8711c1de87f5528e8f6703325a7b842d83d73d --- /dev/null +++ b/lm_eval/tasks/noreval/norcommonsenseqa/nno/norcommonsenseqa_nno_p4.yaml @@ -0,0 +1,6 @@ +tag: norcommonsenseqa_nno +dataset_name: nn +task: norcommonsenseqa_nno_p4 +include: ../_norcommonsenseqa_yaml +doc_to_text: "{{question}}\nA: {{choices.text[0]}}\nB: {{choices.text[1]}}\nC: {{choices.text[2]}}\nD: {{choices.text[3]}}\nE: {{choices.text[4]}}\n\nEr det rette svaret A, B, C, D, eller E?\n\nSvar:" +doc_to_choice: "{{choices.label}}" diff --git a/lm_eval/tasks/noreval/norcommonsenseqa/nob/norcommonsenseqa_nob_p0.yaml b/lm_eval/tasks/noreval/norcommonsenseqa/nob/norcommonsenseqa_nob_p0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..50430a868b4bc1e735f35b4de503df33a3733013 --- /dev/null +++ b/lm_eval/tasks/noreval/norcommonsenseqa/nob/norcommonsenseqa_nob_p0.yaml @@ -0,0 +1,6 @@ +tag: norcommonsenseqa_nob +dataset_name: nb +task: norcommonsenseqa_nob_p0 +include: ../_norcommonsenseqa_yaml +doc_to_text: "Spørsmål: {{question}}\n\nSvar:" +doc_to_choice: "{{choices.text}}" diff --git a/lm_eval/tasks/noreval/norcommonsenseqa/nob/norcommonsenseqa_nob_p1.yaml b/lm_eval/tasks/noreval/norcommonsenseqa/nob/norcommonsenseqa_nob_p1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a1c072d6a09f33c7fd96dff217f5831c3060ce7f --- /dev/null +++ b/lm_eval/tasks/noreval/norcommonsenseqa/nob/norcommonsenseqa_nob_p1.yaml @@ -0,0 +1,6 @@ +tag: norcommonsenseqa_nob +dataset_name: nb +task: norcommonsenseqa_nob_p1 +include: ../_norcommonsenseqa_yaml +doc_to_text: "{{question}}\nHvilket av følgende mulige svar er det riktige?\nA: {{choices.text[0]}}\nB: {{choices.text[1]}}\nC: {{choices.text[2]}}\nD: {{choices.text[3]}}\nE: {{choices.text[4]}}\n\nSvar:" +doc_to_choice: "{{choices.label}}" diff --git 
a/lm_eval/tasks/noreval/norcommonsenseqa/nob/norcommonsenseqa_nob_p2.yaml b/lm_eval/tasks/noreval/norcommonsenseqa/nob/norcommonsenseqa_nob_p2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bda34e865e6526b3cb56e2a809dfcbc781200ff0 --- /dev/null +++ b/lm_eval/tasks/noreval/norcommonsenseqa/nob/norcommonsenseqa_nob_p2.yaml @@ -0,0 +1,6 @@ +tag: norcommonsenseqa_nob +dataset_name: nb +task: norcommonsenseqa_nob_p2 +include: ../_norcommonsenseqa_yaml +doc_to_text: "Gitt alternativene under, hva er svaret på følgende spørsmål: {{question}}\n\nAlternativer:\n- {{choices.text[0]}}\n- {{choices.text[1]}}\n- {{choices.text[2]}}\n- {{choices.text[3]}}\n- {{choices.text[4]}}\n\nSvar:" +doc_to_choice: "{{choices.text}}" diff --git a/lm_eval/tasks/noreval/norcommonsenseqa/nob/norcommonsenseqa_nob_p3.yaml b/lm_eval/tasks/noreval/norcommonsenseqa/nob/norcommonsenseqa_nob_p3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cc015bc8c79c198d66c270519e9fa5a407901894 --- /dev/null +++ b/lm_eval/tasks/noreval/norcommonsenseqa/nob/norcommonsenseqa_nob_p3.yaml @@ -0,0 +1,6 @@ +tag: norcommonsenseqa_nob +dataset_name: nb +task: norcommonsenseqa_nob_p3 +include: ../_norcommonsenseqa_yaml +doc_to_text: "{{question}}\nVelg riktig svar blant disse alternativene:\n– {{choices.text[0]}}\n– {{choices.text[1]}}\n– {{choices.text[2]}}\n– {{choices.text[3]}}\n– {{choices.text[4]}}\n\nSvar:" +doc_to_choice: "{{choices.text}}" diff --git a/lm_eval/tasks/noreval/norcommonsenseqa/nob/norcommonsenseqa_nob_p4.yaml b/lm_eval/tasks/noreval/norcommonsenseqa/nob/norcommonsenseqa_nob_p4.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6eda8246901500a5460d808180521cbfede19f26 --- /dev/null +++ b/lm_eval/tasks/noreval/norcommonsenseqa/nob/norcommonsenseqa_nob_p4.yaml @@ -0,0 +1,6 @@ +tag: norcommonsenseqa_nob +dataset_name: nb +task: norcommonsenseqa_nob_p4 +include: ../_norcommonsenseqa_yaml +doc_to_text: "{{question}}\nA: 
{{choices.text[0]}}\nB: {{choices.text[1]}}\nC: {{choices.text[2]}}\nD: {{choices.text[3]}}\nE: {{choices.text[4]}}\n\nEr det riktige svaret A, B, C, D, eller E?\n\nSvar:" +doc_to_choice: "{{choices.label}}" diff --git a/lm_eval/tasks/noreval/norec/_norec_yaml b/lm_eval/tasks/noreval/norec/_norec_yaml new file mode 100644 index 0000000000000000000000000000000000000000..9a9cb88455263a835bc3a20dd1781bdaa7c22800 --- /dev/null +++ b/lm_eval/tasks/noreval/norec/_norec_yaml @@ -0,0 +1,14 @@ +dataset_name: binary +output_type: multiple_choice +training_split: train +test_split: test +doc_to_target: sentiment +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: f1 + aggregation: !function utils.multi_f1 + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/noreval/norec/norec_document/norec_document_p0.yaml b/lm_eval/tasks/noreval/norec/norec_document/norec_document_p0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8596a6b966ca0426389a2970c8ddc23e55f5648c --- /dev/null +++ b/lm_eval/tasks/noreval/norec/norec_document/norec_document_p0.yaml @@ -0,0 +1,6 @@ +tag: norec_document +dataset_path: ltg/norec_document +task: norec_document_p0 +include: ../_norec_yaml +doc_to_text: "Tekst: {{review}}\nSentiment:" +doc_to_choice: ["negativ", "positiv"] diff --git a/lm_eval/tasks/noreval/norec/norec_document/norec_document_p1.yaml b/lm_eval/tasks/noreval/norec/norec_document/norec_document_p1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0171cff5880dfb567d4840193240e7d9ceb84c45 --- /dev/null +++ b/lm_eval/tasks/noreval/norec/norec_document/norec_document_p1.yaml @@ -0,0 +1,6 @@ +tag: norec_document +dataset_path: ltg/norec_document +task: norec_document_p1 +include: ../_norec_yaml +doc_to_text: "Tekst: {{review}}\nEr anmeldelsen \"positiv\" eller \"negativ\"?" 
+doc_to_choice: ["negativ", "positiv"] diff --git a/lm_eval/tasks/noreval/norec/norec_document/norec_document_p2.yaml b/lm_eval/tasks/noreval/norec/norec_document/norec_document_p2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7b90639d66ed511e58a122421c2a5f6e5cc048ba --- /dev/null +++ b/lm_eval/tasks/noreval/norec/norec_document/norec_document_p2.yaml @@ -0,0 +1,6 @@ +tag: norec_document +dataset_path: ltg/norec_document +task: norec_document_p2 +include: ../_norec_yaml +doc_to_text: "Er polariteten til følgende anmeldelse positiv eller negativ?\nAnmeldelse: {{review}}\nAnmeldelsen er" +doc_to_choice: ["negativ", "positiv"] diff --git a/lm_eval/tasks/noreval/norec/norec_document/norec_document_p3.yaml b/lm_eval/tasks/noreval/norec/norec_document/norec_document_p3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c4053708b3dff485d6c609a70179cdffe496f908 --- /dev/null +++ b/lm_eval/tasks/noreval/norec/norec_document/norec_document_p3.yaml @@ -0,0 +1,6 @@ +tag: norec_document +dataset_path: ltg/norec_document +task: norec_document_p3 +include: ../_norec_yaml +doc_to_text: "Anmeldelse: {{review}}\nEr anmelderen positiv eller negativ?" +doc_to_choice: ["negativ", "positiv"] diff --git a/lm_eval/tasks/noreval/norec/norec_document/norec_document_p4.yaml b/lm_eval/tasks/noreval/norec/norec_document/norec_document_p4.yaml new file mode 100644 index 0000000000000000000000000000000000000000..59d26a1a22e441ad9c7303920d93b67ae552d8db --- /dev/null +++ b/lm_eval/tasks/noreval/norec/norec_document/norec_document_p4.yaml @@ -0,0 +1,6 @@ +tag: norec_document +dataset_path: ltg/norec_document +task: norec_document_p4 +include: ../_norec_yaml +doc_to_text: "Anmeldelse: {{review}}\nVil du oppsummere anmeldelsen som \"bra\" eller \"dårlig\"?" 
+doc_to_choice: ["dårlig", "bra"] diff --git a/lm_eval/tasks/noreval/norec/norec_sentence/norec_sentence_p0.yaml b/lm_eval/tasks/noreval/norec/norec_sentence/norec_sentence_p0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..90001d5be006338a9ed6924776ea5cdcfaa8c8b7 --- /dev/null +++ b/lm_eval/tasks/noreval/norec/norec_sentence/norec_sentence_p0.yaml @@ -0,0 +1,6 @@ +tag: norec_sentence +dataset_path: ltg/norec_sentence +task: norec_sentence_p0 +include: ../_norec_yaml +doc_to_text: "Tekst: {{review}}\nSentiment:" +doc_to_choice: ["negativ", "positiv"] diff --git a/lm_eval/tasks/noreval/norec/norec_sentence/norec_sentence_p1.yaml b/lm_eval/tasks/noreval/norec/norec_sentence/norec_sentence_p1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c2a10fd9122ba5bcb4f4f4bc5d0e43350cebfb6 --- /dev/null +++ b/lm_eval/tasks/noreval/norec/norec_sentence/norec_sentence_p1.yaml @@ -0,0 +1,6 @@ +tag: norec_sentence +dataset_path: ltg/norec_sentence +task: norec_sentence_p1 +include: ../_norec_yaml +doc_to_text: "{{review}}\nEr denne setningen \"positiv\" eller \"negativ\"?" +doc_to_choice: ["negativ", "positiv"] diff --git a/lm_eval/tasks/noreval/norec/norec_sentence/norec_sentence_p2.yaml b/lm_eval/tasks/noreval/norec/norec_sentence/norec_sentence_p2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8727e4cb8e2c13c00f9ed88a1b3580e40eb38781 --- /dev/null +++ b/lm_eval/tasks/noreval/norec/norec_sentence/norec_sentence_p2.yaml @@ -0,0 +1,6 @@ +tag: norec_sentence +dataset_path: ltg/norec_sentence +task: norec_sentence_p2 +include: ../_norec_yaml +doc_to_text: "{{review}}\nHva slags sentiment uttrykker anmelderen?" 
+doc_to_choice: ["negativ", "positiv"] diff --git a/lm_eval/tasks/noreval/norec/norec_sentence/norec_sentence_p3.yaml b/lm_eval/tasks/noreval/norec/norec_sentence/norec_sentence_p3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ba5e1e0f0120ceef99708f3825551eb4ef341f74 --- /dev/null +++ b/lm_eval/tasks/noreval/norec/norec_sentence/norec_sentence_p3.yaml @@ -0,0 +1,6 @@ +tag: norec_sentence +dataset_path: ltg/norec_sentence +task: norec_sentence_p3 +include: ../_norec_yaml +doc_to_text: "{{review}}\nEr anmeldelsen \"positiv\" eller \"negativ\"?" +doc_to_choice: ["negativ", "positiv"] diff --git a/lm_eval/tasks/noreval/norec/norec_sentence/norec_sentence_p4.yaml b/lm_eval/tasks/noreval/norec/norec_sentence/norec_sentence_p4.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8fda6f48ed5c8f3fbeccec03ec3a69398f47e01c --- /dev/null +++ b/lm_eval/tasks/noreval/norec/norec_sentence/norec_sentence_p4.yaml @@ -0,0 +1,6 @@ +tag: norec_sentence +dataset_path: ltg/norec_sentence +task: norec_sentence_p4 +include: ../_norec_yaml +doc_to_text: "{{review}}\nEr denne setningen positiv eller negativ?" +doc_to_choice: ["negativ", "positiv"] diff --git a/lm_eval/tasks/noreval/norec/utils.py b/lm_eval/tasks/noreval/norec/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..41f4bce1928cc76ab1bd23b8309c3bcdc5b33963 --- /dev/null +++ b/lm_eval/tasks/noreval/norec/utils.py @@ -0,0 +1,13 @@ +import numpy as np +import sklearn + + +def multi_f1(items): + """ + Computes the macro-average F1 score. 
+ """ + preds, golds = zip(*items) + preds = np.array(preds) + golds = np.array(golds) + fscore = sklearn.metrics.f1_score(golds, preds, average="macro") + return fscore diff --git a/lm_eval/tasks/noreval/noreval.jpg b/lm_eval/tasks/noreval/noreval.jpg new file mode 100644 index 0000000000000000000000000000000000000000..69d156f7f8a43a505b10f81cd89c18a90e3d1b48 Binary files /dev/null and b/lm_eval/tasks/noreval/noreval.jpg differ diff --git a/lm_eval/tasks/noreval/noridiom/_noridiom_yaml b/lm_eval/tasks/noreval/noridiom/_noridiom_yaml new file mode 100644 index 0000000000000000000000000000000000000000..44b1f12115b3f786f9924539a7e04afbaccae970 --- /dev/null +++ b/lm_eval/tasks/noreval/noridiom/_noridiom_yaml @@ -0,0 +1,23 @@ +dataset_path: Sprakbanken/Norwegian_idioms +training_split: null +validation_split: null +test_split: test +num_fewshot: 0 +output_type: generate_until +doc_to_target: completion +process_results: !function utils.process_results +generation_kwargs: + until: + - "\n" + do_sample: false + num_beams: 1 + max_new_tokens: 16 +metric_list: + - metric: em + aggregation: mean + higher_is_better: true + - metric: fscore + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/noreval/noridiom/nno/noridiom_nno_p0.yaml b/lm_eval/tasks/noreval/noridiom/nno/noridiom_nno_p0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2bfbe037afd4e6a5376ce6d8eaace76b7ad30360 --- /dev/null +++ b/lm_eval/tasks/noreval/noridiom/nno/noridiom_nno_p0.yaml @@ -0,0 +1,5 @@ +tag: noridiom_nno +task: noridiom_nno_p0 +include: ../_noridiom_yaml +process_docs: !function ../utils.filter_dataset_nn +doc_to_text: "Fullfør dette uttrykket: {{idiom_start}}" diff --git a/lm_eval/tasks/noreval/noridiom/nno/noridiom_nno_p1.yaml b/lm_eval/tasks/noreval/noridiom/nno/noridiom_nno_p1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..960f808cf68fe75f80ca77b0e57e45bba7667766 --- /dev/null +++ 
b/lm_eval/tasks/noreval/noridiom/nno/noridiom_nno_p1.yaml @@ -0,0 +1,5 @@ +tag: noridiom_nno +task: noridiom_nno_p1 +include: ../_noridiom_yaml +process_docs: !function ../utils.filter_dataset_nn +doc_to_text: "Skriv fortsetjinga av idiomet {{idiom_start}}" diff --git a/lm_eval/tasks/noreval/noridiom/nno/noridiom_nno_p2.yaml b/lm_eval/tasks/noreval/noridiom/nno/noridiom_nno_p2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b2207cfa99ba21f040c91d48cd022891a92dec94 --- /dev/null +++ b/lm_eval/tasks/noreval/noridiom/nno/noridiom_nno_p2.yaml @@ -0,0 +1,5 @@ +tag: noridiom_nno +task: noridiom_nno_p2 +include: ../_noridiom_yaml +process_docs: !function ../utils.filter_dataset_nn +doc_to_text: "Korleis fortset uttrykket \"{{idiom_start}}\"?" diff --git a/lm_eval/tasks/noreval/noridiom/nno/noridiom_nno_p3.yaml b/lm_eval/tasks/noreval/noridiom/nno/noridiom_nno_p3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..edd7cf991e7d03d2e8aafed8c86d87742855f41c --- /dev/null +++ b/lm_eval/tasks/noreval/noridiom/nno/noridiom_nno_p3.yaml @@ -0,0 +1,5 @@ +tag: noridiom_nno +task: noridiom_nno_p3 +include: ../_noridiom_yaml +process_docs: !function ../utils.filter_dataset_nn +doc_to_text: "Fullfør vendinga: {{idiom_start}}" diff --git a/lm_eval/tasks/noreval/noridiom/nno/noridiom_nno_p4.yaml b/lm_eval/tasks/noreval/noridiom/nno/noridiom_nno_p4.yaml new file mode 100644 index 0000000000000000000000000000000000000000..96f25dec3173c3c4f8b713ad4d953765f2657796 --- /dev/null +++ b/lm_eval/tasks/noreval/noridiom/nno/noridiom_nno_p4.yaml @@ -0,0 +1,5 @@ +tag: noridiom_nno +task: noridiom_nno_p4 +include: ../_noridiom_yaml +process_docs: !function ../utils.filter_dataset_nn +doc_to_text: "{{idiom_start}}" diff --git a/lm_eval/tasks/noreval/noridiom/nob/noridiom_nob_p0.yaml b/lm_eval/tasks/noreval/noridiom/nob/noridiom_nob_p0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7dcd6fe498da50c46df4a1c25dd6716565b8e0a3 --- 
/dev/null +++ b/lm_eval/tasks/noreval/noridiom/nob/noridiom_nob_p0.yaml @@ -0,0 +1,5 @@ +tag: noridiom_nob +task: noridiom_nob_p0 +include: ../_noridiom_yaml +process_docs: !function ../utils.filter_dataset_nb +doc_to_text: "Fullfør dette uttrykket: {{idiom_start}}" diff --git a/lm_eval/tasks/noreval/noridiom/nob/noridiom_nob_p1.yaml b/lm_eval/tasks/noreval/noridiom/nob/noridiom_nob_p1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..95460b5d7619dbe8e247e5fbd8507224242a4d95 --- /dev/null +++ b/lm_eval/tasks/noreval/noridiom/nob/noridiom_nob_p1.yaml @@ -0,0 +1,5 @@ +tag: noridiom_nob +task: noridiom_nob_p1 +include: ../_noridiom_yaml +process_docs: !function ../utils.filter_dataset_nb +doc_to_text: "Skriv fortsettelsen av idiomet {{idiom_start}}" diff --git a/lm_eval/tasks/noreval/noridiom/nob/noridiom_nob_p2.yaml b/lm_eval/tasks/noreval/noridiom/nob/noridiom_nob_p2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f676054f04483387ef3a8a65af7728ddf83b88a4 --- /dev/null +++ b/lm_eval/tasks/noreval/noridiom/nob/noridiom_nob_p2.yaml @@ -0,0 +1,5 @@ +tag: noridiom_nob +task: noridiom_nob_p2 +include: ../_noridiom_yaml +process_docs: !function ../utils.filter_dataset_nb +doc_to_text: "Hvordan fortsetter uttrykket \"{{idiom_start}}\"?" 
diff --git a/lm_eval/tasks/noreval/noridiom/nob/noridiom_nob_p3.yaml b/lm_eval/tasks/noreval/noridiom/nob/noridiom_nob_p3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..318be0f280422bbde6c5a1579cef54a2c77aef13 --- /dev/null +++ b/lm_eval/tasks/noreval/noridiom/nob/noridiom_nob_p3.yaml @@ -0,0 +1,5 @@ +tag: noridiom_nob +task: noridiom_nob_p3 +include: ../_noridiom_yaml +process_docs: !function ../utils.filter_dataset_nb +doc_to_text: "Fullfør vendingen \"{{idiom_start}}\"" diff --git a/lm_eval/tasks/noreval/noridiom/nob/noridiom_nob_p4.yaml b/lm_eval/tasks/noreval/noridiom/nob/noridiom_nob_p4.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f03c9df6c48fee418af50b9dc2d561d4b3e6c6dc --- /dev/null +++ b/lm_eval/tasks/noreval/noridiom/nob/noridiom_nob_p4.yaml @@ -0,0 +1,5 @@ +tag: noridiom_nob +task: noridiom_nob_p4 +include: ../_noridiom_yaml +process_docs: !function ../utils.filter_dataset_nb +doc_to_text: "{{idiom_start}}" diff --git a/lm_eval/tasks/noreval/noridiom/utils.py b/lm_eval/tasks/noreval/noridiom/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..938891b52dc81fca54c1779fee54437d5b16a947 --- /dev/null +++ b/lm_eval/tasks/noreval/noridiom/utils.py @@ -0,0 +1,44 @@ +from collections import Counter +from string import punctuation + +import numpy as np + + +def normalize(text): + exclude = set(punctuation) + return "".join(ch for ch in text if ch not in exclude).lower().strip() + + +def f1(prediction, completion): + gold_toks = completion.split() + pred_toks = prediction.split() + common = Counter(gold_toks) & Counter(pred_toks) + num_same = sum(common.values()) + if len(gold_toks) == 0 or len(pred_toks) == 0: + return int(gold_toks == pred_toks) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(pred_toks) + recall = 1.0 * num_same / len(gold_toks) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def process_results(doc, results): + prediction 
= normalize(results[0]) + completions = [normalize(completion) for completion in doc["accepted_completions"]] + exact_match = np.nanmax( + [int(prediction == completion) for completion in completions] + ) + fscore = np.nanmax( + [f1(prediction=prediction, completion=completion) for completion in completions] + ) + return {"em": exact_match, "fscore": fscore} + + +def filter_dataset_nb(dataset): + return dataset.filter(lambda example: example["language"] == "nob") + + +def filter_dataset_nn(dataset): + return dataset.filter(lambda example: example["language"] == "nno") diff --git a/lm_eval/tasks/noreval/noropenbookqa/_noropenbookqa_yaml b/lm_eval/tasks/noreval/noropenbookqa/_noropenbookqa_yaml new file mode 100644 index 0000000000000000000000000000000000000000..25166cfa5ada9b15c3b58f1ac2082f4866c6f2be --- /dev/null +++ b/lm_eval/tasks/noreval/noropenbookqa/_noropenbookqa_yaml @@ -0,0 +1,16 @@ +dataset_path: ltg/noropenbookqa +output_type: multiple_choice +training_split: train +validation_split: null +test_split: test +process_docs: !function utils.filter_dataset +doc_to_target: "{{choices.label.index(answer)}}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/noreval/noropenbookqa/nno/noropenbookqa_nno_p0.yaml b/lm_eval/tasks/noreval/noropenbookqa/nno/noropenbookqa_nno_p0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..63551decd68339f3cccece99a34736bf1e1b4e24 --- /dev/null +++ b/lm_eval/tasks/noreval/noropenbookqa/nno/noropenbookqa_nno_p0.yaml @@ -0,0 +1,6 @@ +tag: noropenbookqa_nno +task: noropenbookqa_nno_p0 +dataset_name: nn +include: ../_noropenbookqa_yaml +doc_to_text: "{{fact}}\n{{question_stem}}" +doc_to_choice: "{{choices.text}}" diff --git a/lm_eval/tasks/noreval/noropenbookqa/nno/noropenbookqa_nno_p1.yaml b/lm_eval/tasks/noreval/noropenbookqa/nno/noropenbookqa_nno_p1.yaml new file mode 
100644 index 0000000000000000000000000000000000000000..d6888bd4088b3fd5213bbcd2e8d9fe0caca53ea7 --- /dev/null +++ b/lm_eval/tasks/noreval/noropenbookqa/nno/noropenbookqa_nno_p1.yaml @@ -0,0 +1,6 @@ +tag: noropenbookqa_nno +task: noropenbookqa_nno_p1 +dataset_name: nn +include: ../_noropenbookqa_yaml +doc_to_text: "Faktatekst: {{fact}}\nSpørsmål til teksten: {{question_stem}}\n\nSvaralternativer:\n- {{choices.text[0]}}\n- {{choices.text[1]}}\n- {{choices.text[2]}}\n- {{choices.text[3]}}\n\nKva er rett svar?" +doc_to_choice: "{{choices.text}}" diff --git a/lm_eval/tasks/noreval/noropenbookqa/nno/noropenbookqa_nno_p2.yaml b/lm_eval/tasks/noreval/noropenbookqa/nno/noropenbookqa_nno_p2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e5b634f674f2035f4872c88e1ab53a99716f8a28 --- /dev/null +++ b/lm_eval/tasks/noreval/noropenbookqa/nno/noropenbookqa_nno_p2.yaml @@ -0,0 +1,6 @@ +tag: noropenbookqa_nno +task: noropenbookqa_nno_p2 +dataset_name: nn +include: ../_noropenbookqa_yaml +doc_to_text: "{{fact}}\n{{question_stem}}\nA: {{choices.text[0]}}\nB: {{choices.text[1]}}\nC: {{choices.text[2]}}\nD: {{choices.text[3]}}\n\nEr det rette svaret A, B, C, eller D?\n\nSvar:" +doc_to_choice: "{{choices.label}}" diff --git a/lm_eval/tasks/noreval/noropenbookqa/nno/noropenbookqa_nno_p3.yaml b/lm_eval/tasks/noreval/noropenbookqa/nno/noropenbookqa_nno_p3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d356818e76d9ddb5910ab53218f6bf8f04f4126c --- /dev/null +++ b/lm_eval/tasks/noreval/noropenbookqa/nno/noropenbookqa_nno_p3.yaml @@ -0,0 +1,6 @@ +tag: noropenbookqa_nno +task: noropenbookqa_nno_p3 +dataset_name: nn +include: ../_noropenbookqa_yaml +doc_to_text: "Bakgrunn: {{fact}}\n\nSpørsmål: {{question_stem}}\nA: {{choices.text[0]}}\nB: {{choices.text[1]}}\nC: {{choices.text[2]}}\nD: {{choices.text[3]}}\n\nSvar:" +doc_to_choice: "{{choices.label}}" diff --git a/lm_eval/tasks/noreval/noropenbookqa/nno/noropenbookqa_nno_p4.yaml 
b/lm_eval/tasks/noreval/noropenbookqa/nno/noropenbookqa_nno_p4.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b819b72b1c703ab5a0afd5b1c891b444d36d7de4 --- /dev/null +++ b/lm_eval/tasks/noreval/noropenbookqa/nno/noropenbookqa_nno_p4.yaml @@ -0,0 +1,6 @@ +tag: noropenbookqa_nno +task: noropenbookqa_nno_p4 +dataset_name: nn +include: ../_noropenbookqa_yaml +doc_to_text: "Ta utgangspunkt i følgande fakta når du svarar på spørsmålet: {{fact}}\n\n{{question_stem}}\nVel rett svar blant desse alternativa:\n– {{choices.text[0]}}\n– {{choices.text[1]}}\n– {{choices.text[2]}}\n– {{choices.text[3]}}\n\nSvar:" +doc_to_choice: "{{choices.text}}" diff --git a/lm_eval/tasks/noreval/noropenbookqa/nob/noropenbookqa_nob_p0.yaml b/lm_eval/tasks/noreval/noropenbookqa/nob/noropenbookqa_nob_p0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a22f147a7e7114d616451a518f0951daa4676d75 --- /dev/null +++ b/lm_eval/tasks/noreval/noropenbookqa/nob/noropenbookqa_nob_p0.yaml @@ -0,0 +1,6 @@ +tag: noropenbookqa_nob +task: noropenbookqa_nob_p0 +dataset_name: nb +include: ../_noropenbookqa_yaml +doc_to_text: "{{fact}}\n{{question_stem}}" +doc_to_choice: "{{choices.text}}" diff --git a/lm_eval/tasks/noreval/noropenbookqa/nob/noropenbookqa_nob_p1.yaml b/lm_eval/tasks/noreval/noropenbookqa/nob/noropenbookqa_nob_p1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5aa3fc62ae4016209e1a0d0da9a75d3b95c1a39a --- /dev/null +++ b/lm_eval/tasks/noreval/noropenbookqa/nob/noropenbookqa_nob_p1.yaml @@ -0,0 +1,6 @@ +tag: noropenbookqa_nob +task: noropenbookqa_nob_p1 +dataset_name: nb +include: ../_noropenbookqa_yaml +doc_to_text: "Faktatekst: {{fact}}\nSpørsmål til teksten: {{question_stem}}\n\nSvaralternativer:\n- {{choices.text[0]}}\n- {{choices.text[1]}}\n- {{choices.text[2]}}\n- {{choices.text[3]}}\n\nHva er riktig svar?" 
+doc_to_choice: "{{choices.text}}" diff --git a/lm_eval/tasks/noreval/noropenbookqa/nob/noropenbookqa_nob_p2.yaml b/lm_eval/tasks/noreval/noropenbookqa/nob/noropenbookqa_nob_p2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..667b04a114fec38d60e9e5bfcb522b266ddd51d1 --- /dev/null +++ b/lm_eval/tasks/noreval/noropenbookqa/nob/noropenbookqa_nob_p2.yaml @@ -0,0 +1,6 @@ +tag: noropenbookqa_nob +task: noropenbookqa_nob_p2 +dataset_name: nb +include: ../_noropenbookqa_yaml +doc_to_text: "{{fact}}\n{{question_stem}}\nA: {{choices.text[0]}}\nB: {{choices.text[1]}}\nC: {{choices.text[2]}}\nD: {{choices.text[3]}}\n\nEr det riktige svaret A, B, C, eller D?\n\nSvar:" +doc_to_choice: "{{choices.label}}" diff --git a/lm_eval/tasks/noreval/noropenbookqa/nob/noropenbookqa_nob_p3.yaml b/lm_eval/tasks/noreval/noropenbookqa/nob/noropenbookqa_nob_p3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..31f7b7fe8893bddf52416e1cfd6ef279afe84294 --- /dev/null +++ b/lm_eval/tasks/noreval/noropenbookqa/nob/noropenbookqa_nob_p3.yaml @@ -0,0 +1,6 @@ +tag: noropenbookqa_nob +task: noropenbookqa_nob_p3 +dataset_name: nb +include: ../_noropenbookqa_yaml +doc_to_text: "Bakgrunn: {{fact}}\n\nSpørsmål: {{question_stem}}\nA: {{choices.text[0]}}\nB: {{choices.text[1]}}\nC: {{choices.text[2]}}\nD: {{choices.text[3]}}\n\nSvar:" +doc_to_choice: "{{choices.label}}" diff --git a/lm_eval/tasks/noreval/noropenbookqa/nob/noropenbookqa_nob_p4.yaml b/lm_eval/tasks/noreval/noropenbookqa/nob/noropenbookqa_nob_p4.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ce7b8ae6feeaa4507a82afff7f36917cfbf7675e --- /dev/null +++ b/lm_eval/tasks/noreval/noropenbookqa/nob/noropenbookqa_nob_p4.yaml @@ -0,0 +1,6 @@ +tag: noropenbookqa_nob +task: noropenbookqa_nob_p4 +dataset_name: nb +include: ../_noropenbookqa_yaml +doc_to_text: "Ta utgangspunkt i følgende fakta når du svarer på spørsmålet: {{fact}}\n\n{{question_stem}}\nVelg riktig svar blant disse 
alternativene:\n– {{choices.text[0]}}\n– {{choices.text[1]}}\n– {{choices.text[2]}}\n– {{choices.text[3]}}\n\nSvar:" +doc_to_choice: "{{choices.text}}" diff --git a/lm_eval/tasks/noreval/noropenbookqa/utils.py b/lm_eval/tasks/noreval/noropenbookqa/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c74e93a2e419ff4e6d607ef6931bf89fa729aa01 --- /dev/null +++ b/lm_eval/tasks/noreval/noropenbookqa/utils.py @@ -0,0 +1,5 @@ +import datasets + + +def filter_dataset(dataset: datasets.Dataset) -> datasets.Dataset: + return dataset.filter(lambda example: len(example["fact"]) > 0) diff --git a/lm_eval/tasks/noreval/norquad/_norquad_yaml b/lm_eval/tasks/noreval/norquad/_norquad_yaml new file mode 100644 index 0000000000000000000000000000000000000000..c1a168ce6811ddb47cfe712b534a083109d7093c --- /dev/null +++ b/lm_eval/tasks/noreval/norquad/_norquad_yaml @@ -0,0 +1,25 @@ +tag: norquad +dataset_path: ltg/norquad +output_type: generate_until +training_split: train +validation_split: validation +test_split: test +doc_to_target: '{{answers["text"][0]}}' +process_results: !function utils.process_results +process_docs: !function utils.process_docs +target_delimiter: ' ' +generation_kwargs: + until: + - "\n" + do_sample: false + num_beams: 1 + max_new_tokens: 32 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + - metric: f1 + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/noreval/norquad/norquad_p0.yaml b/lm_eval/tasks/noreval/norquad/norquad_p0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..690a10e8f6b9367c60fa8026cd9df29a6caa4829 --- /dev/null +++ b/lm_eval/tasks/noreval/norquad/norquad_p0.yaml @@ -0,0 +1,3 @@ +task: norquad_p0 +include: _norquad_yaml +doc_to_text: !function utils.p0 diff --git a/lm_eval/tasks/noreval/norquad/norquad_p1.yaml b/lm_eval/tasks/noreval/norquad/norquad_p1.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..640a8d85ba7cc317d1879bfd013ffc862d329ac2 --- /dev/null +++ b/lm_eval/tasks/noreval/norquad/norquad_p1.yaml @@ -0,0 +1,3 @@ +task: norquad_p1 +include: _norquad_yaml +doc_to_text: !function utils.p1 diff --git a/lm_eval/tasks/noreval/norquad/norquad_p2.yaml b/lm_eval/tasks/noreval/norquad/norquad_p2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b18ce5a0feff38bf62f68de342d5fbbd8ed0c44a --- /dev/null +++ b/lm_eval/tasks/noreval/norquad/norquad_p2.yaml @@ -0,0 +1,3 @@ +task: norquad_p2 +include: _norquad_yaml +doc_to_text: !function utils.p2 diff --git a/lm_eval/tasks/noreval/norquad/norquad_p3.yaml b/lm_eval/tasks/noreval/norquad/norquad_p3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5f10683be8638797c6e029cf29bfe9bca1e125f5 --- /dev/null +++ b/lm_eval/tasks/noreval/norquad/norquad_p3.yaml @@ -0,0 +1,3 @@ +task: norquad_p3 +include: _norquad_yaml +doc_to_text: !function utils.p3 diff --git a/lm_eval/tasks/noreval/norquad/norquad_p4.yaml b/lm_eval/tasks/noreval/norquad/norquad_p4.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dc8b42dfde23d29cefbb39efac974110f0f7fc9d --- /dev/null +++ b/lm_eval/tasks/noreval/norquad/norquad_p4.yaml @@ -0,0 +1,3 @@ +task: norquad_p4 +include: _norquad_yaml +doc_to_text: !function utils.p4 diff --git a/lm_eval/tasks/noreval/norquad/utils.py b/lm_eval/tasks/noreval/norquad/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1a072fb858834a8b61f173fd39ff58b63cb3686e --- /dev/null +++ b/lm_eval/tasks/noreval/norquad/utils.py @@ -0,0 +1,62 @@ +import datasets +import transformers.data.metrics.squad_metrics as squad_metrics + + +def process_results(doc, results): + preds = results[0] + reference = doc["answers"]["text"][0] + f1_sum = squad_metrics.compute_f1(reference, preds) + exact_match = squad_metrics.compute_exact(reference, preds) + return {"f1": f1_sum, "exact_match": exact_match} + + 
+def process_docs(dataset: datasets.Dataset): + def _helper(doc): + doc["title"] = doc["context"].strip().split("\n")[0].strip() + doc["passage"] = "\n".join(doc["context"].strip().split("\n")[1:]).strip() + doc["question"] = " ".join(doc["question"].strip().split()) + return doc + + return dataset.map(_helper) + + +def p0(doc): + title = doc["title"] + passage = doc["passage"] + question = doc["question"] + prompt = f"Tittel: {title}\n\nTekst: {passage}\n\nSpørsmål: {question}\n\nSvar:" + return prompt + + +def p1(doc): + title = doc["title"] + passage = doc["passage"] + question = doc["question"] + prompt = f'Tittel: {title}\n\nTekst: {passage}\n\nGitt teksten over, hva er svaret på følgende spørsmål? "{question}"\n\nSvar:' + return prompt + + +def p2(doc): + title = doc["title"] + passage = doc["passage"] + question = doc["question"] + prompt = ( + f"Tittel: {title}\n\nTekst: {passage}\n\nSvar på følgende: {question}\n\nSvar:" + ) + return prompt + + +def p3(doc): + title = doc["title"] + passage = doc["passage"] + question = doc["question"] + prompt = f'Tittel: {title}\n\nTekst: {passage}\n\nHvordan kan man svare på spørsmålet "{question}", gitt teksten over?\n\nSvar:' + return prompt + + +def p4(doc): + title = doc["title"] + passage = doc["passage"] + question = doc["question"] + prompt = f'Tittel: {title}\n\nTekst: {passage}\n\nGitt teksten over, besvar følgende spørsmål: "{question}"\n\nSvar:' + return prompt diff --git a/lm_eval/tasks/noreval/norrewrite-instruct/norrewrite_instruct.yaml b/lm_eval/tasks/noreval/norrewrite-instruct/norrewrite_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e2a9604632eeff5045aff2f6aee3b3cb1d45cf17 --- /dev/null +++ b/lm_eval/tasks/noreval/norrewrite-instruct/norrewrite_instruct.yaml @@ -0,0 +1,20 @@ +task: norrewrite_instruct +dataset_path: ltg/norrewrite-instruct +training_split: null +test_split: test +doc_to_text: "{{prompt}} {{context}}" +doc_to_target: response +output_type:
generate_until +metric_list: + - metric: bleu + higher_is_better: true + - metric: chrf + higher_is_better: true +generation_kwargs: + until: + - "\n" + do_sample: false + num_beams: 1 + max_new_tokens: 256 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/noreval/norsumm/_norsumm_yaml b/lm_eval/tasks/noreval/norsumm/_norsumm_yaml new file mode 100644 index 0000000000000000000000000000000000000000..e6121c038583965065ed40134561deafc2da3177 --- /dev/null +++ b/lm_eval/tasks/noreval/norsumm/_norsumm_yaml @@ -0,0 +1,35 @@ +dataset_path: SamiaT/NorSumm +training_split: null +validation_split: null +test_split: test +num_fewshot: 0 +doc_to_target: summaries +output_type: generate_until +process_results: !function utils.process_results +generation_kwargs: + until: + - "\n" + do_sample: false + num_beams: 1 + max_new_tokens: 256 +metric_list: + - metric: bleu_max + aggregation: mean + higher_is_better: true + - metric: bleu_avg + aggregation: mean + higher_is_better: true + - metric: rougeL_max + aggregation: mean + higher_is_better: true + - metric: rougeL_avg + aggregation: mean + higher_is_better: true + - metric: bertscore_f1_max + aggregation: mean + higher_is_better: true + - metric: bertscore_f1_avg + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/noreval/norsumm/nno/norsumm_nno_p0.yaml b/lm_eval/tasks/noreval/norsumm/nno/norsumm_nno_p0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8f79a7e5324737b0904c18fa9d657654156b59da --- /dev/null +++ b/lm_eval/tasks/noreval/norsumm/nno/norsumm_nno_p0.yaml @@ -0,0 +1,5 @@ +tag: norsumm_nno +dataset_name: nn +task: norsumm_nno_p0 +include: ../_norsumm_yaml +doc_to_text: "Skriv ei oppsummering av følgande artikkel med berre nokre få punkt: {{article}}\nOppsummering:" diff --git a/lm_eval/tasks/noreval/norsumm/nno/norsumm_nno_p1.yaml b/lm_eval/tasks/noreval/norsumm/nno/norsumm_nno_p1.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..e693f3500924db00e8a1e3f51ed842de829f03ea --- /dev/null +++ b/lm_eval/tasks/noreval/norsumm/nno/norsumm_nno_p1.yaml @@ -0,0 +1,5 @@ +tag: norsumm_nno +dataset_name: nn +task: norsumm_nno_p1 +include: ../_norsumm_yaml +doc_to_text: "Oppsummer følgande artikkel med nokre få setningar: {{article}}\nOppsummering:" diff --git a/lm_eval/tasks/noreval/norsumm/nno/norsumm_nno_p2.yaml b/lm_eval/tasks/noreval/norsumm/nno/norsumm_nno_p2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7c2d725c638a16dbdd80587b27d65512f6959c01 --- /dev/null +++ b/lm_eval/tasks/noreval/norsumm/nno/norsumm_nno_p2.yaml @@ -0,0 +1,6 @@ +tag: norsumm_nno +dataset_name: nn +task: norsumm_nno_p2 +include: ../_norsumm_yaml +doc_to_text: "{{article}}\nSkriv ein kort og presis oppsummering av teksten over. Språket må vere klart og lett å forstå. Sørg for å ikkje introdusere feil. Oppsummeringa må dekkje følgande spørsmål: kven, kva, kor, når, og kvifor er denne saka viktig å vite om. Oppsummeringa må vere engasjerande og framheve nøkkelinformasjon frå artikkelen. Oppsummeringa skal innehalde maksimalt 700 tegn, inkludert mellomrom." 
+target_delimiter: "\n" diff --git a/lm_eval/tasks/noreval/norsumm/nno/norsumm_nno_p3.yaml b/lm_eval/tasks/noreval/norsumm/nno/norsumm_nno_p3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a21f8438ed91a72618c7f6984118fd6a517ac6c2 --- /dev/null +++ b/lm_eval/tasks/noreval/norsumm/nno/norsumm_nno_p3.yaml @@ -0,0 +1,6 @@ +tag: norsumm_nno +dataset_name: nn +task: norsumm_nno_p3 +include: ../_norsumm_yaml +doc_to_text: "Gje eit kortfatta samandrag av følgande tekst: {{article}}" +target_delimiter: "\n" diff --git a/lm_eval/tasks/noreval/norsumm/nno/norsumm_nno_p4.yaml b/lm_eval/tasks/noreval/norsumm/nno/norsumm_nno_p4.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1385549e5c57be539f778eb495ce4af1d303eec2 --- /dev/null +++ b/lm_eval/tasks/noreval/norsumm/nno/norsumm_nno_p4.yaml @@ -0,0 +1,5 @@ +tag: norsumm_nno +dataset_name: nn +task: norsumm_nno_p4 +include: ../_norsumm_yaml +doc_to_text: "Lag ein kort oppsummering som samanfattar den følgande teksten i nokre få punkt:\n{{article}}\n\nOppsummering:" diff --git a/lm_eval/tasks/noreval/norsumm/nno/norsumm_nno_p5.yaml b/lm_eval/tasks/noreval/norsumm/nno/norsumm_nno_p5.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8e89e95ca2f3f13b1015f7b3cda3e21441d13b74 --- /dev/null +++ b/lm_eval/tasks/noreval/norsumm/nno/norsumm_nno_p5.yaml @@ -0,0 +1,5 @@ +tag: norsumm_nno +dataset_name: nn +task: norsumm_nno_p5 +include: ../_norsumm_yaml +doc_to_text: "Heile artikkelen:\n{{article}}\n\nHovudpunkt:" diff --git a/lm_eval/tasks/noreval/norsumm/nob/norsumm_nob_p0.yaml b/lm_eval/tasks/noreval/norsumm/nob/norsumm_nob_p0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4489c35506ca6f4e0c1b2384d8571f2862c18ea4 --- /dev/null +++ b/lm_eval/tasks/noreval/norsumm/nob/norsumm_nob_p0.yaml @@ -0,0 +1,5 @@ +tag: norsumm_nob +dataset_name: nb +task: norsumm_nob_p0 +include: ../_norsumm_yaml +doc_to_text: "Skriv en oppsummering av følgende 
artikkel med kun noen få punkter: {{article}}\nOppsummering:" diff --git a/lm_eval/tasks/noreval/norsumm/nob/norsumm_nob_p1.yaml b/lm_eval/tasks/noreval/norsumm/nob/norsumm_nob_p1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2311b177197ed21998cb127758e413088d86e0e7 --- /dev/null +++ b/lm_eval/tasks/noreval/norsumm/nob/norsumm_nob_p1.yaml @@ -0,0 +1,5 @@ +tag: norsumm_nob +dataset_name: nb +task: norsumm_nob_p1 +include: ../_norsumm_yaml +doc_to_text: "Oppsummer følgende artikkel med noen få setninger: {{article}}\nOppsummering:" diff --git a/lm_eval/tasks/noreval/norsumm/nob/norsumm_nob_p2.yaml b/lm_eval/tasks/noreval/norsumm/nob/norsumm_nob_p2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6f56d457cb2cc887f2f12a804e2914294cf786f2 --- /dev/null +++ b/lm_eval/tasks/noreval/norsumm/nob/norsumm_nob_p2.yaml @@ -0,0 +1,6 @@ +tag: norsumm_nob +dataset_name: nb +task: norsumm_nob_p2 +include: ../_norsumm_yaml +doc_to_text: "{{article}}\nSkriv en kort og presis oppsummering av teksten over. Språket må være klart og lett å forstå. Sørg for å ikke introdusere feil. Oppsummeringen må dekke følgende spørsmål: hvem, hva, hvor, når, og hvorfor er denne saken viktig å vite om. Oppsummeringen må være engasjerende og fremheve nøkkelinformasjon fra artikkelen. Oppsummeringen skal inneholde maksimalt 700 tegn, inkludert mellomrom." 
+target_delimiter: "\n" diff --git a/lm_eval/tasks/noreval/norsumm/nob/norsumm_nob_p3.yaml b/lm_eval/tasks/noreval/norsumm/nob/norsumm_nob_p3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..668010386639f34fe96030d28060270ecfabbe63 --- /dev/null +++ b/lm_eval/tasks/noreval/norsumm/nob/norsumm_nob_p3.yaml @@ -0,0 +1,6 @@ +tag: norsumm_nob +dataset_name: nb +task: norsumm_nob_p3 +include: ../_norsumm_yaml +doc_to_text: "Gi et kortfattet sammendrag av følgende tekst: {{article}}" +target_delimiter: "\n" diff --git a/lm_eval/tasks/noreval/norsumm/nob/norsumm_nob_p4.yaml b/lm_eval/tasks/noreval/norsumm/nob/norsumm_nob_p4.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f2425e9284db64b7c444ab848240c67f43a96bee --- /dev/null +++ b/lm_eval/tasks/noreval/norsumm/nob/norsumm_nob_p4.yaml @@ -0,0 +1,5 @@ +tag: norsumm_nob +dataset_name: nb +task: norsumm_nob_p4 +include: ../_norsumm_yaml +doc_to_text: "Lag en kort oppsummering som sammenfatter den følgende teksten i noen få punkter:\n{{article}}\n\nOppsummering:" diff --git a/lm_eval/tasks/noreval/norsumm/nob/norsumm_nob_p5.yaml b/lm_eval/tasks/noreval/norsumm/nob/norsumm_nob_p5.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6a367f79d5507893904f23c52bd136a3905ea25f --- /dev/null +++ b/lm_eval/tasks/noreval/norsumm/nob/norsumm_nob_p5.yaml @@ -0,0 +1,5 @@ +tag: norsumm_nob +dataset_name: nb +task: norsumm_nob_p5 +include: ../_norsumm_yaml +doc_to_text: "Hele artikkelen:\n{{article}}\n\nHovedpunkter:" diff --git a/lm_eval/tasks/noreval/norsumm/utils.py b/lm_eval/tasks/noreval/norsumm/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..24ed9c436c7ebbbed916ccbb00e7c35df828df60 --- /dev/null +++ b/lm_eval/tasks/noreval/norsumm/utils.py @@ -0,0 +1,126 @@ +import datasets +import numpy as np +from evaluate import load + + +try: + import bert_score + import sacrebleu + from rouge_score import rouge_scorer, scoring +except 
ModuleNotFoundError as e: + raise type(e)( + "`sacrebleu`, `bert_score`, and `rouge_score` are required for evaluating the model on NorEval." + ) from e + + +ROUGE_SCORER = None +BERTSCORE = None + + +def process_results(doc, results): + completion = results[0] + references = doc["summaries"] + + bleu_scores = [bleu([[reference]], [completion]) for reference in references] + bleu_max = np.nanmax(bleu_scores) + bleu_avg = np.nanmean(bleu_scores) + + rouge_scores = [rouge([reference], [completion]) for reference in references] + rougeL_scores = [score["rougeLsum"] for score in rouge_scores] + rougeL_max = np.nanmax(rougeL_scores) + rougeL_avg = np.nanmean(rougeL_scores) + + bertscore_f1s = [ + bertscore_f1(references=[reference], predictions=[completion]) + for reference in references + ] + bertscore_f1_max = np.nanmax(bertscore_f1s) + bertscore_f1_avg = np.nanmean(bertscore_f1s) + + return { + "bleu_max": bleu_max, + "bleu_avg": bleu_avg, + "rougeL_max": rougeL_max, + "rougeL_avg": rougeL_avg, + "bertscore_f1_max": bertscore_f1_max, + "bertscore_f1_avg": bertscore_f1_avg, + } + + +def bleu(refs, preds): + """ + Returns `t5` style BLEU scores. See the related implementation: + https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41 + + :param refs: + A `list` of `list` of reference `str`s. + :param preds: + A `list` of predicted `str`s. + """ + score = sacrebleu.corpus_bleu( + preds, + refs, + smooth_method="exp", + smooth_value=0.0, + force=False, + lowercase=False, + tokenize="intl", + use_effective_order=False, + ).score + return score + + +def rouge(refs, preds): + """ + Returns `t5` style ROUGE scores. See the related implementation: + https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68 + + :param refs: + A `list` of reference `strs`. + :param preds: + A `list` of predicted `strs`. 
+ """ + rouge_types = ["rougeLsum"] + + global ROUGE_SCORER + if ROUGE_SCORER is None: + # init RougeScorer once (https://github.com/EleutherAI/lm-evaluation-harness/issues/1692)--rouge_types are constant + ROUGE_SCORER = rouge_scorer.RougeScorer(rouge_types) + scorer = ROUGE_SCORER + + # Add newlines between sentences to correctly compute `rougeLsum`. + + def _prepare_summary(summary): + summary = summary.replace(" . ", ".\n") + return summary + + # Accumulate confidence intervals. + aggregator = scoring.BootstrapAggregator() + for ref, pred in zip(refs, preds): + ref = _prepare_summary(ref) + pred = _prepare_summary(pred) + aggregator.add_scores(scorer.score(ref, pred)) + result = aggregator.aggregate() + return {type: result[type].mid.fmeasure * 100 for type in rouge_types} + + +def bertscore_f1(references, predictions): + """Computes the F1 score of the BERTScore metric. + Args: + references: A list of reference strings. + predictions: A list of predicted strings. + **kwargs: Additional keyword arguments. + Returns: + The F1 score of the BERTScore metric. 
+ """ + global BERTSCORE + if BERTSCORE is None: + # init BERTScore once + BERTSCORE = load("bertscore") + bertscore = BERTSCORE + return bertscore.compute( + predictions=predictions, + references=references, + model_type="bert-base-multilingual-cased", + num_layers=9, + )["f1"][0] diff --git a/lm_eval/tasks/noreval/norsummarize-instruct/norsummarize_instruct.yaml b/lm_eval/tasks/noreval/norsummarize-instruct/norsummarize_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3327a4c3c98902c32888a584145ed206d032031e --- /dev/null +++ b/lm_eval/tasks/noreval/norsummarize-instruct/norsummarize_instruct.yaml @@ -0,0 +1,20 @@ +task: norsummarize_instruct +dataset_path: ltg/norsummarize-instruct +training_split: null +test_split: test +doc_to_text: "{{prompt}} {{context}}" +doc_to_target: response +output_type: generate_until +metric_list: + - metric: bleu + higher_is_better: true + - metric: chrf + higher_is_better: true +generation_kwargs: + until: + - "\n" + do_sample: false + num_beams: 1 + max_new_tokens: 256 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/noreval/nortruthfulqa/generation/_nortruthfulqa_gen_yaml b/lm_eval/tasks/noreval/nortruthfulqa/generation/_nortruthfulqa_gen_yaml new file mode 100644 index 0000000000000000000000000000000000000000..1cf6d57420ee24d651dce02a00bbbb680200efed --- /dev/null +++ b/lm_eval/tasks/noreval/nortruthfulqa/generation/_nortruthfulqa_gen_yaml @@ -0,0 +1,54 @@ +dataset_path: ltg/nortruthfulqa_gen +output_type: generate_until +training_split: null +validation_split: validation +test_split: null +num_fewshot: 0 +doc_to_target: " " +process_docs: !function utils.process_docs +process_results: !function utils.process_results +generation_kwargs: + until: + - "\n" + do_sample: false + num_beams: 1 + max_new_tokens: 64 +metric_list: + - metric: bleu_max + aggregation: mean + higher_is_better: true + - metric: bleu_acc + aggregation: mean + higher_is_better: true + - metric: bleu_diff + aggregation: 
mean + higher_is_better: true + - metric: rouge1_max + aggregation: mean + higher_is_better: true + - metric: rouge1_acc + aggregation: mean + higher_is_better: true + - metric: rouge1_diff + aggregation: mean + higher_is_better: true + - metric: rouge2_max + aggregation: mean + higher_is_better: true + - metric: rouge2_acc + aggregation: mean + higher_is_better: true + - metric: rouge2_diff + aggregation: mean + higher_is_better: true + - metric: rougeL_max + aggregation: mean + higher_is_better: true + - metric: rougeL_acc + aggregation: mean + higher_is_better: true + - metric: rougeL_diff + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/noreval/nortruthfulqa/generation/nno/nortruthfulqa_gen_nno_p0.yaml b/lm_eval/tasks/noreval/nortruthfulqa/generation/nno/nortruthfulqa_gen_nno_p0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..240d5fb252f246aba55a99c3d9fceaf884f5fd3c --- /dev/null +++ b/lm_eval/tasks/noreval/nortruthfulqa/generation/nno/nortruthfulqa_gen_nno_p0.yaml @@ -0,0 +1,5 @@ +tag: nortruthfulqa_gen_nno +task: nortruthfulqa_gen_nno_p0 +dataset_name: nn +include: ../_nortruthfulqa_gen_yaml +doc_to_text: question diff --git a/lm_eval/tasks/noreval/nortruthfulqa/generation/nno/nortruthfulqa_gen_nno_p1.yaml b/lm_eval/tasks/noreval/nortruthfulqa/generation/nno/nortruthfulqa_gen_nno_p1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9412dada87a62e3676e49c5a277b91266773adda --- /dev/null +++ b/lm_eval/tasks/noreval/nortruthfulqa/generation/nno/nortruthfulqa_gen_nno_p1.yaml @@ -0,0 +1,5 @@ +tag: nortruthfulqa_gen_nno +task: nortruthfulqa_gen_nno_p1 +dataset_name: nn +include: ../_nortruthfulqa_gen_yaml +doc_to_text: "Spørsmål: {{question}}\nSvar:" diff --git a/lm_eval/tasks/noreval/nortruthfulqa/generation/nno/nortruthfulqa_gen_nno_p2.yaml b/lm_eval/tasks/noreval/nortruthfulqa/generation/nno/nortruthfulqa_gen_nno_p2.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..be9d318c4371af047f2b6c511b75bbb0cbea9011 --- /dev/null +++ b/lm_eval/tasks/noreval/nortruthfulqa/generation/nno/nortruthfulqa_gen_nno_p2.yaml @@ -0,0 +1,5 @@ +tag: nortruthfulqa_gen_nno +task: nortruthfulqa_gen_nno_p2 +dataset_name: nn +include: ../_nortruthfulqa_gen_yaml +doc_to_text: "Skriv svaret på følgande spørsmål: {{question}}\nSvar:" diff --git a/lm_eval/tasks/noreval/nortruthfulqa/generation/nno/nortruthfulqa_gen_nno_p3.yaml b/lm_eval/tasks/noreval/nortruthfulqa/generation/nno/nortruthfulqa_gen_nno_p3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f8b8a525240a5ddcf54ff41f2e186183b76e3728 --- /dev/null +++ b/lm_eval/tasks/noreval/nortruthfulqa/generation/nno/nortruthfulqa_gen_nno_p3.yaml @@ -0,0 +1,5 @@ +tag: nortruthfulqa_gen_nno +task: nortruthfulqa_gen_nno_p3 +dataset_name: nn +include: ../_nortruthfulqa_gen_yaml +doc_to_text: "{{question}}\nKva er rett svar på spørsmålet?\nSvar:" diff --git a/lm_eval/tasks/noreval/nortruthfulqa/generation/nno/nortruthfulqa_gen_nno_p4.yaml b/lm_eval/tasks/noreval/nortruthfulqa/generation/nno/nortruthfulqa_gen_nno_p4.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5153b878efa3504d63311cbafc06243edf33b1c8 --- /dev/null +++ b/lm_eval/tasks/noreval/nortruthfulqa/generation/nno/nortruthfulqa_gen_nno_p4.yaml @@ -0,0 +1,5 @@ +tag: nortruthfulqa_gen_nno +task: nortruthfulqa_gen_nno_p4 +dataset_name: nn +include: ../_nortruthfulqa_gen_yaml +doc_to_text: "Svar sant på følgande: {{question}}\nSvar:" diff --git a/lm_eval/tasks/noreval/nortruthfulqa/generation/nob/nortruthfulqa_gen_nob_p0.yaml b/lm_eval/tasks/noreval/nortruthfulqa/generation/nob/nortruthfulqa_gen_nob_p0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6231715628e0881bcefcab64a3b17b52aca37c7c --- /dev/null +++ b/lm_eval/tasks/noreval/nortruthfulqa/generation/nob/nortruthfulqa_gen_nob_p0.yaml @@ -0,0 +1,5 @@ +tag: nortruthfulqa_gen_nob +task: 
nortruthfulqa_gen_nob_p0 +dataset_name: nb +include: ../_nortruthfulqa_gen_yaml +doc_to_text: question diff --git a/lm_eval/tasks/noreval/nortruthfulqa/generation/nob/nortruthfulqa_gen_nob_p1.yaml b/lm_eval/tasks/noreval/nortruthfulqa/generation/nob/nortruthfulqa_gen_nob_p1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6633e8cdcdb972feaac379e5aafcf586610ff273 --- /dev/null +++ b/lm_eval/tasks/noreval/nortruthfulqa/generation/nob/nortruthfulqa_gen_nob_p1.yaml @@ -0,0 +1,5 @@ +tag: nortruthfulqa_gen_nob +task: nortruthfulqa_gen_nob_p1 +dataset_name: nb +include: ../_nortruthfulqa_gen_yaml +doc_to_text: "Spørsmål: {{question}}\nSvar:" diff --git a/lm_eval/tasks/noreval/nortruthfulqa/generation/nob/nortruthfulqa_gen_nob_p2.yaml b/lm_eval/tasks/noreval/nortruthfulqa/generation/nob/nortruthfulqa_gen_nob_p2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac7f57da928d3a781ab7c3a945c98b668fb1ba63 --- /dev/null +++ b/lm_eval/tasks/noreval/nortruthfulqa/generation/nob/nortruthfulqa_gen_nob_p2.yaml @@ -0,0 +1,5 @@ +tag: nortruthfulqa_gen_nob +task: nortruthfulqa_gen_nob_p2 +dataset_name: nb +include: ../_nortruthfulqa_gen_yaml +doc_to_text: "Skriv svaret på følgende spørsmål: {{question}}\nSvar:" diff --git a/lm_eval/tasks/noreval/nortruthfulqa/generation/nob/nortruthfulqa_gen_nob_p3.yaml b/lm_eval/tasks/noreval/nortruthfulqa/generation/nob/nortruthfulqa_gen_nob_p3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f196b1010056320b3d31307ad746d536b971f08b --- /dev/null +++ b/lm_eval/tasks/noreval/nortruthfulqa/generation/nob/nortruthfulqa_gen_nob_p3.yaml @@ -0,0 +1,5 @@ +tag: nortruthfulqa_gen_nob +task: nortruthfulqa_gen_nob_p3 +dataset_name: nb +include: ../_nortruthfulqa_gen_yaml +doc_to_text: "{{question}}\nHva er riktig svar på spørsmålet?\nSvar:" diff --git a/lm_eval/tasks/noreval/nortruthfulqa/generation/nob/nortruthfulqa_gen_nob_p4.yaml 
b/lm_eval/tasks/noreval/nortruthfulqa/generation/nob/nortruthfulqa_gen_nob_p4.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2d6deaf17b55b7cac51f58feaf47255727cfac45 --- /dev/null +++ b/lm_eval/tasks/noreval/nortruthfulqa/generation/nob/nortruthfulqa_gen_nob_p4.yaml @@ -0,0 +1,5 @@ +tag: nortruthfulqa_gen_nob +task: nortruthfulqa_gen_nob_p4 +dataset_name: nb +include: ../_nortruthfulqa_gen_yaml +doc_to_text: "Svar sant på følgende: {{question}}\nSvar:" diff --git a/lm_eval/tasks/noreval/nortruthfulqa/generation/utils.py b/lm_eval/tasks/noreval/nortruthfulqa/generation/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5cfe16de6001d42e14a216b5f198287efcda2527 --- /dev/null +++ b/lm_eval/tasks/noreval/nortruthfulqa/generation/utils.py @@ -0,0 +1,152 @@ +import datasets +import numpy as np +import sacrebleu +from rouge_score import rouge_scorer, scoring + + +try: + import sacrebleu + from rouge_score import rouge_scorer, scoring +except ModuleNotFoundError as e: + raise type(e)( + "`sacrebleu` and `rouge_score` are required for evaluating the model on NorEval." + ) from e + + +ROUGE_SCORER = None + + +def preprocess_function(examples): + def _format_answers(answers): + formatted_answers = [] + for answer in answers: + answer = answer.strip() + if len(answer): + # Add a period after all answers. 
+ if answer[-1] != ".": + formatted_answers.append(answer + ".") + else: + formatted_answers.append(answer) + return formatted_answers + + incorrect_answers = _format_answers(examples["incorrect_answers"]) + correct_answers = _format_answers(examples["correct_answers"]) + return { + "question": examples["question"].strip(), + "correct_answers": correct_answers, + "incorrect_answers": incorrect_answers, + } + + +def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: + return dataset.map(preprocess_function) + + +def process_results(doc, results): + completion = results[0] + true_refs, false_refs = doc["correct_answers"], doc["incorrect_answers"] + all_refs = true_refs + false_refs + + # BLEU + bleu_scores = [bleu([[ref]], [completion]) for ref in all_refs] + bleu_correct = np.nanmax(bleu_scores[: len(true_refs)]) + bleu_incorrect = np.nanmax(bleu_scores[len(true_refs) :]) + bleu_max = bleu_correct + bleu_diff = bleu_correct - bleu_incorrect + bleu_acc = int(bleu_correct > bleu_incorrect) + + # ROUGE-N + rouge_scores = [rouge([ref], [completion]) for ref in all_refs] + # ROUGE-1 + rouge1_scores = [score["rouge1"] for score in rouge_scores] + rouge1_correct = np.nanmax(rouge1_scores[: len(true_refs)]) + rouge1_incorrect = np.nanmax(rouge1_scores[len(true_refs) :]) + rouge1_max = rouge1_correct + rouge1_diff = rouge1_correct - rouge1_incorrect + rouge1_acc = int(rouge1_correct > rouge1_incorrect) + # ROUGE-2 + rouge2_scores = [score["rouge2"] for score in rouge_scores] + rouge2_correct = np.nanmax(rouge2_scores[: len(true_refs)]) + rouge2_incorrect = np.nanmax(rouge2_scores[len(true_refs) :]) + rouge2_max = rouge2_correct + rouge2_diff = rouge2_correct - rouge2_incorrect + rouge2_acc = int(rouge2_correct > rouge2_incorrect) + # ROUGE-L + rougeL_scores = [score["rougeLsum"] for score in rouge_scores] + rougeL_correct = np.nanmax(rougeL_scores[: len(true_refs)]) + rougeL_incorrect = np.nanmax(rougeL_scores[len(true_refs) :]) + rougeL_max = rougeL_correct + 
rougeL_diff = rougeL_correct - rougeL_incorrect + rougeL_acc = int(rougeL_correct > rougeL_incorrect) + + return { + "bleu_max": bleu_max, + "bleu_acc": bleu_acc, + "bleu_diff": bleu_diff, + "rouge1_max": rouge1_max, + "rouge1_acc": rouge1_acc, + "rouge1_diff": rouge1_diff, + "rouge2_max": rouge2_max, + "rouge2_acc": rouge2_acc, + "rouge2_diff": rouge2_diff, + "rougeL_max": rougeL_max, + "rougeL_acc": rougeL_acc, + "rougeL_diff": rougeL_diff, + } + + +def bleu(refs, preds): + """ + Returns `t5` style BLEU scores. See the related implementation: + https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41 + + :param refs: + A `list` of `list` of reference `str`s. + :param preds: + A `list` of predicted `str`s. + """ + score = sacrebleu.corpus_bleu( + preds, + refs, + smooth_method="exp", + smooth_value=0.0, + force=False, + lowercase=False, + tokenize="intl", + use_effective_order=False, + ).score + return score + + +def rouge(refs, preds): + """ + Returns `t5` style ROUGE scores. See the related implementation: + https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68 + + :param refs: + A `list` of reference `strs`. + :param preds: + A `list` of predicted `strs`. + """ + rouge_types = ["rouge1", "rouge2", "rougeLsum"] + scorer = rouge_scorer.RougeScorer(rouge_types) + # Add newlines between sentences to correctly compute `rougeLsum`. + + global ROUGE_SCORER + if ROUGE_SCORER is None: + # init RougeScorer once (https://github.com/EleutherAI/lm-evaluation-harness/issues/1692)--rouge_types are constant + ROUGE_SCORER = rouge_scorer.RougeScorer(rouge_types) + scorer = ROUGE_SCORER + + def _prepare_summary(summary): + summary = summary.replace(" . ", ".\n") + return summary + + # Accumulate confidence intervals. 
+ aggregator = scoring.BootstrapAggregator() + for ref, pred in zip(refs, preds): + ref = _prepare_summary(ref) + pred = _prepare_summary(pred) + aggregator.add_scores(scorer.score(ref, pred)) + result = aggregator.aggregate() + return {type: result[type].mid.fmeasure * 100 for type in rouge_types} diff --git a/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/_nortruthfulqa_mc_yaml b/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/_nortruthfulqa_mc_yaml new file mode 100644 index 0000000000000000000000000000000000000000..590dc6085f48242a028c0b24f3b65ce3fe47eba2 --- /dev/null +++ b/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/_nortruthfulqa_mc_yaml @@ -0,0 +1,14 @@ +dataset_path: ltg/nortruthfulqa_mc +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: null +num_fewshot: 0 +doc_to_target: 0 +doc_to_choice: "{{mc1_targets.choices}}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nno/nortruthfulqa_mc_nno_p0.yaml b/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nno/nortruthfulqa_mc_nno_p0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7608b46a9a1309c1d7288045065255dfe03314a1 --- /dev/null +++ b/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nno/nortruthfulqa_mc_nno_p0.yaml @@ -0,0 +1,5 @@ +tag: nortruthfulqa_mc_nno +task: nortruthfulqa_mc_nno_p0 +dataset_name: nn +include: ../_nortruthfulqa_mc_yaml +doc_to_text: !function utils.p0_nn diff --git a/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nno/nortruthfulqa_mc_nno_p1.yaml b/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nno/nortruthfulqa_mc_nno_p1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8b4fae7e58ef01286a3a23babe7086aa1c6cc0ff --- /dev/null +++ b/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nno/nortruthfulqa_mc_nno_p1.yaml @@ -0,0 +1,5 @@ +tag: 
nortruthfulqa_mc_nno +task: nortruthfulqa_mc_nno_p1 +dataset_name: nn +include: ../_nortruthfulqa_mc_yaml +doc_to_text: !function utils.p1_nn diff --git a/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nno/nortruthfulqa_mc_nno_p2.yaml b/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nno/nortruthfulqa_mc_nno_p2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f8d61d464e50afe72926269f77a69bffbd255a52 --- /dev/null +++ b/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nno/nortruthfulqa_mc_nno_p2.yaml @@ -0,0 +1,6 @@ +tag: nortruthfulqa_mc_nno +task: nortruthfulqa_mc_nno_p2 +dataset_name: nn +include: ../_nortruthfulqa_mc_yaml +doc_to_text: !function utils.p2_nn +target_delimiter: "\n" diff --git a/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nno/nortruthfulqa_mc_nno_p3.yaml b/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nno/nortruthfulqa_mc_nno_p3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d93316d34dbe177de0caa74ed8b0774ae8d19336 --- /dev/null +++ b/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nno/nortruthfulqa_mc_nno_p3.yaml @@ -0,0 +1,6 @@ +tag: nortruthfulqa_mc_nno +task: nortruthfulqa_mc_nno_p3 +dataset_name: nn +include: ../_nortruthfulqa_mc_yaml +doc_to_text: !function utils.p3_nn +target_delimiter: "\n" diff --git a/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nno/nortruthfulqa_mc_nno_p4.yaml b/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nno/nortruthfulqa_mc_nno_p4.yaml new file mode 100644 index 0000000000000000000000000000000000000000..74914f80d653697352f9d2546190663b4511bef1 --- /dev/null +++ b/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nno/nortruthfulqa_mc_nno_p4.yaml @@ -0,0 +1,5 @@ +tag: nortruthfulqa_mc_nno +task: nortruthfulqa_mc_nno_p4 +dataset_name: nn +include: ../_nortruthfulqa_mc_yaml +doc_to_text: !function utils.p4_nn diff --git a/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nno/utils.py 
b/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nno/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b66e1feb4f237efc120eb2af686d95b833407683 --- /dev/null +++ b/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nno/utils.py @@ -0,0 +1,35 @@ +def p0_nn(doc): + prompt = "Spørsmål: {question}\n\nSvar:" + return prompt.format(question=doc["question"]) + + +def p1_nn(doc): + prompt = "Spørsmål: {question}\n\nSvaralternativ:{choices}\n\nSvar:" + choices = "".join( + list(map(lambda choice: f"\n- {choice}", doc["mc1_targets"]["choices"])) + ) + return prompt.format(question=doc["question"], choices=choices) + + +def p2_nn(doc): + prompt = "Spørsmål: {question}\n\nKva av følgande alternativ er rett svar på spørsmålet?{choices}" + choices = "".join( + list(map(lambda choice: f"\n- {choice}", doc["mc1_targets"]["choices"])) + ) + return prompt.format(question=doc["question"], choices=choices) + + +def p3_nn(doc): + prompt = "Gitt følgande spørsmål, kva av dei moglege svara under er rett?\nSpørsmål: {question}\n{choices}" + choices = "".join( + list(map(lambda choice: f"\n- {choice}", doc["mc1_targets"]["choices"])) + ) + return prompt.format(question=doc["question"], choices=choices) + + +def p4_nn(doc): + prompt = "{question}\nVel eit av følgande moglege svar:{choices}\n\nSvar:" + choices = "".join( + list(map(lambda choice: f"\n- {choice}", doc["mc1_targets"]["choices"])) + ) + return prompt.format(question=doc["question"], choices=choices) diff --git a/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nob/nortruthfulqa_mc_nob_p0.yaml b/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nob/nortruthfulqa_mc_nob_p0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..36acd76f945a8ae84f477c923ed8b2f1c7bea6a9 --- /dev/null +++ b/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nob/nortruthfulqa_mc_nob_p0.yaml @@ -0,0 +1,5 @@ +tag: nortruthfulqa_mc_nob +task: nortruthfulqa_mc_nob_p0 +dataset_name: nb +include: 
../_nortruthfulqa_mc_yaml +doc_to_text: !function utils.p0_nb diff --git a/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nob/nortruthfulqa_mc_nob_p1.yaml b/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nob/nortruthfulqa_mc_nob_p1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..74b5d1702d2ea5c8e3342e8b834b7281b30ddc8f --- /dev/null +++ b/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nob/nortruthfulqa_mc_nob_p1.yaml @@ -0,0 +1,5 @@ +tag: nortruthfulqa_mc_nob +task: nortruthfulqa_mc_nob_p1 +dataset_name: nb +include: ../_nortruthfulqa_mc_yaml +doc_to_text: !function utils.p1_nb diff --git a/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nob/nortruthfulqa_mc_nob_p2.yaml b/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nob/nortruthfulqa_mc_nob_p2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..30665f9e4164909278c92fc98a794f3269ffa4b5 --- /dev/null +++ b/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nob/nortruthfulqa_mc_nob_p2.yaml @@ -0,0 +1,6 @@ +tag: nortruthfulqa_mc_nob +task: nortruthfulqa_mc_nob_p2 +dataset_name: nb +include: ../_nortruthfulqa_mc_yaml +doc_to_text: !function utils.p2_nb +target_delimiter: "\n" diff --git a/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nob/nortruthfulqa_mc_nob_p3.yaml b/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nob/nortruthfulqa_mc_nob_p3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..40893b52230d6c20d1568633389b2aca3b5c6f01 --- /dev/null +++ b/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nob/nortruthfulqa_mc_nob_p3.yaml @@ -0,0 +1,6 @@ +tag: nortruthfulqa_mc_nob +task: nortruthfulqa_mc_nob_p3 +dataset_name: nb +include: ../_nortruthfulqa_mc_yaml +doc_to_text: !function utils.p3_nb +target_delimiter: "\n" diff --git a/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nob/nortruthfulqa_mc_nob_p4.yaml b/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nob/nortruthfulqa_mc_nob_p4.yaml new file 
mode 100644 index 0000000000000000000000000000000000000000..f0595fdf835bb89b7da31fa7fbfccde6b872b720 --- /dev/null +++ b/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nob/nortruthfulqa_mc_nob_p4.yaml @@ -0,0 +1,5 @@ +tag: nortruthfulqa_mc_nob +task: nortruthfulqa_mc_nob_p4 +dataset_name: nb +include: ../_nortruthfulqa_mc_yaml +doc_to_text: !function utils.p4_nb diff --git a/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nob/utils.py b/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nob/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..08e5cdab333eb46c02a931ac4e35b692b27b2ee1 --- /dev/null +++ b/lm_eval/tasks/noreval/nortruthfulqa/multiple_choice/nob/utils.py @@ -0,0 +1,35 @@ +def p0_nb(doc): + prompt = "Spørsmål: {question}\n\nSvar:" + return prompt.format(question=doc["question"]) + + +def p1_nb(doc): + prompt = "Spørsmål: {question}\n\nSvaralternativer:{choices}\n\nSvar:" + choices = "".join( + list(map(lambda choice: f"\n- {choice}", doc["mc1_targets"]["choices"])) + ) + return prompt.format(question=doc["question"], choices=choices) + + +def p2_nb(doc): + prompt = "Spørsmål: {question}\n\nHvilke av følgende alternativer er riktig svar på spørsmålet?{choices}" + choices = "".join( + list(map(lambda choice: f"\n- {choice}", doc["mc1_targets"]["choices"])) + ) + return prompt.format(question=doc["question"], choices=choices) + + +def p3_nb(doc): + prompt = "Gitt følgende spørsmål, hvilket av de mulige svarene under er riktig?\nSpørsmål: {question}\n{choices}" + choices = "".join( + list(map(lambda choice: f"\n- {choice}", doc["mc1_targets"]["choices"])) + ) + return prompt.format(question=doc["question"], choices=choices) + + +def p4_nb(doc): + prompt = "{question}\nVelg et av følgende mulige svar:{choices}\n\nSvar:" + choices = "".join( + list(map(lambda choice: f"\n- {choice}", doc["mc1_targets"]["choices"])) + ) + return prompt.format(question=doc["question"], choices=choices) diff --git 
a/lm_eval/tasks/noreval/nrk_quiz_qa/_nrk_quiz_qa_yaml b/lm_eval/tasks/noreval/nrk_quiz_qa/_nrk_quiz_qa_yaml new file mode 100644 index 0000000000000000000000000000000000000000..9e7167063b2133b3cfcbeca670fd557c50938b04 --- /dev/null +++ b/lm_eval/tasks/noreval/nrk_quiz_qa/_nrk_quiz_qa_yaml @@ -0,0 +1,16 @@ +dataset_path: ltg/nrk_quiz_qa +output_type: multiple_choice +training_split: null +validation_split: null +test_split: test +num_fewshot: 0 +doc_to_target: "{{choices.label.index(answer)}}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/noreval/nrk_quiz_qa/nno/nrk_quiz_qa_nno_p0.yaml b/lm_eval/tasks/noreval/nrk_quiz_qa/nno/nrk_quiz_qa_nno_p0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e94c9a272039468fba86abff1157abac3dfe37ef --- /dev/null +++ b/lm_eval/tasks/noreval/nrk_quiz_qa/nno/nrk_quiz_qa_nno_p0.yaml @@ -0,0 +1,6 @@ +tag: nrk_quiz_qa_nno +task: nrk_quiz_qa_nno_p0 +dataset_name: nn +include: ../_nrk_quiz_qa_yaml +doc_to_text: !function utils.p0_nn +doc_to_choice: "{{choices.text}}" diff --git a/lm_eval/tasks/noreval/nrk_quiz_qa/nno/nrk_quiz_qa_nno_p1.yaml b/lm_eval/tasks/noreval/nrk_quiz_qa/nno/nrk_quiz_qa_nno_p1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..05d260db892884b614badca7db47c2b71fe5dbb7 --- /dev/null +++ b/lm_eval/tasks/noreval/nrk_quiz_qa/nno/nrk_quiz_qa_nno_p1.yaml @@ -0,0 +1,6 @@ +tag: nrk_quiz_qa_nno +task: nrk_quiz_qa_nno_p1 +dataset_name: nn +include: ../_nrk_quiz_qa_yaml +doc_to_text: !function utils.p1_nn +doc_to_choice: "{{choices.text}}" diff --git a/lm_eval/tasks/noreval/nrk_quiz_qa/nno/nrk_quiz_qa_nno_p2.yaml b/lm_eval/tasks/noreval/nrk_quiz_qa/nno/nrk_quiz_qa_nno_p2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fee4fadb28e1626b60ab877a036957360d0c6be3 --- /dev/null +++ 
b/lm_eval/tasks/noreval/nrk_quiz_qa/nno/nrk_quiz_qa_nno_p2.yaml @@ -0,0 +1,6 @@ +tag: nrk_quiz_qa_nno +task: nrk_quiz_qa_nno_p2 +dataset_name: nn +include: ../_nrk_quiz_qa_yaml +doc_to_text: !function utils.p2_nn +doc_to_choice: "{{choices.label}}" diff --git a/lm_eval/tasks/noreval/nrk_quiz_qa/nno/nrk_quiz_qa_nno_p3.yaml b/lm_eval/tasks/noreval/nrk_quiz_qa/nno/nrk_quiz_qa_nno_p3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac9aafd6caf0f95be2ac9fcfe9558adcd3beb608 --- /dev/null +++ b/lm_eval/tasks/noreval/nrk_quiz_qa/nno/nrk_quiz_qa_nno_p3.yaml @@ -0,0 +1,6 @@ +tag: nrk_quiz_qa_nno +task: nrk_quiz_qa_nno_p3 +dataset_name: nn +include: ../_nrk_quiz_qa_yaml +doc_to_text: !function utils.p3_nn +doc_to_choice: "{{choices.label}}" diff --git a/lm_eval/tasks/noreval/nrk_quiz_qa/nno/nrk_quiz_qa_nno_p4.yaml b/lm_eval/tasks/noreval/nrk_quiz_qa/nno/nrk_quiz_qa_nno_p4.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d095b6f12b1f2f09ed1757e1c2899794f5789302 --- /dev/null +++ b/lm_eval/tasks/noreval/nrk_quiz_qa/nno/nrk_quiz_qa_nno_p4.yaml @@ -0,0 +1,6 @@ +tag: nrk_quiz_qa_nno +task: nrk_quiz_qa_nno_p4 +dataset_name: nn +include: ../_nrk_quiz_qa_yaml +doc_to_text: !function utils.p4_nn +doc_to_choice: "{{choices.text}}" diff --git a/lm_eval/tasks/noreval/nrk_quiz_qa/nno/utils.py b/lm_eval/tasks/noreval/nrk_quiz_qa/nno/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b9dcae1cdce3d57ff2057dc41e17fe5775a9e8b6 --- /dev/null +++ b/lm_eval/tasks/noreval/nrk_quiz_qa/nno/utils.py @@ -0,0 +1,44 @@ +def p0_nn(doc): + prompt = "Spørsmål: {question}\n\nSvar:" + return prompt.format(question=doc["question"]) + + +def p1_nn(doc): + prompt = "{question}\n\nSvaralternativer:{choices}\n\nKva er rett svar?\n\nSvar:" + choices = "".join(list(map(lambda choice: f"\n- {choice}", doc["choices"]["text"]))) + return prompt.format(question=doc["question"], choices=choices) + + +def p2_nn(doc): + prompt = 
"{question}{choices}\n\nEr det rette svaret {enumerated_choices}?\n\nSvar:" + choices = "".join( + [ + f"\n{label}: {option}" + for label, option in zip(doc["choices"]["label"], doc["choices"]["text"]) + ] + ) + enumerated_choices = ", ".join( + doc["choices"]["label"][:-1] + ) + ", eller {latest_choice}".format(latest_choice=doc["choices"]["label"][-1]) + if len(doc["choices"]["label"]) == 2: + enumerated_choices = enumerated_choices.replace(", eller", " eller") + return prompt.format( + question=doc["question"], choices=choices, enumerated_choices=enumerated_choices + ) + + +def p3_nn(doc): + prompt = "Spørsmål: {question}{choices}\n\nSvar:" + choices = "".join( + [ + f"\n{label}: {option}" + for label, option in zip(doc["choices"]["label"], doc["choices"]["text"]) + ] + ) + return prompt.format(question=doc["question"], choices=choices) + + +def p4_nn(doc): + prompt = "{question}\nVel rett svar blant desse alternativa:{choices}\n\nSvar:" + choices = "".join(list(map(lambda choice: f"\n- {choice}", doc["choices"]["text"]))) + return prompt.format(question=doc["question"], choices=choices) diff --git a/lm_eval/tasks/noreval/nrk_quiz_qa/nob/nrk_quiz_qa_nob_p0.yaml b/lm_eval/tasks/noreval/nrk_quiz_qa/nob/nrk_quiz_qa_nob_p0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..688c2ed10352e467d8c10c5af728ef18bd4ae793 --- /dev/null +++ b/lm_eval/tasks/noreval/nrk_quiz_qa/nob/nrk_quiz_qa_nob_p0.yaml @@ -0,0 +1,6 @@ +tag: nrk_quiz_qa_nob +task: nrk_quiz_qa_nob_p0 +dataset_name: nb +include: ../_nrk_quiz_qa_yaml +doc_to_text: !function utils.p0_nb +doc_to_choice: "{{choices.text}}" diff --git a/lm_eval/tasks/noreval/nrk_quiz_qa/nob/nrk_quiz_qa_nob_p1.yaml b/lm_eval/tasks/noreval/nrk_quiz_qa/nob/nrk_quiz_qa_nob_p1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c1920e9519c477bb722e906663cf3a6db87dbff8 --- /dev/null +++ b/lm_eval/tasks/noreval/nrk_quiz_qa/nob/nrk_quiz_qa_nob_p1.yaml @@ -0,0 +1,6 @@ +tag: nrk_quiz_qa_nob +task: 
nrk_quiz_qa_nob_p1 +dataset_name: nb +include: ../_nrk_quiz_qa_yaml +doc_to_text: !function utils.p1_nb +doc_to_choice: "{{choices.text}}" diff --git a/lm_eval/tasks/noreval/nrk_quiz_qa/nob/nrk_quiz_qa_nob_p2.yaml b/lm_eval/tasks/noreval/nrk_quiz_qa/nob/nrk_quiz_qa_nob_p2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5b7746da531f77c4e8c8cd5c3889be22c8eaf1b4 --- /dev/null +++ b/lm_eval/tasks/noreval/nrk_quiz_qa/nob/nrk_quiz_qa_nob_p2.yaml @@ -0,0 +1,6 @@ +tag: nrk_quiz_qa_nob +task: nrk_quiz_qa_nob_p2 +dataset_name: nb +include: ../_nrk_quiz_qa_yaml +doc_to_text: !function utils.p2_nb +doc_to_choice: "{{choices.label}}" diff --git a/lm_eval/tasks/noreval/nrk_quiz_qa/nob/nrk_quiz_qa_nob_p3.yaml b/lm_eval/tasks/noreval/nrk_quiz_qa/nob/nrk_quiz_qa_nob_p3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..394070535e40abe23f51c8d23e124d0a09fafb44 --- /dev/null +++ b/lm_eval/tasks/noreval/nrk_quiz_qa/nob/nrk_quiz_qa_nob_p3.yaml @@ -0,0 +1,6 @@ +tag: nrk_quiz_qa_nob +task: nrk_quiz_qa_nob_p3 +dataset_name: nb +include: ../_nrk_quiz_qa_yaml +doc_to_text: !function utils.p3_nb +doc_to_choice: "{{choices.label}}" diff --git a/lm_eval/tasks/noreval/nrk_quiz_qa/nob/nrk_quiz_qa_nob_p4.yaml b/lm_eval/tasks/noreval/nrk_quiz_qa/nob/nrk_quiz_qa_nob_p4.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0f2e645c41fddf40feec4e69cb0dc489c932a4a9 --- /dev/null +++ b/lm_eval/tasks/noreval/nrk_quiz_qa/nob/nrk_quiz_qa_nob_p4.yaml @@ -0,0 +1,6 @@ +tag: nrk_quiz_qa_nob +task: nrk_quiz_qa_nob_p4 +dataset_name: nb +include: ../_nrk_quiz_qa_yaml +doc_to_text: !function utils.p4_nb +doc_to_choice: "{{choices.text}}" diff --git a/lm_eval/tasks/noreval/nrk_quiz_qa/nob/utils.py b/lm_eval/tasks/noreval/nrk_quiz_qa/nob/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..aac29f6965cbec9903455642243f47450d9a0bda --- /dev/null +++ b/lm_eval/tasks/noreval/nrk_quiz_qa/nob/utils.py @@ -0,0 +1,46 @@ +def 
p0_nb(doc): + prompt = "Spørsmål: {question}\n\nSvar:" + return prompt.format(question=doc["question"]) + + +def p1_nb(doc): + prompt = "{question}\n\nSvaralternativer:{choices}\n\nHva er riktig svar?\n\nSvar:" + choices = "".join(list(map(lambda choice: f"\n- {choice}", doc["choices"]["text"]))) + return prompt.format(question=doc["question"], choices=choices) + + +def p2_nb(doc): + prompt = ( + "{question}{choices}\n\nEr det riktige svaret {enumerated_choices}?\n\nSvar:" + ) + choices = "".join( + [ + f"\n{label}: {option}" + for label, option in zip(doc["choices"]["label"], doc["choices"]["text"]) + ] + ) + enumerated_choices = ", ".join( + doc["choices"]["label"][:-1] + ) + ", eller {latest_choice}".format(latest_choice=doc["choices"]["label"][-1]) + if len(doc["choices"]["label"]) == 2: + enumerated_choices = enumerated_choices.replace(", eller", " eller") + return prompt.format( + question=doc["question"], choices=choices, enumerated_choices=enumerated_choices + ) + + +def p3_nb(doc): + prompt = "Spørsmål: {question}{choices}\n\nSvar:" + choices = "".join( + [ + f"\n{label}: {option}" + for label, option in zip(doc["choices"]["label"], doc["choices"]["text"]) + ] + ) + return prompt.format(question=doc["question"], choices=choices) + + +def p4_nb(doc): + prompt = "{question}\nVelg riktig svar blant disse alternativene:{choices}\n\nSvar:" + choices = "".join(list(map(lambda choice: f"\n- {choice}", doc["choices"]["text"]))) + return prompt.format(question=doc["question"], choices=choices) diff --git a/lm_eval/tasks/noreval/tatoeba/_tatoeba_yaml b/lm_eval/tasks/noreval/tatoeba/_tatoeba_yaml new file mode 100644 index 0000000000000000000000000000000000000000..01b1cfea88e5a05165fa14a0043abc60ffc6547e --- /dev/null +++ b/lm_eval/tasks/noreval/tatoeba/_tatoeba_yaml @@ -0,0 +1,19 @@ +dataset_path: Helsinki-NLP/tatoeba_mt +training_split: validation +test_split: test +output_type: generate_until +dataset_kwargs: + trust_remote_code: true +metric_list: + - metric: 
bleu + higher_is_better: true + - metric: chrf + higher_is_better: true +generation_kwargs: + until: + - "\n" + do_sample: false + num_beams: 1 + max_new_tokens: 256 +metadata: + version: 1.0 diff --git a/lm_eval/tasks/noreval/tatoeba/tatoeba_eng_nno/tatoeba_eng_nno_p0.yaml b/lm_eval/tasks/noreval/tatoeba/tatoeba_eng_nno/tatoeba_eng_nno_p0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c34c861d167615e8a6f3a3d786b3f24de6f93791 --- /dev/null +++ b/lm_eval/tasks/noreval/tatoeba/tatoeba_eng_nno/tatoeba_eng_nno_p0.yaml @@ -0,0 +1,6 @@ +tag: tatoeba_eng_nno +dataset_name: eng-nno +doc_to_target: targetString +task: tatoeba_eng_nno_p0 +include: ../_tatoeba_yaml +doc_to_text: "Engelsk: {{sourceString}}\nNynorsk:" diff --git a/lm_eval/tasks/noreval/tatoeba/tatoeba_eng_nno/tatoeba_eng_nno_p1.yaml b/lm_eval/tasks/noreval/tatoeba/tatoeba_eng_nno/tatoeba_eng_nno_p1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7dca37a83f53f95b41496806c9aedcfb0c8eddc5 --- /dev/null +++ b/lm_eval/tasks/noreval/tatoeba/tatoeba_eng_nno/tatoeba_eng_nno_p1.yaml @@ -0,0 +1,6 @@ +tag: tatoeba_eng_nno +dataset_name: eng-nno +doc_to_target: targetString +task: tatoeba_eng_nno_p1 +include: ../_tatoeba_yaml +doc_to_text: "Omsett følgande setning til nynorsk: {{sourceString}}\nNynorsk:" diff --git a/lm_eval/tasks/noreval/tatoeba/tatoeba_eng_nno/tatoeba_eng_nno_p2.yaml b/lm_eval/tasks/noreval/tatoeba/tatoeba_eng_nno/tatoeba_eng_nno_p2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1e84f18c0144589c3ed864799930298ead8c391d --- /dev/null +++ b/lm_eval/tasks/noreval/tatoeba/tatoeba_eng_nno/tatoeba_eng_nno_p2.yaml @@ -0,0 +1,6 @@ +tag: tatoeba_eng_nno +dataset_name: eng-nno +doc_to_target: targetString +task: tatoeba_eng_nno_p2 +include: ../_tatoeba_yaml +doc_to_text: "Gje ei nynorsk omsetjing av denne setninga: {{sourceString}}\nNynorsk:" diff --git a/lm_eval/tasks/noreval/tatoeba/tatoeba_eng_nno/tatoeba_eng_nno_p3.yaml 
b/lm_eval/tasks/noreval/tatoeba/tatoeba_eng_nno/tatoeba_eng_nno_p3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ccc700269ae431d15f2d31be6ba9fe73d5f76518 --- /dev/null +++ b/lm_eval/tasks/noreval/tatoeba/tatoeba_eng_nno/tatoeba_eng_nno_p3.yaml @@ -0,0 +1,6 @@ +tag: tatoeba_eng_nno +dataset_name: eng-nno +doc_to_target: targetString +task: tatoeba_eng_nno_p3 +include: ../_tatoeba_yaml +doc_to_text: "Kva blir \"{{sourceString}}\" på nynorsk?\nNynorsk:" diff --git a/lm_eval/tasks/noreval/tatoeba/tatoeba_eng_nob/tatoeba_eng_nob_p0.yaml b/lm_eval/tasks/noreval/tatoeba/tatoeba_eng_nob/tatoeba_eng_nob_p0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cfd9c77fec86b806e8050cfd51c294355b17c963 --- /dev/null +++ b/lm_eval/tasks/noreval/tatoeba/tatoeba_eng_nob/tatoeba_eng_nob_p0.yaml @@ -0,0 +1,6 @@ +tag: tatoeba_eng_nob +dataset_name: eng-nob +doc_to_target: targetString +task: tatoeba_eng_nob_p0 +include: ../_tatoeba_yaml +doc_to_text: "Engelsk: {{sourceString}}\nBokmål:" diff --git a/lm_eval/tasks/noreval/tatoeba/tatoeba_eng_nob/tatoeba_eng_nob_p1.yaml b/lm_eval/tasks/noreval/tatoeba/tatoeba_eng_nob/tatoeba_eng_nob_p1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c762b47b338ab6941772bb79069e7714840f46bf --- /dev/null +++ b/lm_eval/tasks/noreval/tatoeba/tatoeba_eng_nob/tatoeba_eng_nob_p1.yaml @@ -0,0 +1,6 @@ +tag: tatoeba_eng_nob +dataset_name: eng-nob +doc_to_target: targetString +task: tatoeba_eng_nob_p1 +include: ../_tatoeba_yaml +doc_to_text: "Oversett følgende setning til norsk bokmål: {{sourceString}}\nBokmål:" diff --git a/lm_eval/tasks/noreval/tatoeba/tatoeba_eng_nob/tatoeba_eng_nob_p2.yaml b/lm_eval/tasks/noreval/tatoeba/tatoeba_eng_nob/tatoeba_eng_nob_p2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d1362c224eb233f0c794bbbe9862728840f57ea1 --- /dev/null +++ b/lm_eval/tasks/noreval/tatoeba/tatoeba_eng_nob/tatoeba_eng_nob_p2.yaml @@ -0,0 +1,6 @@ +tag: 
tatoeba_eng_nob +dataset_name: eng-nob +doc_to_target: targetString +task: tatoeba_eng_nob_p2 +include: ../_tatoeba_yaml +doc_to_text: "Gi en oversettelse til bokmål for denne setningen: {{sourceString}}\nBokmål:" diff --git a/lm_eval/tasks/noreval/tatoeba/tatoeba_eng_nob/tatoeba_eng_nob_p3.yaml b/lm_eval/tasks/noreval/tatoeba/tatoeba_eng_nob/tatoeba_eng_nob_p3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..66bda471f1d3b7246e8dad43a2a008186e8d5bd7 --- /dev/null +++ b/lm_eval/tasks/noreval/tatoeba/tatoeba_eng_nob/tatoeba_eng_nob_p3.yaml @@ -0,0 +1,6 @@ +tag: tatoeba_eng_nob +dataset_name: eng-nob +doc_to_target: targetString +task: tatoeba_eng_nob_p3 +include: ../_tatoeba_yaml +doc_to_text: "Hva blir \"{{sourceString}}\" på bokmål?\nBokmål:" diff --git a/lm_eval/tasks/noreval/tatoeba/tatoeba_nno_eng/tatoeba_nno_eng_p0.yaml b/lm_eval/tasks/noreval/tatoeba/tatoeba_nno_eng/tatoeba_nno_eng_p0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a09be934f3cb3fdf7c79ab9203b3d7d5d78f6011 --- /dev/null +++ b/lm_eval/tasks/noreval/tatoeba/tatoeba_nno_eng/tatoeba_nno_eng_p0.yaml @@ -0,0 +1,6 @@ +tag: tatoeba_nno_eng +dataset_name: eng-nno +doc_to_target: sourceString +task: tatoeba_nno_eng_p0 +include: ../_tatoeba_yaml +doc_to_text: "Nynorsk: {{targetString}}\nEngelsk:" diff --git a/lm_eval/tasks/noreval/tatoeba/tatoeba_nno_eng/tatoeba_nno_eng_p1.yaml b/lm_eval/tasks/noreval/tatoeba/tatoeba_nno_eng/tatoeba_nno_eng_p1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cf69ce7c30fe30de055bf5c3048b28f35d99d13b --- /dev/null +++ b/lm_eval/tasks/noreval/tatoeba/tatoeba_nno_eng/tatoeba_nno_eng_p1.yaml @@ -0,0 +1,6 @@ +tag: tatoeba_nno_eng +dataset_name: eng-nno +doc_to_target: sourceString +task: tatoeba_nno_eng_p1 +include: ../_tatoeba_yaml +doc_to_text: "Omsett følgande setning til engelsk: {{targetString}}\nEngelsk:" diff --git a/lm_eval/tasks/noreval/tatoeba/tatoeba_nno_eng/tatoeba_nno_eng_p2.yaml 
b/lm_eval/tasks/noreval/tatoeba/tatoeba_nno_eng/tatoeba_nno_eng_p2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3f179696d10b9130600dc9c032b62b564981f35d --- /dev/null +++ b/lm_eval/tasks/noreval/tatoeba/tatoeba_nno_eng/tatoeba_nno_eng_p2.yaml @@ -0,0 +1,6 @@ +tag: tatoeba_nno_eng +dataset_name: eng-nno +doc_to_target: sourceString +task: tatoeba_nno_eng_p2 +include: ../_tatoeba_yaml +doc_to_text: "Gje ei engelsk omsetjing av denne setninga: {{targetString}}\nEngelsk:" diff --git a/lm_eval/tasks/noreval/tatoeba/tatoeba_nno_eng/tatoeba_nno_eng_p3.yaml b/lm_eval/tasks/noreval/tatoeba/tatoeba_nno_eng/tatoeba_nno_eng_p3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f5b5583795d0e9676e1c3e2505b11834cdc29bde --- /dev/null +++ b/lm_eval/tasks/noreval/tatoeba/tatoeba_nno_eng/tatoeba_nno_eng_p3.yaml @@ -0,0 +1,6 @@ +tag: tatoeba_nno_eng +dataset_name: eng-nno +doc_to_target: sourceString +task: tatoeba_nno_eng_p3 +include: ../_tatoeba_yaml +doc_to_text: "Kva blir \"{{targetString}}\" på engelsk?\nEngelsk:" diff --git a/lm_eval/tasks/noreval/tatoeba/tatoeba_nob_eng/tatoeba_nob_eng_p0.yaml b/lm_eval/tasks/noreval/tatoeba/tatoeba_nob_eng/tatoeba_nob_eng_p0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9213403f1da6a2c6037d9fe08c07968ab8da5b9c --- /dev/null +++ b/lm_eval/tasks/noreval/tatoeba/tatoeba_nob_eng/tatoeba_nob_eng_p0.yaml @@ -0,0 +1,6 @@ +tag: tatoeba_nob_eng +dataset_name: eng-nob +doc_to_target: sourceString +task: tatoeba_nob_eng_p0 +include: ../_tatoeba_yaml +doc_to_text: "Bokmål: {{targetString}}\nEngelsk:" diff --git a/lm_eval/tasks/noreval/tatoeba/tatoeba_nob_eng/tatoeba_nob_eng_p1.yaml b/lm_eval/tasks/noreval/tatoeba/tatoeba_nob_eng/tatoeba_nob_eng_p1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..efb7c6315d893a21ae80a9bf0c900dd09dcf6b06 --- /dev/null +++ b/lm_eval/tasks/noreval/tatoeba/tatoeba_nob_eng/tatoeba_nob_eng_p1.yaml @@ -0,0 +1,6 @@ +tag: 
tatoeba_nob_eng +dataset_name: eng-nob +doc_to_target: sourceString +task: tatoeba_nob_eng_p1 +include: ../_tatoeba_yaml +doc_to_text: "Oversett følgende setning til engelsk: {{targetString}}\nEngelsk:" diff --git a/lm_eval/tasks/noreval/tatoeba/tatoeba_nob_eng/tatoeba_nob_eng_p2.yaml b/lm_eval/tasks/noreval/tatoeba/tatoeba_nob_eng/tatoeba_nob_eng_p2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c8a17df2dce28670c8e24b833e55da79142e1ca7 --- /dev/null +++ b/lm_eval/tasks/noreval/tatoeba/tatoeba_nob_eng/tatoeba_nob_eng_p2.yaml @@ -0,0 +1,6 @@ +tag: tatoeba_nob_eng +dataset_name: eng-nob +doc_to_target: sourceString +task: tatoeba_nob_eng_p2 +include: ../_tatoeba_yaml +doc_to_text: "Gi en engelsk oversettelse av denne setningen: {{targetString}}\nEngelsk:" diff --git a/lm_eval/tasks/noreval/tatoeba/tatoeba_nob_eng/tatoeba_nob_eng_p3.yaml b/lm_eval/tasks/noreval/tatoeba/tatoeba_nob_eng/tatoeba_nob_eng_p3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..85a29eec508711815259b9823660d8ce7cbe95ba --- /dev/null +++ b/lm_eval/tasks/noreval/tatoeba/tatoeba_nob_eng/tatoeba_nob_eng_p3.yaml @@ -0,0 +1,6 @@ +tag: tatoeba_nob_eng +dataset_name: eng-nob +doc_to_target: sourceString +task: tatoeba_nob_eng_p3 +include: ../_tatoeba_yaml +doc_to_text: "Hva blir \"{{targetString}}\" på engelsk?\nEngelsk:" diff --git a/lm_eval/tasks/paws-x/paws_de.yaml b/lm_eval/tasks/paws-x/paws_de.yaml index 055b373565b3b0de730adaa7b6159a04400bc786..52def7a7e5279dc6a0b3086ff14c7b0ff0f81c15 100644 --- a/lm_eval/tasks/paws-x/paws_de.yaml +++ b/lm_eval/tasks/paws-x/paws_de.yaml @@ -4,4 +4,5 @@ doc_to_choice: '{{[sentence1+", richtig? 
Nein, "+sentence2, sentence1+", richtig "+sentence2]}}' doc_to_text: '' include: pawsx_template_yaml +process_docs: !function utils.process_docs_paraphrases task: paws_de diff --git a/lm_eval/tasks/paws-x/paws_en.yaml b/lm_eval/tasks/paws-x/paws_en.yaml index b5955b037c821bb5e8731037d7fd502799d8f565..fdc0e2ec0348bf91f801730eefa5657cbfce2f4d 100644 --- a/lm_eval/tasks/paws-x/paws_en.yaml +++ b/lm_eval/tasks/paws-x/paws_en.yaml @@ -3,4 +3,5 @@ dataset_name: en doc_to_choice: '{{[sentence1+", right? No, "+sentence2, sentence1+", right? Yes, "+sentence2]}}' doc_to_text: '' include: pawsx_template_yaml +process_docs: !function utils.process_docs_paraphrases task: paws_en diff --git a/lm_eval/tasks/paws-x/paws_es.yaml b/lm_eval/tasks/paws-x/paws_es.yaml index 65189a377c4bbe2fb386f4330f4dd55227254d41..4df52f7c0ef745bb85fff03a7e7223440fd2c388 100644 --- a/lm_eval/tasks/paws-x/paws_es.yaml +++ b/lm_eval/tasks/paws-x/paws_es.yaml @@ -4,4 +4,5 @@ doc_to_choice: '{{[sentence1+", verdad? No, "+sentence2, sentence1+", verdad? S "+sentence2]}}' doc_to_text: '' include: pawsx_template_yaml +process_docs: !function utils.process_docs_paraphrases task: paws_es diff --git a/lm_eval/tasks/paws-x/paws_fr.yaml b/lm_eval/tasks/paws-x/paws_fr.yaml index a8f599a1f3cfe6d8cf1498c23f244146d2eed5b4..7cc55c1e3f83e1ff78ce05b42baad4af05055480 100644 --- a/lm_eval/tasks/paws-x/paws_fr.yaml +++ b/lm_eval/tasks/paws-x/paws_fr.yaml @@ -4,4 +4,5 @@ doc_to_choice: '{{[sentence1+", n''est-ce pas? Non, "+sentence2, sentence1+", n' pas? 
Oui, "+sentence2]}}' doc_to_text: '' include: pawsx_template_yaml +process_docs: !function utils.process_docs_paraphrases task: paws_fr diff --git a/lm_eval/tasks/paws-x/pawsx_template_yaml b/lm_eval/tasks/paws-x/pawsx_template_yaml index 6f82e4a59e3ecd8e1d319893d14da560b3fa8b4d..ccf4c543172b1ba81fc2c982d26d443c7e0ea405 100644 --- a/lm_eval/tasks/paws-x/pawsx_template_yaml +++ b/lm_eval/tasks/paws-x/pawsx_template_yaml @@ -11,6 +11,7 @@ test_split: test doc_to_text: null doc_to_target: label doc_to_choice: null +target_delimiter: "" metric_list: - metric: acc aggregation: mean diff --git a/lm_eval/tasks/paws-x/utils.py b/lm_eval/tasks/paws-x/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6f5a077a4dce98d4b0fb0e022ae0d3d7df235074 --- /dev/null +++ b/lm_eval/tasks/paws-x/utils.py @@ -0,0 +1,43 @@ +import re + + +def general_detokenize(string): + string = string.replace(" n't", "n't") + string = string.replace(" )", ")") + string = string.replace("( ", "(") + string = string.replace('" ', '"') + string = string.replace(' "', '"') + string = re.sub(r" (['.,])", r"\1", string) + return string + + +def lowercase_first_letter(text): + return text[0].lower() + text[1:] + + +def process_docs_paraphrases(dataset): + empty_docs = [] + + def _process_doc(doc): + if doc["sentence1"] not in [None, ""] and doc["sentence2"] not in [None, ""]: + doc["sentence1"] = general_detokenize(doc["sentence1"]).strip() + doc["sentence2"] = general_detokenize(doc["sentence2"]).strip() + # Remove final punctuation mark in the first sentence + if doc["sentence1"].endswith((".", ",", ";")): + doc["sentence1"] = doc["sentence1"][:-1] + # Start the second sentence in lowercase (to be used after "Yes, ...") + doc["sentence2"] = lowercase_first_letter(doc["sentence2"]) + return doc + else: + empty_docs.append(doc) + return doc + + if empty_docs != []: + len_empty_docs = len(empty_docs) + print( + f"Found {len_empty_docs} empty documents out of the {len(dataset)} total 
docs in the dataset: {empty_docs}" + ) + return dataset.filter( + lambda doc: doc["sentence1"] not in [None, ""] + and doc["sentence2"] not in [None, ""] + ).map(_process_doc) diff --git a/lm_eval/tasks/realtoxicityprompts/metric.py b/lm_eval/tasks/realtoxicityprompts/metric.py index 552cd33ddcb2bc5ca0fc923e35c29e5148457738..9e7d0958a25e28d75c1bc148adf82400f61fc376 100644 --- a/lm_eval/tasks/realtoxicityprompts/metric.py +++ b/lm_eval/tasks/realtoxicityprompts/metric.py @@ -1,10 +1,12 @@ import json +import logging import os import requests from requests.adapters import HTTPAdapter, Retry -from lm_eval.utils import eval_logger + +eval_logger = logging.getLogger(__name__) def toxicity_perspective_api( diff --git a/lm_eval/tasks/score/agi_eval/utils_agieval.py b/lm_eval/tasks/score/agi_eval/utils_agieval.py index 4381a2cb34c771f65b4c76240def2c29bd3a314f..6539d05dcc0256fcf32c58d57afc58c108297b20 100644 --- a/lm_eval/tasks/score/agi_eval/utils_agieval.py +++ b/lm_eval/tasks/score/agi_eval/utils_agieval.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging import os import re from functools import partial @@ -22,9 +23,10 @@ from datasets import Dataset from lm_eval.tasks.score import utils from lm_eval.tasks.score.utils import prompt_consistency_rate, robustness_doc_to_text -from lm_eval.utils import eval_logger +eval_logger = logging.getLogger(__name__) + TEMPLATE_FILE_PATH = os.path.join(os.path.dirname(__file__), "prompt_templates.json") PROMPT_ROBUSTNESS_TEMPLATE_KEY = "prompt_robustness" diff --git a/lm_eval/tasks/score/math/utils_math.py b/lm_eval/tasks/score/math/utils_math.py index 4068b179882718afe79dd61a967b4c531ee0e58b..3750b9853e1310d69faa89ef3addb26bd17b8fb1 100644 --- a/lm_eval/tasks/score/math/utils_math.py +++ b/lm_eval/tasks/score/math/utils_math.py @@ -13,6 +13,7 @@ # limitations under the License. 
import json +import logging import os from functools import partial from itertools import combinations @@ -28,9 +29,10 @@ from lm_eval.tasks.score.math.math_grader import ( normalize_answer_string, ) from lm_eval.tasks.score.utils import robustness_doc_to_text -from lm_eval.utils import eval_logger +eval_logger = logging.getLogger(__name__) + TEMPLATE_FILE_PATH = os.path.join(os.path.dirname(__file__), "prompt_templates.json") PROMPT_ROBUSTNESS_TEMPLATE_KEY = "prompt_robustness" diff --git a/lm_eval/tasks/score/mmlu_pro/utils_mmlu_pro.py b/lm_eval/tasks/score/mmlu_pro/utils_mmlu_pro.py index da46e10170f88820c290ecbb2154058dd07ab8a7..681c76f54d2852ebbb1fe31ac3ec49046cc0d1c9 100644 --- a/lm_eval/tasks/score/mmlu_pro/utils_mmlu_pro.py +++ b/lm_eval/tasks/score/mmlu_pro/utils_mmlu_pro.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging import os from functools import partial from typing import Any, Dict, List @@ -20,9 +21,10 @@ import numpy as np from lm_eval.tasks.score import utils from lm_eval.tasks.score.utils import prompt_consistency_rate, robustness_doc_to_text -from lm_eval.utils import eval_logger +eval_logger = logging.getLogger(__name__) + TEMPLATE_FILE_PATH = os.path.join(os.path.dirname(__file__), "prompt_templates.json") PROMPT_ROBUSTNESS_TEMPLATE_KEY = "prompt_robustness" diff --git a/lm_eval/tasks/score/utils.py b/lm_eval/tasks/score/utils.py index 61d7e3b035c8a149369ccf5617a0d86e4a75c6d5..9a9951861c7b7d1eee4009bef61cce49125e17fe 100644 --- a/lm_eval/tasks/score/utils.py +++ b/lm_eval/tasks/score/utils.py @@ -14,6 +14,7 @@ import copy import json +import logging import re import string import sys @@ -24,7 +25,8 @@ from typing import Any, Dict, List import numpy as np from datasets import Dataset -from lm_eval.utils import eval_logger + +eval_logger = logging.getLogger(__name__) NUMERALS = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"] diff --git 
a/lm_eval/tasks/squad_completion/task.py b/lm_eval/tasks/squad_completion/task.py index d8eb467457ef75fcff7edaaaad748fc567a282c3..79990adeff38b6c13519f770eec4cf95d65de304 100644 --- a/lm_eval/tasks/squad_completion/task.py +++ b/lm_eval/tasks/squad_completion/task.py @@ -1,8 +1,8 @@ import re +from copy import deepcopy from typing import List import numpy as np -from deepcopy import deepcopy from lm_eval.api.instance import Instance from lm_eval.api.task import ConfigurableTask diff --git a/lm_eval/tasks/unitxt/task.py b/lm_eval/tasks/unitxt/task.py index d8e7edfc8aa5bddc7306fc76930eda8e4431f2f7..5c75cf57aa118190ab5fc1c2cd51468154c000e2 100644 --- a/lm_eval/tasks/unitxt/task.py +++ b/lm_eval/tasks/unitxt/task.py @@ -105,7 +105,7 @@ class Unitxt(ConfigurableTask): return False def doc_to_target(self, doc): - doc["target"] + return doc["target"] def get_arguments(self, doc, ctx): return (ctx, {"until": ["\n"]}) @@ -120,8 +120,7 @@ class Unitxt(ConfigurableTask): chat_template: Optional[Callable] = None, gen_prefix: Optional[str] = None, ) -> str: - source = self.doc_to_text(doc) - if isinstance(source, list): + if isinstance(self.doc_to_text(doc), list): if apply_chat_template: formated_source = chat_template(self.doc_to_text(doc)) return formated_source @@ -130,7 +129,15 @@ class Unitxt(ConfigurableTask): "Got chat template format from Unitxt, but apply_chat_template is false. Add '--apply_chat_template' to command line." 
) else: - return source + return super().fewshot_context( + doc=doc, + num_fewshot=num_fewshot, + system_instruction=system_instruction, + apply_chat_template=apply_chat_template, + fewshot_as_multiturn=fewshot_as_multiturn, + chat_template=chat_template, + gen_prefix=gen_prefix, + ) def construct_requests(self, doc, ctx, **kwargs): """Uses RequestFactory to construct Requests and returns an iterable of diff --git a/pyproject.toml b/pyproject.toml index bd77bcc4989c28c7476fcf6c6a653397221a5628..6c8b4403e7a8134cb8f605a836c2712bc743c8bd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "lm_eval" -version = "0.4.8" +version = "0.4.9" authors = [ {name="EleutherAI", email="contact@eleuther.ai"} ] @@ -57,10 +57,11 @@ Homepage = "https://github.com/EleutherAI/lm-evaluation-harness" Repository = "https://github.com/EleutherAI/lm-evaluation-harness" [project.optional-dependencies] +acpbench = ["lark>=1.1.9", "tarski[clingo]==0.8.2", "pddl==0.4.2", "kstar-planner==1.4.2"] api = ["requests", "aiohttp", "tenacity", "tqdm", "tiktoken"] audiolm_qwen = ["librosa", "soundfile"] deepsparse = ["deepsparse-nightly[llm]>=1.8.0.20240404"] -dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy", "unitxt", "requests", "aiohttp", "tenacity", "tqdm", "tiktoken", "sentencepiece"] +dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy", "unitxt==1.22.0", "requests", "aiohttp", "tenacity", "tqdm", "tiktoken", "sentencepiece"] gptq = ["auto-gptq[triton]>=0.6.0"] gptqmodel = ["gptqmodel>=1.0.9"] hf_transfer = ["hf_transfer"] @@ -85,6 +86,7 @@ vllm = ["vllm>=0.4.2"] wandb = ["wandb>=0.16.3", "pandas", "numpy"] zeno = ["pandas", "zeno-client"] all = [ + "lm_eval[acpbench]", "lm_eval[api]", "lm_eval[audiolm_qwen]", "lm_eval[deepsparse]", diff --git a/scripts/cost_estimate.py b/scripts/cost_estimate.py deleted file mode 100644 index 
baf81147547b0a7a92e52904c70cb11d246f680b..0000000000000000000000000000000000000000 --- a/scripts/cost_estimate.py +++ /dev/null @@ -1,99 +0,0 @@ -import random - -import transformers - -from lm_eval import evaluator, tasks -from lm_eval.api.model import LM - - -class DryrunLM(LM): - def __init__(self): - self.tokencost = 0 - self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2") - self.tokenizer.pad_token = "<|endoftext|>" - - @classmethod - def create_from_arg_string(cls, arg_string): - return cls() - - def loglikelihood(self, requests): - res = [] - - for ctx, cont in requests: - res.append((-random.random(), False)) - self.tokencost += len(self.tokenizer.tokenize(ctx + cont)) - - return res - - def generate_until(self, requests): - res = [] - - for ctx, _ in requests: - res.append("lol") - - # assume worst case - generates until 256 - self.tokencost += len(self.tokenizer.tokenize(ctx)) + 256 - - return res - - def loglikelihood_rolling(self, requests): - res = [] - - for (s,) in requests: - # assume worst case: extra full context - self.tokencost += len(self.tokenizer.tokenize(s)) + 2048 - - return res - - -def main(): - lm = DryrunLM() - - task_list = "arc_challenge,arc_easy,boolq,cola,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,record,rte,sciq,sst,triviaqa,webqs,wic,wikitext,winogrande,wnli,wsc" - values = [] - for taskname in task_list.split(","): - lm.tokencost = 0 - evaluator.simple_evaluate( - lm=lm, - task_dict={taskname: tasks.get_task(taskname)()}, - num_fewshot=0, - limit=None, - bootstrap_iters=10, - ) - - print(taskname, lm.tokencost) - values.append( - [ - taskname, - lm.tokencost, - lm.tokencost / 1000 * 0.0008, - lm.tokencost / 1000 * 0.0012, - lm.tokencost / 1000 * 0.006, - lm.tokencost / 1000 * 0.06, - ] - ) - from pytablewriter import MarkdownTableWriter - - writer = MarkdownTableWriter() - writer.headers = ["Task", "Tokens", "Ada", "Babbage", "Curie", "Davinci"] 
- - values.sort(key=lambda x: -x[1]) - totcost = sum([x[1] for x in values]) - values.append( - [ - "**Total**", - totcost, - totcost / 1000 * 0.0008, - totcost / 1000 * 0.0012, - totcost / 1000 * 0.006, - totcost / 1000 * 0.06, - ] - ) - - writer.value_matrix = values - - print(writer.dumps()) - - -if __name__ == "__main__": - main() diff --git a/tests/test_metrics.py b/tests/test_metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..1976a497ce045d74af0d5a6a04f7306f95bbb266 --- /dev/null +++ b/tests/test_metrics.py @@ -0,0 +1,156 @@ +from lm_eval.api.task import ConfigurableTask, TaskConfig + + +class MockConfigurableTask(ConfigurableTask): + """Mock task for testing metrics""" + + def __init__(self): + # Create a minimal config + config = { + "task": "test_acc_mutual_info", + "output_type": "multiple_choice", + "metric_list": [{"metric": "acc"}, {"metric": "acc_mutual_info"}], + "doc_to_choice": ["A", "B", "C"], + "doc_to_target": 1, # Correct answer is index 1 (choice "B") + "target_delimiter": " ", + } + + # Initialize with minimal setup + self._config = TaskConfig(**config) + self.OUTPUT_TYPE = "multiple_choice" + + # Set up required attributes + self.multiple_input = 0 + self.multiple_target = 0 + + # Set up metrics + self._metric_fn_list = {"acc": None, "acc_mutual_info": None} + self._metric_fn_kwargs = {"acc": {}, "acc_mutual_info": {}} + self._aggregation_list = {} + self._higher_is_better = {} + + def doc_to_choice(self, doc): + return ["A", "B", "C"] + + def doc_to_target(self, doc): + return 1 # Choice "B" is correct + + # Required abstract methods (minimal implementations) + def has_training_docs(self): + return False + + def has_validation_docs(self): + return False + + def has_test_docs(self): + return True + + def download(self, **kwargs): + pass + + +def test_acc_mutual_info_slicing(): + """Test that acc_mutual_info correctly slices conditional and unconditional loglikelihoods""" + + task = MockConfigurableTask() + + # 
Simulate loglikelihood results for 3 choices + # Format: [(loglikelihood, is_greedy), ...] + # First 3 are conditional P(choice|context), next 3 are unconditional P(choice) + + # Combined results as they would come from the model + # Order: conditional_1, conditional_2, conditional_3, unconditional_1, unconditional_2, unconditional_3 + # Conditional: [-2.0, -1.0, -3.0] - Choice B (index 1) has highest prob + # Unconditional: [-2.5, -2.0, -2.5] - Choice B has higher unconditional prob too + results = [ + (-2.0, False), + (-1.0, True), + (-3.0, False), # Conditional + (-2.5, False), + (-2.0, False), + (-2.5, False), + ] # Unconditional + + # Test the process_results method + doc = {} # Mock document + result_dict = task.process_results(doc, results) + + # Verify that both acc and acc_mutual_info are calculated + assert "acc" in result_dict + assert "acc_mutual_info" in result_dict + + # Both should be 1.0 since choice B (index 1) is correct and has highest probability + assert result_dict["acc"] == 1.0, f"Expected acc=1.0, got {result_dict['acc']}" + assert result_dict["acc_mutual_info"] == 1.0, ( + f"Expected acc_mutual_info=1.0, got {result_dict['acc_mutual_info']}" + ) + + +def test_acc_mutual_info_different_predictions(): + """Test case where conditional and mutual info predictions differ""" + + task = MockConfigurableTask() + + # Mutual info calculation: + # Conditional: A=-1.0, B=-2.0, C=-3.0 (A wins conditionally) + # Unconditional: A=-0.5, B=-2.0, C=-3.0 (A has much higher unconditional prob) + # Mutual info = conditional - unconditional: + # A: -1.0 - (-0.5) = -0.5 + # B: -2.0 - (-2.0) = 0.0 <- B wins with mutual info! 
+ # C: -3.0 - (-3.0) = 0.0 + + results = [ + (-1.0, True), + (-2.0, False), + (-3.0, False), # Conditional (A wins) + (-0.5, False), + (-2.0, False), + (-3.0, False), + ] # Unconditional + + doc = {} + result_dict = task.process_results(doc, results) + + # Regular acc should be 0.0 (A predicted, but B is correct) + assert result_dict["acc"] == 0.0, f"Expected acc=0.0, got {result_dict['acc']}" + + # Mutual info should be 1.0 (B predicted with mutual info, and B is correct) + assert result_dict["acc_mutual_info"] == 1.0, ( + f"Expected acc_mutual_info=1.0, got {result_dict['acc_mutual_info']}" + ) + + +def test_acc_mutual_info_without_metric(): + """Test that normal behavior works when acc_mutual_info is not in metric list""" + + # Create task without acc_mutual_info + config = { + "task": "test_normal", + "output_type": "multiple_choice", + "metric_list": [{"metric": "acc"}], # Only acc, no acc_mutual_info + "doc_to_choice": ["A", "B", "C"], + "doc_to_target": 1, + "target_delimiter": " ", + } + + task = MockConfigurableTask() + task._config = TaskConfig(**config) + task._metric_fn_list = {"acc": None} # Only acc + + # Only conditional loglikelihoods (no unconditional since acc_mutual_info not requested) + results = [(-2.0, False), (-1.0, True), (-3.0, False)] # 3 choices, B wins + + doc = {} + result_dict = task.process_results(doc, results) + + # Should only have acc, not acc_mutual_info + assert "acc" in result_dict + assert "acc_mutual_info" not in result_dict + assert result_dict["acc"] == 1.0 + + +if __name__ == "__main__": + test_acc_mutual_info_slicing() + test_acc_mutual_info_different_predictions() + test_acc_mutual_info_without_metric() + print("All tests passed!")