"vscode:/vscode.git/clone" did not exist on "6623898198fece95e6c5cef7ad55d9d1ecd1da1f"
Commit 16bc6bc0 authored by haileyschoelkopf

merge conflicts

parents 3d7f777d 465c695b
......@@ -8,10 +8,11 @@ on:
branches:
- big-refactor
workflow_dispatch:
# comment/edit out the above to stop/change the triggers
jobs:
changed_files:
runs-on: ubuntu-latest # windows-latest || macos-latest
timeout-minutes: 120
name: Scan for changed tasks
steps:
- name: checkout
......@@ -19,11 +20,15 @@ jobs:
with:
fetch-depth: 0 # OR "2" -> To retrieve the preceding commit.
# Example 1
# Uses the tj-actions/changed-files@v37 action to check for changes.
# Outputs provided here: https://github.com/tj-actions/changed-files#outputs
# The `files_yaml` input optionally takes a yaml string to specify filters,
# and prepends the filter name to the standard output names.
- name: Check task folders
id: changed-tasks
uses: tj-actions/changed-files@v37.1.2
with:
# tasks checks the tasks folder and api checks the api folder for changes
files_yaml: |
tasks:
- lm_eval/tasks/**
......@@ -31,31 +36,35 @@ jobs:
- lm_eval/api/**
write_output_files: true
# The next step is optional; the files are written to the workspace by default (above).
# so it's just for debugging
- name: Run Tests
if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
run: |
echo .github/outputs/tasks_all_changed_and_modified_files.txt >> 'GITHUB_ENV'
echo "One or more test file(s) has changed."
echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}"
- name: Set up Python 3.9
if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
uses: actions/setup-python@v4
with:
python-version: 3.9
cache: 'pip'
- name: Install dependencies
if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
run: |
python -m pip install --upgrade pip
pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
# Install optional git dependencies
# pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
# if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
# Install optional git dependencies
# pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
# if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Test with pytest
# if new tasks are added, run tests on them
if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
run: python -m pytest tests/test_tasks.py -s -vv -n=auto --new_task
run: python -m pytest tests/extra/test_new_tasks.py -s -vv -n=auto
# if api is modified, run tests on it
- name: Test more tasks with pytest
env:
API: true
if: steps.changed-tasks.outputs.api_any_modified == 'true'
run: python -m pytest tests/test_api.py -s -vv -n=auto --new_task
run: python -m pytest tests/extra/test_new_tasks.py -s -vv -n=auto
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
# just comment out unwanted steps to turn off the test.
name: Unit Tests
on:
......@@ -11,7 +11,8 @@ on:
branches:
- big-refactor
workflow_dispatch:
# Jobs run concurrently and steps run sequentially within a job.
# jobs: linter and cpu_tests. Add more jobs/steps as required.
jobs:
linter:
name: Linters
......@@ -35,9 +36,10 @@ jobs:
flake8 . --count --select=F,E9,E71,E72,E501,E112,E113,W6 --extend-ignore=F541 --show-source --statistics --exit-zero
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Lint with mypy
run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
# mypy turned off for now
# - name: Lint with mypy
# run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
# Job 2
testcpu:
name: CPU Tests
runs-on: ubuntu-latest
......@@ -53,7 +55,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
pip install -e '.[testing,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu
# Install optional git dependencies
# pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
# if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
......
......@@ -3,6 +3,15 @@ env
data/
lm_cache
.idea
*.egg-info/
build
dist
*.egg-info
venv
.vscode/
temp
__pycache__
.ipynb_checkpoints
temp
# IPython
profile_default/
ipython_config.py
......@@ -119,6 +119,9 @@ class TaskConfig(dict):
def __getitem__(self, item):
return getattr(self, item)
def __setitem__(self, item, value):
return setattr(self, item, value)
def to_dict(self):
"""dumps the current config as a dictionary object, as a printable format.
null fields will not be printed.
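With `__getitem__` and `__setitem__` both defined, a `TaskConfig` can be read and written with dict syntax, which the evaluator change later in this commit relies on when it overrides `num_fewshot`. A minimal sketch of that access pattern (import path and constructor keyword are assumptions, not shown in this diff):
```python
from lm_eval.api.task import TaskConfig  # assumed module path

# __getitem__/__setitem__ forward to getattr/setattr, so attribute access
# and dict-style access stay in sync.
config = TaskConfig(num_fewshot=0)   # constructor kwarg assumed to exist
assert config["num_fewshot"] == 0    # reads via getattr
config["num_fewshot"] = 5            # writes via setattr
assert config.num_fewshot == 5
```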
......@@ -460,6 +463,9 @@ class Task(abc.ABC):
return labeled_examples + example
elif type(example) == list:
return [labeled_examples + ex for ex in example]
elif type(example) == int:
choices = self.doc_to_choice(doc)
return labeled_examples + choices[example]
def apply_filters(self):
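The new `int` branch covers configs whose document formatting resolves to an index into the document's choices, as the xwinograd configuration later in this commit does. A small worked illustration with invented values:
```python
# Illustrative only: mirrors the branch above. When `example` is an int,
# the indexed choice string is appended instead of the raw integer.
labeled_examples = "Question: is the sky blue?\nAnswer: yes\n\n"  # fewshot prefix
choices = ["no", "yes"]   # stand-in for self.doc_to_choice(doc)
example = 1               # stand-in for an int example
context = labeled_examples + choices[example]
# context ends with the selected choice string, "yes"
```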
......@@ -944,22 +950,21 @@ class ConfigurableTask(Task):
if self.multiple_target:
acc = 1.0 if pred in gold else 0.0
acc_norm = 1.0 if pred_norm in gold else 0.0
exact_match = int(any([is_greedy[i] for i in gold]))
else:
acc = 1.0 if pred == gold else 0.0
acc_norm = 1.0 if pred_norm == gold else 0.0
# TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
exact_match = int(is_greedy[gold])
result_dict = {
**({"acc": acc} if "acc" in use_metric else {}),
**({"f1": (gold, pred)} if "f1" in use_metric else {}),
**({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
**({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
**({"exact_match": exact_match} if "exact_match" in use_metric else {}),
}
if "exact_match" in self._metric_fn_list.keys():
# TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
is_greedy = is_greedy[gold] # take value for the gold answer
result_dict["exact_match"] = int(is_greedy)
if "acc_mutual_info" in use_metric:
lls_mutual_info = [
ll_c - ll_u for ll_c, ll_u in zip(lls, lls_unconditional)
......
......@@ -35,7 +35,7 @@ def simple_evaluate(
model,
model_args=None,
tasks=[],
num_fewshot=0,
num_fewshot=None,
batch_size=None,
max_batch_size=None,
device=None,
......@@ -112,7 +112,17 @@ def simple_evaluate(
+ "_rank" + str(lm.rank) + ".db",
)
task_dict = lm_eval.tasks.get_task_dict(tasks, num_fewshot=num_fewshot)
task_dict = lm_eval.tasks.get_task_dict(tasks)
for task_name in task_dict.keys():
config = task_dict[task_name]._config
if num_fewshot is not None:
if config["num_fewshot"] > 0:
default_num_fewshot = config["num_fewshot"]
eval_logger.warning(
f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
)
task_dict[task_name]._config["num_fewshot"] = num_fewshot
if check_integrity:
run_task_tests(task_list=tasks)
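`num_fewshot` now defaults to `None`, meaning "keep each task's configured value"; passing an int overrides every requested task and logs the warning added above. A hedged call sketch (module path, model registry name, and arguments are illustrative, not taken from this diff):
```python
from lm_eval.evaluator import simple_evaluate  # assumed module path

# Illustrative: num_fewshot=None keeps the per-task YAML defaults;
# num_fewshot=5 overwrites each task's configured value (with a warning
# for tasks that already set a nonzero default).
results = simple_evaluate(
    model="hf",                                     # registry name, illustrative
    model_args="pretrained=EleutherAI/pythia-70m",  # illustrative model
    tasks=["webqs"],
    num_fewshot=5,
)
```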
......@@ -134,7 +144,6 @@ def simple_evaluate(
if isinstance(model, str)
else model.model.config._name_or_path,
"model_args": model_args,
"num_fewshot": num_fewshot,
"batch_size": batch_size,
"batch_sizes": list(lm.batch_sizes.values())
if hasattr(lm, "batch_sizes")
......@@ -169,8 +178,6 @@ def evaluate(
Language Model
:param task_dict: dict[str, Task]
Dictionary of tasks. Tasks will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
:param num_fewshot: int
Number of examples in few-shot context
:param limit: int, optional
Limit the number of examples per task (only use this for testing)
:param bootstrap_iters:
......
......@@ -3,21 +3,28 @@ from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from tqdm import tqdm
import time
import anthropic
from lm_eval.logger import eval_logger
from typing import List, Literal, Any
def anthropic_completion(
client, model, prompt, max_tokens_to_sample, temperature, stop
client: anthropic.Anthropic,
model: str,
prompt: str,
max_tokens_to_sample: int,
temperature: float,
stop: List[str],
**kwargs: Any,
):
"""Query Anthropic API for completion.
Retry with back-off until they respond
"""
import anthropic
backoff_time = 3
while True:
try:
response = client.completion(
response = client.completions.create(
prompt=f"{anthropic.HUMAN_PROMPT} {prompt}{anthropic.AI_PROMPT}",
model=model,
# NOTE: Claude really likes to do CoT, and overly aggressive stop sequences
......@@ -25,36 +32,53 @@ def anthropic_completion(
stop_sequences=[anthropic.HUMAN_PROMPT] + stop,
max_tokens_to_sample=max_tokens_to_sample,
temperature=temperature,
**kwargs,
)
return response.completion
except anthropic.RateLimitError as e:
eval_logger.warning(
f"RateLimitError occurred: {e.__cause__}\n Retrying in {backoff_time} seconds"
)
return response["completion"]
except RuntimeError:
# TODO: I don't actually know what error Anthropic raises when it times out
# So err update this error when we find out.
import traceback
traceback.print_exc()
time.sleep(backoff_time)
backoff_time *= 1.5
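A short usage sketch of the retry helper above (prompt and argument values illustrative; assumes `ANTHROPIC_API_KEY` is set in the environment):
```python
import anthropic

# Illustrative: exercises anthropic_completion() with the same defaults the
# AnthropicLM wrapper below uses (claude-2.0, temperature 0). The helper
# backs off and retries when the API rate-limits the request.
client = anthropic.Anthropic()  # key read from ANTHROPIC_API_KEY
text = anthropic_completion(
    client=client,
    model="claude-2.0",
    prompt="What is the capital of France?",
    max_tokens_to_sample=64,
    temperature=0.0,
    stop=[],
)
print(text)
```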
@register_model("anthropic")
class AnthropicLM(LM):
REQ_CHUNK_SIZE = 20
def __init__(self, model):
"""
REQ_CHUNK_SIZE = 20 # TODO: not used
def __init__(
self,
batch_size: int = 1,
model: str = "claude-2.0",
max_tokens_to_sample: int = 256,
temperature: float = 0, # defaults to 1
**kwargs, # top_p, top_k, etc.
):
"""Anthropic API wrapper.
:param model: str
Anthropic model e.g. claude-instant-v1
Anthropic model e.g. 'claude-instant-v1', 'claude-2'
:param max_tokens_to_sample: int
Maximum number of tokens to sample from the model
:param temperature: float
Sampling temperature
:param kwargs: Any
Additional model_args to pass to the API client
"""
super().__init__()
import anthropic
self.model = model
self.client = anthropic.Client(os.environ["ANTHROPIC_API_KEY"])
# defaults to os.environ.get("ANTHROPIC_API_KEY")
self.client = anthropic.Anthropic()
self.temperature = temperature
self.max_tokens_to_sample = max_tokens_to_sample
self.tokenizer = self.client.get_tokenizer()
self.kwargs = kwargs
@property
def eot_token_id(self):
# Not sure but anthropic.AI_PROMPT -> [203, 203, 50803, 30]
raise NotImplementedError("No idea about anthropic tokenization.")
@property
......@@ -63,23 +87,23 @@ class AnthropicLM(LM):
@property
def max_gen_toks(self):
return 256
return self.max_tokens_to_sample
@property
def batch_size(self):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
raise NotImplementedError("No support for logits.")
@property
def device(self):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
raise NotImplementedError("No support for logits.")
def tok_encode(self, string: str):
raise NotImplementedError("No idea about anthropic tokenization.")
def tok_encode(self, string: str) -> List[int]:
return self.tokenizer.encode(string).ids
def tok_decode(self, tokens):
raise NotImplementedError("No idea about anthropic tokenization.")
def tok_decode(self, tokens: List[int]) -> str:
return self.tokenizer.decode(tokens)
def _loglikelihood_tokens(self, requests, disable_tqdm=False):
raise NotImplementedError("No support for logits.")
......@@ -92,20 +116,31 @@ class AnthropicLM(LM):
res = []
for request in tqdm(requests):
inp = request[0]
request_args = request[1]
until = request_args["until"]
response = anthropic_completion(
client=self.client,
model=self.model,
prompt=inp,
max_tokens_to_sample=self.max_gen_toks,
temperature=0.0, # TODO: implement non-greedy sampling for Anthropic
stop=until,
)
res.append(response)
self.cache_hook.add_partial("greedy_until", request, response)
try:
inp = request[0]
request_args = request[1]
# generation_kwargs
until = request_args.get("until")
max_gen_toks = request_args.get("max_gen_toks", self.max_length)
temperature = request_args.get("temperature", self.temperature)
response = anthropic_completion(
client=self.client,
model=self.model,
prompt=inp,
max_tokens_to_sample=max_gen_toks,
temperature=temperature, # TODO: implement non-greedy sampling for Anthropic
stop=until,
**self.kwargs,
)
res.append(response)
self.cache_hook.add_partial("greedy_until", request, response)
except anthropic.APIConnectionError as e:
eval_logger.critical(f"Server unreachable: {e.__cause__}")
break
except anthropic.APIStatusError as e:
eval_logger.critical(f"API error {e.status_code}: {e.message}")
break
return res
......@@ -116,3 +151,9 @@ class AnthropicLM(LM):
def _model_generate(self, context, max_length, eos_token_id):
# Isn't used because we override greedy_until
raise NotImplementedError()
def loglikelihood(self, requests):
raise NotImplementedError("No support for logits.")
def loglikelihood_rolling(self, requests):
raise NotImplementedError("No support for logits.")
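Construction of the refactored wrapper is driven entirely by keyword arguments; anything not consumed by `__init__` is forwarded to `completions.create` via `**self.kwargs`. An illustrative instantiation (parameter values invented):
```python
# Illustrative: model/temperature/max_tokens_to_sample map to the __init__
# parameters above; top_p is an example of an extra kwarg passed through to
# the Anthropic API on every request.
lm = AnthropicLM(
    model="claude-2.0",
    max_tokens_to_sample=256,
    temperature=0.0,
    top_p=0.95,   # forwarded via **kwargs
)
```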
......@@ -740,7 +740,7 @@ class HFLM(LM):
else:
max_gen_toks = self.max_gen_toks
# first stop sequence is used to halt generation upon encountering
(primary_until) = until[0]
primary_until = [until[0]]
# set the max length in tokens of inputs ("context_enc")
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
......
# WebQuestions
### Paper
Title: `Semantic Parsing on Freebase from Question-Answer Pairs`
Abstract: `https://cs.stanford.edu/~pliang/papers/freebase-emnlp2013.pdf`
WebQuestions is a benchmark for question answering. The dataset consists of 6,642
question/answer pairs. The questions are supposed to be answerable by Freebase, a
large knowledge graph. The questions are mostly centered around a single named entity.
The questions are popular ones asked on the web (at least in 2013).
Homepage: `https://worksheets.codalab.org/worksheets/0xba659fe363cb46e7a505c5b6a774dc8a`
### Citation
```
@inproceedings{berant-etal-2013-semantic,
title = "Semantic Parsing on {F}reebase from Question-Answer Pairs",
author = "Berant, Jonathan and
Chou, Andrew and
Frostig, Roy and
Liang, Percy",
booktitle = "Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing",
month = oct,
year = "2013",
address = "Seattle, Washington, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/D13-1160",
pages = "1533--1544",
}
```
### Subtasks
Tasks defined in this folder, and their names:
* `webqs`: `Questions with multiple accepted answers.`
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
from typing import Dict, List
def doc_to_choice(doc: Dict) -> List[str]:
"""Return all of the accepted answers as choices."""
return _remove_prefixes(doc["answers"])
def doc_to_target(doc: Dict) -> List[int]:
"""Return list of indices of accepted answers (all of them)."""
remaining = _remove_prefixes(doc["answers"])
return list(range(len(remaining)))
def _remove_prefixes(aliases):
"""
Remove any alias that has a strict prefix elsewhere in the list.
This is an optimization. We can do this because if the prefix is acceptable by isgreedy,
we can stop looking.
"""
aliases.sort()
ret = [aliases[0]]
for alias in aliases[1:]:
if not alias.startswith(ret[-1]):
ret.append(alias)
return ret
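A worked illustration of these helpers on a hypothetical WebQuestions record (field values invented; `answers` and `question` are the dataset columns used above and in the YAML config below):
```python
doc = {"question": "what is the capital of france?", "answers": ["Paris"]}
doc_to_choice(doc)   # -> ['Paris']
doc_to_target(doc)   # -> [0]   (every remaining answer counts as a correct target)

# Prefix pruning: after sorting, any alias that merely extends an earlier
# alias is dropped, since the shorter prefix already matches greedily.
# Note that sort() mutates the input list in place.
_remove_prefixes(["New York City", "NYC", "New York"])
# -> ['NYC', 'New York']   ("New York City" starts with "New York" and is pruned)
```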
group:
- freebase
- question_answer
task: webqs
dataset_path: web_questions
dataset_name: null
output_type: multiple_choice
training_split: train
validation_split: null
test_split: test
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: question
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
# XWinograd
### Paper
Title: `It's All in the Heads: Using Attention Heads as a Baseline for Cross-Lingual Transfer in Commonsense Reasoning`
Abstract: `https://arxiv.org/abs/2106.12066`
Multilingual winograd schema challenge that includes English, French, Japanese, Portuguese, Russian and Chinese. Winograd schema challenges come from the XWinograd dataset introduced in Tikhonov et al. As it only contains 16 Chinese schemas, we add 488 Chinese schemas from clue/cluewsc2020.
Homepage: `https://huggingface.co/datasets/Muennighoff/xwinograd`
### Citation
```
@misc{muennighoff2022crosslingual,
title={Crosslingual Generalization through Multitask Finetuning},
author={Niklas Muennighoff and Thomas Wang and Lintang Sutawika and Adam Roberts and Stella Biderman and Teven Le Scao and M Saiful Bari and Sheng Shen and Zheng-Xin Yong and Hailey Schoelkopf and Xiangru Tang and Dragomir Radev and Alham Fikri Aji and Khalid Almubarak and Samuel Albanie and Zaid Alyafeai and Albert Webson and Edward Raff and Colin Raffel},
year={2022},
eprint={2211.01786},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{tikhonov2021heads,
title={It's All in the Heads: Using Attention Heads as a Baseline for Cross-Lingual Transfer in Commonsense Reasoning},
author={Alexey Tikhonov and Max Ryabinin},
year={2021},
eprint={2106.12066},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
### Subtasks
Tasks defined in this folder, and their names:
* `xwinograd_en`: Winograd schema challenges in English.
* `xwinograd_fr`: Winograd schema challenges in French.
* `xwinograd_jp`: Winograd schema challenges in Japanese.
* `xwinograd_pt`: Winograd schema challenges in Portuguese.
* `xwinograd_ru`: Winograd schema challenges in Russian.
* `xwinograd_zh`: Winograd schema challenges in Chinese.
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
import argparse
from typing import Dict, List
import yaml
# Different languages that are part of xwinograd.
# These correspond to dataset names (Subsets) on HuggingFace.
# A yaml file is generated by this script for each language.
LANGUAGES = ["en", "fr", "jp", "pt", "ru", "zh"]
def doc_to_text(doc: Dict) -> int:
"""
Return index of the correct choice.
Note: We are using the "multiple input" mode of the multiple-choice
output-type, which means we use different contexts with the same target
for the different choices, rather than the same context and different targets.
"""
answer_to_num = {"1": 0, "2": 1}
return answer_to_num[doc["answer"]]
def doc_to_target(doc: Dict) -> str:
"""
Return the target completion.
Note that this does not depend on the correct choice as we are using
"multiple input" mode.
"""
idx = doc["sentence"].index("_") + 1
return doc["sentence"][idx:].strip()
def doc_to_choice(doc: Dict) -> List[str]:
"""Return the choices that will be used as contexts in "multiple input" mode."""
idx = doc["sentence"].index("_")
options = [doc["option1"], doc["option2"]]
return [doc["sentence"][:idx] + opt for opt in options]
def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
"""
Generate a yaml file for each language.
:param output_dir: The directory to output the files to.
:param overwrite: Whether to overwrite files if they already exist.
"""
err = []
for lang in LANGUAGES:
file_name = f"xwinograd_{lang}.yaml"
try:
with open(f"{output_dir}/{file_name}", "w" if overwrite else "x") as f:
f.write("# Generated by utils.py\n")
yaml.dump(
{
"include": "xwinograd_common_yaml",
"dataset_name": lang,
"task": f"xwinograd_{lang}",
},
f,
)
except FileExistsError:
err.append(file_name)
if len(err) > 0:
raise FileExistsError(
"Files were not created because they already exist (use --overwrite flag):"
f" {', '.join(err)}"
)
def main() -> None:
"""Parse CLI args and generate language-specific yaml files."""
parser = argparse.ArgumentParser()
parser.add_argument(
"--overwrite",
default=False,
action="store_true",
help="Overwrite files if they already exist",
)
parser.add_argument(
"--output-dir", default=".", help="Directory to write yaml files to"
)
args = parser.parse_args()
gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite)
if __name__ == "__main__":
main()
# This file will be included in the generated language-specific task configs.
# It doesn't have a yaml file extension as it is not meant to be imported directly
# by the harness.
group:
- winograd
- commonsense
- multilingual
dataset_path: Muennighoff/xwinograd
dataset_name: null # Overridden by language-specific config.
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
# Generated by utils.py
dataset_name: en
include: xwinograd_common_yaml
task: xwinograd_en
# Generated by utils.py
dataset_name: fr
include: xwinograd_common_yaml
task: xwinograd_fr
# Generated by utils.py
dataset_name: jp
include: xwinograd_common_yaml
task: xwinograd_jp
# Generated by utils.py
dataset_name: pt
include: xwinograd_common_yaml
task: xwinograd_pt
# Generated by utils.py
dataset_name: ru
include: xwinograd_common_yaml
task: xwinograd_ru
# Generated by utils.py
dataset_name: zh
include: xwinograd_common_yaml
task: xwinograd_zh
......@@ -265,10 +265,20 @@ def make_table(result_dict):
md_writer = MarkdownTableWriter()
latex_writer = LatexTableWriter()
md_writer.headers = ["Task", "Version", "Filter", "Metric", "Value", "", "Stderr"]
md_writer.headers = [
"Task",
"Version",
"Fewshot",
"Filter",
"Metric",
"Value",
"",
"Stderr",
]
latex_writer.headers = [
"Task",
"Version",
"Fewshot",
"Filter",
"Metric",
"Value",
......@@ -280,6 +290,7 @@ def make_table(result_dict):
for k, dic in result_dict["results"].items():
version = result_dict["versions"][k]
n = str(result_dict["configs"][k]["num_fewshot"])
for (mf), v in dic.items():
m, _, f = mf.partition(",")
if m.endswith("_stderr"):
......@@ -287,10 +298,11 @@ def make_table(result_dict):
if m + "_stderr" + "," + f in dic:
se = dic[m + "_stderr" + "," + f]
values.append([k, version, f, m, "%.4f" % v, "±", "%.4f" % se])
values.append([k, version, n, f, m, "%.4f" % v, "±", "%.4f" % se])
else:
values.append([k, version, f, m, "%.4f" % v, "", ""])
values.append([k, version, n, f, m, "%.4f" % v, "", ""])
k = ""
n = ""
version = ""
md_writer.value_matrix = values
latex_writer.value_matrix = values
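With the extra header and the `n` values appended above, each results row now carries the fewshot count between Version and Filter. An illustrative markdown rendering (values invented):
```
| Task  | Version | Fewshot | Filter | Metric      | Value  |    | Stderr |
|-------|---------|---------|--------|-------------|--------|----|--------|
| webqs | 1.0     | 0       | none   | exact_match | 0.1234 | ±  | 0.0021 |
```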
......