Commit 50f4428b authored by lintangsutawika

merge update

parents 020abc86 b8d1cef9
......@@ -4,6 +4,7 @@ Welcome to the docs for the LM Evaluation Harness!
## Table of Contents
* To learn about the public interface of the library, as well as how to evaluate via the command line or from within an external codebase, see the [Interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/user_guide.md)
* To learn how to add support for a new library, API, or model type, along with a quick explainer on the different ways an LM can be evaluated, see the [Model Guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/model_guide.md).
* For a crash course on adding new tasks to the library, see our [New Task Guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md).
* To learn more about pushing the limits of task configuration that the Eval Harness supports, see the [Advanced Task Guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/advanced_task_guide.md).
......
# User Guide
This document details the interface exposed by `lm-eval` and provides details on what flags are available to users.
## Command-line Interface
Most users run the library by cloning it from GitHub and running the `main.py` script.
Equivalently, the library can be run via the `lm-eval` entrypoint at the command line.
This mode supports a number of command-line arguments, the details of which can also be seen by running with `-h` or `--help` (an example invocation combining several of these flags is shown after the list below):
* `--model` : Selects which model type or provider is evaluated. Must be a string corresponding to the name of the model type/provider being used. See [the main README](https://github.com/EleutherAI/lm-evaluation-harness/tree/big-refactor#commercial-apis) for a full list of enabled model names and supported libraries or APIs.
* `--model_args` : Controls parameters passed to the model constructor. Accepts a string containing comma-separated keyword arguments to the model class of the format `"arg1=val1,arg2=val2,..."`, for example `--model_args pretrained=EleutherAI/pythia-160m,dtype=float32`. For a full list of supported keyword arguments, see the initialization of the corresponding `lm_eval.api.model.LM` subclass, e.g. [`HFLM`](https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/models/huggingface.py#L66)
* `--tasks` : Determines which tasks or task groups are evaluated. Accepts a comma-separated list of task names or task group names, all of which must be valid tasks/groups.
* `--num_fewshot` : Sets the number of few-shot examples to place in context. Must be an integer.
* `--batch_size` : Sets the batch size used for evaluation. Can be a positive integer or `"auto"` to automatically select the largest batch size that will fit in memory, speeding up evaluation. One can pass `--batch_size auto:N` to re-select the maximum batch size `N` times during evaluation. This can help accelerate evaluation further, since `lm-eval` sorts documents in descending order of context length.
* `--max_batch_size` : Sets the maximum batch size to try to fit in memory, if `--batch_size auto` is passed.
* `--device` : Sets which device to place the model onto. Must be a string, for example `"cuda"`, `"cuda:0"`, `"cpu"`, or `"mps"`. Defaults to `"cuda"`, and can be ignored when running multi-GPU or using a non-local model type.
* `--output_path` : A string of the form `dir/file.jsonl` or `dir/`. Provides a path where high-level results will be saved, either into the file named or into the directory named. If `--log_samples` is passed as well, then per-document outputs and metrics will be saved into the directory as well.
* `--log_samples` : If this flag is passed, then the model's outputs, and the text fed into the model, will be saved at per-document granularity. Must be used with `--output_path`.
* `--limit` : Accepts an integer, or a float between 0.0 and 1.0. If passed, limits evaluation to the first X documents per task (if an integer) or to the first X% of documents per task (if a float). Useful for debugging, especially on costly API models.
* `--use_cache` : Should be a path where a SQLite database file can be written. Takes a string of the format `/path/to/sqlite_cache_` in order to create a cache database at `/path/to/sqlite_cache_rank{i}.db` for each process (0 through NUM_GPUS). This caches the results of prior runs, so that a given (model, task) pair does not need to be re-run in order to re-score it.
* `--decontamination_ngrams_path` : Deprecated, see [this commit](https://github.com/EleutherAI/lm-evaluation-harness/commit/00209e10f6e27edf5d766145afaf894079b5fe10) or older for a working decontamination-checker tool.
* `--check_integrity` : If this flag is used, the library tests for each task selected are run to confirm task integrity.
* `--write_out` : Used for diagnostic purposes to observe the format of task documents passed to a model. If this flag is used, then prints the prompt and gold target string for the first document of each task.
* `--show_config` : If used, prints the full `lm_eval.api.task.TaskConfig` contents (the non-default settings from the task's YAML file) for each task that was run, at the completion of an evaluation. Useful when modifying a task's configuration YAML locally, in order to record the exact configuration used, for debugging or reproducibility purposes.
* `--include_path` : Accepts a path to a folder. If passed, then all YAML files in that folder containing `lm-eval`-compatible task configurations will be added to the task registry as available tasks. Useful when writing config files for your own tasks in a folder other than `lm_eval/tasks/`.
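For example, a typical invocation combining several of these flags might look like the following (the model, model arguments, and task names here are purely illustrative; substitute any registered model type and valid task names):

```bash
lm-eval \
    --model hf \
    --model_args pretrained=EleutherAI/pythia-160m,dtype=float32 \
    --tasks lambada_openai,hellaswag \
    --num_fewshot 0 \
    --batch_size auto \
    --output_path ./results/ \
    --log_samples
```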
## External Library Usage
We also support using the library's external API from within model training loops or other scripts.
`lm_eval` supplies two functions for external import and use: `lm_eval.evaluate()` and `lm_eval.simple_evaluate()`.
`simple_evaluate()` can be used by simply creating an `lm_eval.api.model.LM` subclass that implements the methods described in the [Model Guide](https://github.com/EleutherAI/lm-evaluation-harness/tree/big-refactor/docs/model_guide.md), and wrapping your custom model in that class as follows:
```python
import lm_eval
...
# create your model (could be running finetuning with some custom modeling code)
my_model = initialize_my_model()
...

# instantiate an LM subclass that takes your initialized model and can run
# `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, and `Your_LM.greedy_until()`
lm_obj = Your_LM(model=my_model, batch_size=16)
results = lm_eval.simple_evaluate( # call simple_evaluate
model=lm_obj,
tasks=["taskname1", "taskname2"],
num_fewshot=0,
...
)
```
See https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/evaluator.py#L35 for a full description of all arguments available. All keyword arguments to `simple_evaluate()` share the same role as the command-line flags described previously.
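For reference, below is a minimal sketch of what such a custom `LM` subclass might look like. This is only an illustrative skeleton, not a working implementation: the method bodies are placeholders, and the exact signatures and return formats expected for each method are described in the [Model Guide](https://github.com/EleutherAI/lm-evaluation-harness/tree/big-refactor/docs/model_guide.md).

```python
from lm_eval.api.model import LM


class Your_LM(LM):
    """Sketch of a wrapper exposing a custom model through the `LM` interface."""

    def __init__(self, model, batch_size=16):
        super().__init__()
        self.model = model
        self.batch_size = batch_size

    def loglikelihood(self, requests):
        # for each request, compute the log-probability of the target continuation
        # given its context (and whether greedy decoding would produce it)
        ...

    def loglikelihood_rolling(self, requests):
        # for each request, compute the log-probability of the full input string
        ...

    def greedy_until(self, requests):
        # for each request, generate a continuation until the given stop criteria
        ...
```

An instance of this class can then be passed as `model=lm_obj` to `simple_evaluate()`, as in the snippet above.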
Additionally, the `evaluate()` function offers the core evaluation functionality provided by the library, but without some of the special handling, simplification, and abstraction that `simple_evaluate()` provides.
See https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/evaluator.py#L173 for more details.
As a brief example usage of `evaluate()`:
```python
import lm_eval
# suppose you've defined a custom lm_eval.api.Task subclass in your own external codebase
from my_tasks import MyTask1
...

# create your model (could be running finetuning with some custom modeling code)
my_model = initialize_my_model()
...

# instantiate an LM subclass that takes your initialized model and can run
# `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, and `Your_LM.greedy_until()`
lm_obj = Your_LM(model=my_model, batch_size=16)

results = lm_eval.evaluate(
    lm=lm_obj,
    task_dict={"mytask1": MyTask1()},  # map task names to instantiated Task objects
    ...
)
```
from .evaluator import evaluate, simple_evaluate
......@@ -250,6 +250,11 @@ class Task(abc.ABC):
download_mode=download_mode,
)
@property
def config(self):
"""Returns the TaskConfig associated with this class."""
return self._config
@abc.abstractmethod
def has_training_docs(self):
"""Whether the task has a training set"""
......@@ -352,7 +357,7 @@ class Task(abc.ABC):
), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
eval_logger.info(
f"Building contexts for task '{self._config.task}' on rank {rank}..."
f"Building contexts for task '{self.config.task}' on rank {rank}..."
)
instances = []
......@@ -362,14 +367,14 @@ class Task(abc.ABC):
# sample fewshot context #TODO: need to offset doc_id by rank now!
fewshot_ctx = self.fewshot_context(
doc,
self._config.num_fewshot,
self.config.num_fewshot,
)
# TODO: we should override self._config.repeats if doing greedy gen so users don't waste time+compute
# TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute
inst = self.construct_requests(
doc=doc,
ctx=fewshot_ctx,
metadata=(self._config["task"], doc_id, self._config.repeats),
metadata=(self.config["task"], doc_id, self.config.repeats),
)
if not isinstance(inst, list):
......@@ -457,9 +462,9 @@ class Task(abc.ABC):
if num_fewshot == 0:
# always prepend the (possibly empty) task description
labeled_examples = self._config.description
labeled_examples = self.config.description
else:
labeled_examples = self._config.description + self.sampler.get_context(
labeled_examples = self.config.description + self.sampler.get_context(
doc, num_fewshot
)
......@@ -469,7 +474,7 @@ class Task(abc.ABC):
elif type(example) == list:
return [labeled_examples + ex for ex in example]
elif type(example) == int:
if self._config.doc_to_choice is not None:
if self.config.doc_to_choice is not None:
choices = self.doc_to_choice(doc)
return labeled_examples + choices[example]
else:
......@@ -491,7 +496,7 @@ class Task(abc.ABC):
"""
# TODO: this should only return the overrides applied to a non-YAML task's configuration.
# (num_fewshot)
return self._config.to_dict()
return self.config.to_dict()
class ConfigurableTask(Task):
......@@ -506,35 +511,35 @@ class ConfigurableTask(Task):
self._config = self.CONFIG
# Use new configurations if there was no preconfiguration
if self._config is None:
if self.config is None:
self._config = TaskConfig(**config)
# Overwrite configs
else:
if config is not None:
self._config.__dict__.update(config)
if self._config is None:
if self.config is None:
raise ValueError(
"Must pass a config to ConfigurableTask, either in cls.CONFIG or `config` kwarg"
)
if self._config.output_type is not None:
assert self._config.output_type in ALL_OUTPUT_TYPES
self.OUTPUT_TYPE = self._config.output_type
if self.config.output_type is not None:
assert self.config.output_type in ALL_OUTPUT_TYPES
self.OUTPUT_TYPE = self.config.output_type
if self._config.dataset_path is not None:
self.DATASET_PATH = self._config.dataset_path
if self.config.dataset_path is not None:
self.DATASET_PATH = self.config.dataset_path
if self._config.dataset_name is not None:
self.DATASET_NAME = self._config.dataset_name
if self.config.dataset_name is not None:
self.DATASET_NAME = self.config.dataset_name
self._metric_fn_list = {}
self._metric_fn_kwargs = {}
self._aggregation_list = {}
self._higher_is_better = {}
_metric_list = DEFAULT_METRIC_REGISTRY[self._config.output_type]
if self._config.metric_list is None:
_metric_list = DEFAULT_METRIC_REGISTRY[self.config.output_type]
if self.config.metric_list is None:
# TODO: handle this in TaskConfig.__post_init__ ?
for metric_name in _metric_list:
self._metric_fn_list[metric_name] = get_metric(metric_name)
......@@ -543,7 +548,7 @@ class ConfigurableTask(Task):
)
self._higher_is_better[metric_name] = is_higher_better(metric_name)
else:
for metric_config in self._config.metric_list:
for metric_config in self.config.metric_list:
assert "metric" in metric_config
metric_name = metric_config["metric"]
kwargs = {
......@@ -552,7 +557,7 @@ class ConfigurableTask(Task):
if key not in ["metric", "aggregation", "higher_is_better"]
}
if self._config.process_results is not None:
if self.config.process_results is not None:
self._metric_fn_list[metric_name] = None
self._metric_fn_kwargs[metric_name] = {}
elif callable(metric_name):
......@@ -594,13 +599,13 @@ class ConfigurableTask(Task):
)
self._higher_is_better[metric_name] = is_higher_better(metric_name)
self.download(self._config.dataset_kwargs)
self.download(self.config.dataset_kwargs)
self._training_docs = None
self._fewshot_docs = None
if self._config.filter_list is not None:
if self.config.filter_list is not None:
self._filters = []
for filter_config in self._config.filter_list:
for filter_config in self.config.filter_list:
for filter_pipeline in filter_config:
filter_name = filter_config["name"]
filter_functions = filter_config["filter"]
......@@ -615,10 +620,10 @@ class ConfigurableTask(Task):
else:
self._filters = [build_filter_ensemble("none", [["take_first", None]])]
if self._config.use_prompt is not None:
eval_logger.info(f"loading prompt {self._config.use_prompt}")
if self.config.use_prompt is not None:
eval_logger.info(f"loading prompt {self.config.use_prompt}")
self.prompt = get_prompt(
self._config.use_prompt, self.DATASET_PATH, self.DATASET_NAME
self.config.use_prompt, self.DATASET_PATH, self.DATASET_NAME
)
else:
self.prompt = None
......@@ -645,7 +650,7 @@ class ConfigurableTask(Task):
test_text = self.doc_to_text(test_doc)
test_target = self.doc_to_target(test_doc)
if self._config.doc_to_choice is not None:
if self.config.doc_to_choice is not None:
test_choice = self.doc_to_choice(test_doc)
if type(test_choice) is not list:
eval_logger.error("doc_to_choice must return list")
......@@ -673,7 +678,7 @@ class ConfigurableTask(Task):
for choice in check_choices:
choice_has_whitespace = True if " " in choice else False
delimiter_has_whitespace = (
True if " " in self._config.target_delimiter else False
True if " " in self.config.target_delimiter else False
)
if delimiter_has_whitespace and choice_has_whitespace:
......@@ -693,52 +698,52 @@ class ConfigurableTask(Task):
)
def has_training_docs(self) -> bool:
if self._config.training_split is not None:
if self.config.training_split is not None:
return True
else:
return False
def has_validation_docs(self) -> bool:
if self._config.validation_split is not None:
if self.config.validation_split is not None:
return True
else:
return False
def has_test_docs(self) -> bool:
if self._config.test_split is not None:
if self.config.test_split is not None:
return True
else:
return False
def training_docs(self) -> datasets.Dataset:
if self.has_training_docs():
if self._config.process_docs is not None:
return self._config.process_docs(
self.dataset[self._config.training_split]
if self.config.process_docs is not None:
return self.config.process_docs(
self.dataset[self.config.training_split]
)
return self.dataset[self._config.training_split]
return self.dataset[self.config.training_split]
def validation_docs(self) -> datasets.Dataset:
if self.has_validation_docs():
if self._config.process_docs is not None:
return self._config.process_docs(
self.dataset[self._config.validation_split]
if self.config.process_docs is not None:
return self.config.process_docs(
self.dataset[self.config.validation_split]
)
return self.dataset[self._config.validation_split]
return self.dataset[self.config.validation_split]
def test_docs(self) -> datasets.Dataset:
if self.has_test_docs():
if self._config.process_docs is not None:
return self._config.process_docs(self.dataset[self._config.test_split])
return self.dataset[self._config.test_split]
if self.config.process_docs is not None:
return self.config.process_docs(self.dataset[self.config.test_split])
return self.dataset[self.config.test_split]
def fewshot_docs(self):
if self._config.fewshot_split is not None:
return self.dataset[self._config.fewshot_split]
if self.config.fewshot_split is not None:
return self.dataset[self.config.fewshot_split]
else:
if self._config.num_fewshot > 0:
if self.config.num_fewshot > 0:
eval_logger.warning(
f"Task '{self._config.task}': "
f"Task '{self.config.task}': "
"num_fewshot > 0 but fewshot_split is None. "
"using preconfigured rule."
)
......@@ -754,15 +759,15 @@ class ConfigurableTask(Task):
return self._instances
def should_decontaminate(self):
return self._config.should_decontaminate
return self.config.should_decontaminate
def doc_to_decontamination_query(self, doc):
if self._config.should_decontaminate:
if self._config.doc_to_decontamination_query in self.features:
return doc[self._config.doc_to_decontamination_query]
if self.config.should_decontaminate:
if self.config.doc_to_decontamination_query in self.features:
return doc[self.config.doc_to_decontamination_query]
else:
return ast.literal_eval(
utils.apply_template(self._config.doc_to_decontamination_query, doc)
utils.apply_template(self.config.doc_to_decontamination_query, doc)
)
def _process_doc(self, doc):
......@@ -780,13 +785,13 @@ class ConfigurableTask(Task):
if self.prompt is not None:
doc_to_text = self.prompt
else:
doc_to_text = self._config.doc_to_text
doc_to_text = self.config.doc_to_text
if type(doc_to_text) == int:
return doc_to_text
elif type(doc_to_text) == str:
if doc_to_text in self.features:
# if self._config.doc_to_choice is not None:
# if self.config.doc_to_choice is not None:
# return self.doc_to_choice(doc)[doc[doc_to_text]]
# else:
return doc[doc_to_text]
......@@ -805,7 +810,7 @@ class ConfigurableTask(Task):
return applied_prompt[0]
else:
eval_logger.warning("Applied prompt returns empty string")
return self._config.fewshot_delimiter
return self.config.fewshot_delimiter
else:
print(type(doc_to_text))
raise TypeError
......@@ -814,13 +819,13 @@ class ConfigurableTask(Task):
if self.prompt is not None:
doc_to_target = self.prompt
else:
doc_to_target = self._config.doc_to_target
doc_to_target = self.config.doc_to_target
if type(doc_to_target) == int:
return doc_to_target
elif type(doc_to_target) == str:
if doc_to_target in self.features:
# if self._config.doc_to_choice is not None:
# if self.config.doc_to_choice is not None:
# return self.doc_to_choice(doc)[doc[doc_to_target]]
# else:
return doc[doc_to_target]
......@@ -847,17 +852,17 @@ class ConfigurableTask(Task):
return applied_prompt[1]
else:
eval_logger.warning("Applied prompt returns empty string")
return self._config.fewshot_delimiter
return self.config.fewshot_delimiter
else:
raise TypeError
def doc_to_choice(self, doc: Any) -> List[str]:
if self.prompt is not None:
doc_to_choice = self.prompt
elif self._config.doc_to_choice is None:
elif self.config.doc_to_choice is None:
eval_logger.error("doc_to_choice was called but not set in config")
else:
doc_to_choice = self._config.doc_to_choice
doc_to_choice = self.config.doc_to_choice
if type(doc_to_choice) == str:
return ast.literal_eval(utils.apply_template(doc_to_choice, doc))
......@@ -878,8 +883,8 @@ class ConfigurableTask(Task):
# in multiple_choice tasks, this should be castable to an int corresponding to the index
# within the answer choices, while doc_to_target is the string version of {{answer_choices[gold]}}.
if self._config.gold_alias is not None:
doc_to_target = self._config.gold_alias
if self.config.gold_alias is not None:
doc_to_target = self.config.gold_alias
else:
return self.doc_to_target(doc)
......@@ -901,7 +906,7 @@ class ConfigurableTask(Task):
arguments = (self.doc_to_target(doc),)
elif self.OUTPUT_TYPE == "multiple_choice":
choices = self.doc_to_choice(doc)
target_delimiter = self._config.target_delimiter
target_delimiter = self.config.target_delimiter
if self.multiple_input:
# If there are multiple inputs, choices are placed in the ctx
cont = self.doc_to_target(doc)
......@@ -943,15 +948,16 @@ class ConfigurableTask(Task):
return request_list
elif self.OUTPUT_TYPE == "greedy_until":
arguments = (ctx, self._config.generation_kwargs)
arguments = (ctx, self.config.generation_kwargs)
return Instance(
request_type=self.OUTPUT_TYPE, doc=doc, arguments=arguments, idx=0, **kwargs
)
def process_results(self, doc, results):
if callable(self._config.process_results):
return self._config.process_results(doc, results)
if callable(self.config.process_results):
return self.config.process_results(doc, results)
result_dict = {}
use_metric = list(self._metric_fn_list.keys())
......@@ -1056,7 +1062,7 @@ class ConfigurableTask(Task):
elif self.OUTPUT_TYPE == "greedy_until":
gold = self.doc_to_target(doc)
if self._config.doc_to_choice is not None:
if self.config.doc_to_choice is not None:
# If you set doc_to_choice,
# it assumes that doc_to_target returns a number.
choices = self.doc_to_choice(doc)
......
......@@ -184,7 +184,7 @@ def evaluate(
:param lm: obj
Language Model
:param task_dict: dict[str, Task]
Dictionary of tasks. Tasks will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
Dictionary of tasks. Tasks will be taken to have name type(task).config.task .
:param limit: int, optional
Limit the number of examples per task (only use this for testing)
:param bootstrap_iters:
......
# Generated by utils.py
dataset_name: bn
doc_to_target: '{% if answer is not none %}{{answer[20+1]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"প্রশ্ন: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
include: cot_yaml
task: mgsm_bn_direct
# Generated by utils.py
dataset_name: de
doc_to_target: '{% if answer is not none %}{{answer[20+1]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Frage: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
include: cot_yaml
task: mgsm_de_direct
# Generated by utils.py
dataset_name: en
doc_to_target: '{% if answer is not none %}{{answer[20+1]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
include: cot_yaml
task: mgsm_en_direct
# Generated by utils.py
dataset_name: es
doc_to_target: '{% if answer is not none %}{{answer[20+1]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Pregunta: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
include: cot_yaml
task: mgsm_es_direct
# Generated by utils.py
dataset_name: fr
doc_to_target: '{% if answer is not none %}{{answer[20+1]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Question : "+question+"\nStep-by-Step Answer:"}}{% endif %}'
include: cot_yaml
task: mgsm_fr_direct
# Generated by utils.py
dataset_name: ja
doc_to_target: '{% if answer is not none %}{{answer[20+1]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"問題: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
include: cot_yaml
task: mgsm_ja_direct
# Generated by utils.py
dataset_name: ru
doc_to_target: '{% if answer is not none %}{{answer[20+1]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Задача: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
include: cot_yaml
task: mgsm_ru_direct
# Generated by utils.py
dataset_name: sw
doc_to_target: '{% if answer is not none %}{{answer[20+1]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Swali: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
include: cot_yaml
task: mgsm_sw_direct
# Generated by utils.py
dataset_name: te
doc_to_target: '{% if answer is not none %}{{answer[20+1]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"ప్రశ్న: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
include: cot_yaml
task: mgsm_te_direct
# Generated by utils.py
dataset_name: th
doc_to_target: '{% if answer is not none %}{{answer[20+1]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"โจทย์: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
include: cot_yaml
task: mgsm_th_direct
# Generated by utils.py
dataset_name: zh
doc_to_target: '{% if answer is not none %}{{answer[20+1]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"问题: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
include: cot_yaml
task: mgsm_zh_direct
......@@ -4,16 +4,19 @@ import argparse
LANGUAGES = {
"bn": { # Bengali
# "QUESTION": "প্রশ্ন:",
"QUESTION": "\u09aa\u09cd\u09b0\u09b6\u09cd\u09a8:",
# "ANSWER": "ধাপে ধাপে উত্তর:",
"ANSWER": "\u09a7\u09be\u09aa\u09c7 \u09a7\u09be\u09aa\u09c7 \u0989\u09a4\u09cd\u09a4\u09b0:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
},
"de": { # German
"QUESTION": "Frage:",
# "ANSWER": "Schritt-für-Schritt-Antwort:",
"ANSWER": "Schritt-f\u00fcr-Schritt-Antwort:",
"DIRECT": "Antwort:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
"REGEX": "Die Antwort lautet (\\-?[0-9\\.\\,]+)",
},
"en": { # English
"QUESTION": "Question:",
......@@ -24,50 +27,68 @@ LANGUAGES = {
"es": { # Spanish
"QUESTION": "Pregunta:",
"ANSWER": "Respuesta paso a paso:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
"DIRECT": "Respuesta:",
"REGEX": "La respuesta es (\\-?[0-9\\.\\,]+)",
},
"fr": { # French
"QUESTION": "Question :",
# "ANSWER": "Réponse étape par étape :"
"ANSWER": "R\u00e9ponse \u00e9tape par \u00e9tape :",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
# "DIRECT": "Réponse :",
"DIRECT": "R\u00e9ponse :",
# "REGEX": "La réponse est (\\-?[0-9\\.\\,]+)",
"REGEX": "La r\u00e9ponse est (\\-?[0-9\\.\\,]+)",
},
"ru": { # Russian
# "QUESTION": "Задача:",
"QUESTION": "\u0417\u0430\u0434\u0430\u0447\u0430:",
# "ANSWER": "Пошаговоерешение:",
"ANSWER": "\u041f\u043e\u0448\u0430\u0433\u043e\u0432\u043e\u0435\u0440\u0435\u0448\u0435\u043d\u0438\u0435:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
# "REGEX": "Ответ — (\\-?[0-9\\.\\,]+)",
"REGEX": "\u041e\u0442\u0432\u0435\u0442 \u2014 (\\-?[0-9\\.\\,]+)",
},
"sw": { # Swahili
"QUESTION": "Swali:",
"ANSWER": "Jibu la Hatua kwa Hatua:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
"REGEX": "Jibu ni (\\-?[0-9\\.\\,]+)",
},
"te": { # Telugu
# "QUESTION": "ప్రశ్న:",
"QUESTION": "\u0c2a\u0c4d\u0c30\u0c36\u0c4d\u0c28:",
# "ANSWER": "దశలవారీగా సమాధానం:",
"ANSWER": "\u0c26\u0c36\u0c32\u0c35\u0c3e\u0c30\u0c40\u0c17\u0c3e \u0c38\u0c2e\u0c3e\u0c27\u0c3e\u0c28\u0c02:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
# "REGEX": "సమాధానం (\\-?[0-9\\.\\,]+)",
"REGEX": "\u0c38\u0c2e\u0c3e\u0c27\u0c3e\u0c28\u0c02 (\\-?[0-9\\.\\,]+)",
},
"th": { # Thai
# "QUESTION": "โจทย์:",
"QUESTION": "\u0e42\u0e08\u0e17\u0e22\u0e4c:",
# "ANSWER": "คำตอบทีละขั้นตอน:",
"ANSWER": "\u0e04\u0e33\u0e15\u0e2d\u0e1a\u0e17\u0e35\u0e25\u0e30\u0e02\u0e31\u0e49\u0e19\u0e15\u0e2d\u0e19:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
# "REGEX": "คำตอบคือ (\\-?[0-9\\.\\,]+)",
"REGEX": "\u0e04\u0e33\u0e15\u0e2d\u0e1a\u0e04\u0e37\u0e2d (\\-?[0-9\\.\\,]+)",
},
"ja": { # Japanese
# "QUESTION": "問題:",
"QUESTION": "\u554f\u984c:",
# "ANSWER": "ステップごとの答え:",
"ANSWER": "\u30b9\u30c6\u30c3\u30d7\u3054\u3068\u306e\u7b54\u3048:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
# "REGEX": "答えは(\\-?[0-9\\.\\,]+)です。",
"REGEX": "\u7b54\u3048\u306f(\\-?[0-9\\.\\,]+)\u3067\u3059\u3002",
},
"zh": { # Chinese
# "QUESTION": "问题:",
"QUESTION": "\u95ee\u9898:",
# "ANSWER": "逐步解答:",
"ANSWER": "\u9010\u6b65\u89e3\u7b54:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
# "REGEX": "答案是 (\\-?[0-9\\.\\,]+)。",
"REGEX": "\u7b54\u6848\u662f (\\-?[0-9\\.\\,]+)\u3002",
},
}
......@@ -80,15 +101,15 @@ def add_regex_pattern(regex_pattern):
"filter_list": [
{
"name": "get-answer",
},
],
"filter": [
{
"function": "regex",
"regex_pattern": regex_pattern,
},
{
"function": "take_first",
"filter": [
{
"function": "regex",
"regex_pattern": regex_pattern,
},
{
"function": "take_first",
},
],
},
],
}
......@@ -107,6 +128,7 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
QUESTION = LANGUAGES[lang]["QUESTION"]
yaml_template = "cot_yaml"
filter_list = {}
if mode == "direct":
ANSWER = LANGUAGES[lang]["DIRECT"]
REGEX = None
......@@ -116,13 +138,13 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
ANSWER = LANGUAGES[lang]["ANSWER"]
REGEX = LANGUAGES[lang]["REGEX"]
task_name = f"mgsm_{lang}_native-cot"
filter_list = add_regex_pattern(REGEX)
elif mode == "en-cot":
ANSWER = LANGUAGES["en"]["ANSWER"]
REGEX = LANGUAGES["en"]["REGEX"]
task_name = f"mgsm_{lang}_en-cot"
file_name = f"{task_name}.yaml"
filter_list = add_regex_pattern(REGEX)
with open(
f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf8"
......@@ -147,6 +169,7 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
},
f,
allow_unicode=True,
width=float("inf"),
)
except FileExistsError:
err.append(file_name)
......
[build-system]
requires = ["setuptools>=40.8.0", "wheel"]
build-backend = "setuptools.build_meta"