Merge branch 'main' into add-chat-templating

2b40017b · haileyschoelkopf · bbcdffb8 · ff739414
Commit 2b40017b authored Jan 15, 2024 by haileyschoelkopf
20 changed files
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -61,7 +61,7 @@ jobs:
 #                pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
 #        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    - name: Test with pytest
-      run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/tests_master --ignore=tests/extra
+      run: python -m pytest --showlocals -s -vv -n=auto
    - name: Archive artifacts
      uses: actions/upload-artifact@v3
      with:

--- a/lm_eval/__main__.py
+++ b/lm_eval/__main__.py
@@ -271,7 +271,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
                        default=_handle_non_serializable,
                        ensure_ascii=False,
                    )
-                    filename.open("w").write(samples_dumped)
+                    filename.write_text(samples_dumped, encoding="utf-8")

        print(
            f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "

--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -1131,27 +1131,36 @@ class ConfigurableTask(Task):
                        # sometimes, a multiple_target dataset has exceptions where one doc has only one string answer
                        # print(gold)
                        gold = [gold]
-                    for gold_option in gold:
-                        try:
-                            result_score = self._metric_fn_list[metric](
-                                references=[gold_option],
-                                predictions=[result],
-                                **self._metric_fn_kwargs[metric],
-                            )
-                        except (
-                            TypeError
-                        ):  # TODO: this is hacky and I don't want to do it
-                            result_score = self._metric_fn_list[metric](
-                                [gold_option, result]
-                            )
-                        if isinstance(result_score, dict):
-                            # TODO: this handles the case where HF evaluate returns a dict.
-                            result_score = result_score[metric]
-                        scores.append(result_score)
-                    if any(scores):
-                        result_score = 1.0
+                    if metric == "exact_match":
+                        result = [result for _ in range(len(gold))]
+                        scores = self._metric_fn_list[metric](
+                            references=gold,
+                            predictions=result,
+                            **self._metric_fn_kwargs[metric],
+                        )[metric]
+                        result_score = 1.0 if scores > 0.0 else 0.0
                    else:
-                        result_score = 0.0
+                        for gold_option in gold:
+                            try:
+                                result_score = self._metric_fn_list[metric](
+                                    references=[gold_option],
+                                    predictions=[result],
+                                    **self._metric_fn_kwargs[metric],
+                                )
+                            except (
+                                TypeError
+                            ):  # TODO: this is hacky and I don't want to do it
+                                result_score = self._metric_fn_list[metric](
+                                    [gold_option, result]
+                                )
+                            if isinstance(result_score, dict):
+                                # TODO: this handles the case where HF evaluate returns a dict.
+                                result_score = result_score[metric]
+                            scores.append(result_score)
+                        if any(scores):
+                            result_score = 1.0
+                        else:
+                            result_score = 0.0
                else:
                    try:
                        result_score = self._metric_fn_list[metric](

--- a/lm_eval/models/huggingface.py
+++ b/lm_eval/models/huggingface.py
@@ -749,7 +749,7 @@ class HFLM(LM):
            generation_kwargs["do_sample"] = False
        # build stopping criteria
        stopping_criteria = stop_sequences_criteria(
-            self.tokenizer, stop, 1, context.shape[0]
+            self.tokenizer, stop, context.shape[1], context.shape[0]
        )
        return self.model.generate(
            input_ids=context,

--- a/lm_eval/tasks/babi/babi.yaml
+++ b/lm_eval/tasks/babi/babi.yaml
@@ -17,4 +17,4 @@ metric_list:
    aggregation: mean
    higher_is_better: true
 metadata:
-  version: 0.0
+  version: 1.0
--- a/lm_eval/tasks/bbh/cot_fewshot/_cot_fewshot_template_yaml
+++ b/lm_eval/tasks/bbh/cot_fewshot/_cot_fewshot_template_yaml
@@ -27,4 +27,4 @@ filter_list:
      - function: "take_first"
 num_fewshot: 0
 metadata:
-  version: 1.0
+  version: 2.0
--- a/lm_eval/tasks/bbh/cot_zeroshot/_cot_zeroshot_template_yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/_cot_zeroshot_template_yaml
@@ -24,4 +24,4 @@ filter_list:
      - function: "take_first"
 num_fewshot: 0
 metadata:
-  version: 0
+  version: 1.0
--- a/lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml
+++ b/lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml
@@ -18,4 +18,4 @@ generation_kwargs:
  temperature: 0.0
 num_fewshot: 0
 metadata:
-  version: 0
+  version: 1.0
--- a/lm_eval/tasks/bbh/zeroshot/_zeroshot_template_yaml
+++ b/lm_eval/tasks/bbh/zeroshot/_zeroshot_template_yaml
@@ -18,4 +18,4 @@ generation_kwargs:
  temperature: 0.0
 num_fewshot: 0
 metadata:
-  version: 0
+  version: 1.0
--- a/lm_eval/tasks/belebele/_default_template_yaml
+++ b/lm_eval/tasks/belebele/_default_template_yaml
 group: belebele
 dataset_path: facebook/belebele
-test_split: test
-fewshot_split: test
 fewshot_config:
  sampler: first_n
 output_type: multiple_choice

--- a/lm_eval/tasks/belebele/_generate_configs.py
+++ b/lm_eval/tasks/belebele/_generate_configs.py
@@ -8,7 +8,7 @@ import requests

 from tqdm import tqdm

-from lm_eval.logger import eval_logger
+from lm_eval.utils import logging

 API_URL = "https://datasets-server.huggingface.co/splits?dataset=facebook/belebele"

@@ -39,8 +39,8 @@ if __name__ == "__main__":
    def query():
        response = requests.get(API_URL)
        return response.json()["splits"]
-
-    languages = [split["config"] for split in query()]
+    print(query())
+    languages = [split["split"] for split in query()]

    for lang in tqdm(languages):
        yaml_dict = {
@@ -48,11 +48,12 @@ if __name__ == "__main__":
            "task": f"belebele_{args.task_prefix}_{lang}"
            if args.task_prefix != ""
            else f"belebele_{lang}",
-            "dataset_name": lang,
+            "test_split": lang,
+            "fewshot_split":lang,
        }

        file_save_path = args.save_prefix_path + f"_{lang}.yaml"
-        eval_logger.info(f"Saving yaml for subset {lang} to {file_save_path}")
+        logging.info(f"Saving yaml for subset {lang} to {file_save_path}")
        with open(file_save_path, "w") as yaml_file:
            yaml.dump(
                yaml_dict,

--- a/lm_eval/tasks/belebele/belebele_acm_Arab.yaml
+++ b/lm_eval/tasks/belebele/belebele_acm_Arab.yaml
-"dataset_name": "acm_Arab"
+"fewshot_split": "acm_Arab"
 "include": "_default_template_yaml"
 "task": "belebele_acm_Arab"
+"test_split": "acm_Arab"
--- a/lm_eval/tasks/belebele/belebele_afr_Latn.yaml
+++ b/lm_eval/tasks/belebele/belebele_afr_Latn.yaml
-"dataset_name": "afr_Latn"
+"fewshot_split": "afr_Latn"
 "include": "_default_template_yaml"
 "task": "belebele_afr_Latn"
+"test_split": "afr_Latn"
--- a/lm_eval/tasks/belebele/belebele_als_Latn.yaml
+++ b/lm_eval/tasks/belebele/belebele_als_Latn.yaml
-"dataset_name": "als_Latn"
+"fewshot_split": "als_Latn"
 "include": "_default_template_yaml"
 "task": "belebele_als_Latn"
+"test_split": "als_Latn"
--- a/lm_eval/tasks/belebele/belebele_amh_Ethi.yaml
+++ b/lm_eval/tasks/belebele/belebele_amh_Ethi.yaml
-"dataset_name": "amh_Ethi"
+"fewshot_split": "amh_Ethi"
 "include": "_default_template_yaml"
 "task": "belebele_amh_Ethi"
+"test_split": "amh_Ethi"
--- a/lm_eval/tasks/belebele/belebele_apc_Arab.yaml
+++ b/lm_eval/tasks/belebele/belebele_apc_Arab.yaml
-"dataset_name": "apc_Arab"
+"fewshot_split": "apc_Arab"
 "include": "_default_template_yaml"
 "task": "belebele_apc_Arab"
+"test_split": "apc_Arab"
--- a/lm_eval/tasks/belebele/belebele_arb_Arab.yaml
+++ b/lm_eval/tasks/belebele/belebele_arb_Arab.yaml
-"dataset_name": "arb_Arab"
+"fewshot_split": "arb_Arab"
 "include": "_default_template_yaml"
 "task": "belebele_arb_Arab"
+"test_split": "arb_Arab"
--- a/lm_eval/tasks/belebele/belebele_arb_Latn.yaml
+++ b/lm_eval/tasks/belebele/belebele_arb_Latn.yaml
-"dataset_name": "arb_Latn"
+"fewshot_split": "arb_Latn"
 "include": "_default_template_yaml"
 "task": "belebele_arb_Latn"
+"test_split": "arb_Latn"
--- a/lm_eval/tasks/belebele/belebele_ars_Arab.yaml
+++ b/lm_eval/tasks/belebele/belebele_ars_Arab.yaml
-"dataset_name": "ars_Arab"
+"fewshot_split": "ars_Arab"
 "include": "_default_template_yaml"
 "task": "belebele_ars_Arab"
+"test_split": "ars_Arab"
--- a/lm_eval/tasks/belebele/belebele_ary_Arab.yaml
+++ b/lm_eval/tasks/belebele/belebele_ary_Arab.yaml
-"dataset_name": "ary_Arab"
+"fewshot_split": "ary_Arab"
 "include": "_default_template_yaml"
 "task": "belebele_ary_Arab"
+"test_split": "ary_Arab"