Unverified commit 4824a832, authored by Lintang Sutawika, committed by GitHub

Merge pull request #883 from chrisociepa/transformation-filters

Add transformation filters
parents b8faaa93 06ce7a62
@@ -214,7 +214,7 @@ metric_list:
 ```
 `aggregation` and `higher_is_better` can optionally be left out to default to the manually-set defaults if using a natively supported metric, otherwise it must be defined explicitly (for example, when using a custom metric implemented as a function).
-For a full list of natively supported metrics and aggregation functions see `docs/advanced_task_guide.md`. All metrics supported in [HuggingFace Evaluate](https://github.com/huggingface/evaluate/tree/main/metrics) can also be used, and will be loaded if a given metric name is not one natively supported in `lm-eval`.
+For a full list of natively supported metrics and aggregation functions see `docs/advanced_task_guide.md`. All metrics supported in [HuggingFace Evaluate](https://github.com/huggingface/evaluate/tree/main/metrics) can also be used, and will be loaded if a given metric name is not one natively supported in `lm-eval` or `hf_evaluate` is set to `true`.

 ### Optional, More Advanced Setup
......
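A minimal sketch (not part of this diff) of the fallback the documentation describes, assuming the `evaluate` package is installed; `bleu` stands in for any metric name that `lm-eval` does not natively register:

```python
import evaluate

# "bleu" is not a native lm-eval metric, so it is fetched from the
# HF Evaluate hub and its compute() is used as the metric function.
bleu = evaluate.load("bleu")
print(bleu.compute(predictions=["hello there"], references=[["hello there"]]))
```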
@@ -117,24 +117,23 @@ def register_metric(**args):
     return decorate


-def get_metric(name):
+def get_metric(name, hf_evaluate_metric=False):
+    if not hf_evaluate_metric:
+        if name in METRIC_REGISTRY:
+            return METRIC_REGISTRY[name]
+        else:
+            eval_logger.warning(
+                f"Could not find registered metric '{name}' in lm-eval, searching in HF Evaluate library..."
+            )
+
     try:
-        return METRIC_REGISTRY[name]
-    except KeyError:
-        # TODO: change this print to logging?
-        print(
-            f"Could not find registered metric '{name}' in lm-eval, \
-                searching in HF Evaluate library..."
-        )
-        try:
-            metric_object = evaluate.load(name)
-            return metric_object.compute
-        except Exception:
-            eval_logger.error(
-                "{} not found in the evaluate library!".format(name),
-                "Please check https://huggingface.co/evaluate-metric",
-            )
+        metric_object = evaluate.load(name)
+        return metric_object.compute
+    except Exception:
+        eval_logger.error(
+            f"{name} not found in the evaluate library! Please check https://huggingface.co/evaluate-metric",
+        )


 def register_aggregation(name):
......
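A usage sketch (not part of this diff) of the new signature, assuming `get_metric` is imported from this module; the metric names are illustrative, with `acc` assumed to be present in `METRIC_REGISTRY`:

```python
# Registry path: resolved from METRIC_REGISTRY (assuming "acc" is registered).
acc_fn = get_metric("acc")

# HF Evaluate path: skips the registry and returns evaluate.load("f1").compute.
f1_fn = get_metric("f1", hf_evaluate_metric=True)

# Extra kwargs such as `average` are forwarded to the underlying metric.
print(f1_fn(predictions=[0, 1, 1], references=[0, 1, 0], average="micro"))
```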
@@ -555,8 +555,13 @@ class ConfigurableTask(Task):
                 kwargs = {
                     key: metric_config[key]
                     for key in metric_config
-                    if key not in ["metric", "aggregation", "higher_is_better"]
+                    if key
+                    not in ["metric", "aggregation", "higher_is_better", "hf_evaluate"]
                 }
+                hf_evaluate_metric = (
+                    "hf_evaluate" in metric_config
+                    and metric_config["hf_evaluate"] is True
+                )

                 if self.config.process_results is not None:
                     self._metric_fn_list[metric_name] = None
@@ -567,7 +572,9 @@ class ConfigurableTask(Task):
                     self._metric_fn_list[metric_name] = metric_fn
                     self._metric_fn_kwargs[metric_name] = kwargs
                 else:
-                    self._metric_fn_list[metric_name] = get_metric(metric_name)
+                    self._metric_fn_list[metric_name] = get_metric(
+                        metric_name, hf_evaluate_metric
+                    )
                     self._metric_fn_kwargs[metric_name] = kwargs

                 if "aggregation" in metric_config:
@@ -1068,6 +1075,7 @@ class ConfigurableTask(Task):
         elif self.OUTPUT_TYPE == "greedy_until":
             gold = self.doc_to_target(doc)
+            result = results[0]

             if self.config.doc_to_choice is not None:
                 # If you set doc_to_choice,
                 # it assumes that doc_to_target returns a number.
@@ -1076,10 +1084,10 @@ class ConfigurableTask(Task):
             # we expect multiple_targets to be a list.
             elif self.multiple_target:
                 gold = list(gold)
-            else:
-                gold = str(gold)
+            elif type(gold) != type(result):
+                # cast gold to the same type as result
+                gold = type(result)(gold)

-            result = results[0]
             for metric in self._metric_fn_list.keys():
                 if self.multiple_target:
                     # in the case where we have multiple targets,
......
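A minimal sketch (not part of this diff) of the new casting rule: `gold` used to be unconditionally stringified, and is now cast to the type of the filtered response instead:

```python
gold, result = 2, "2"          # doc_to_target returned an int, the model a str
if type(gold) != type(result):
    gold = type(result)(gold)  # int 2 -> str "2"
assert gold == result
```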
 from lm_eval.api.filter import FilterEnsemble

 from . import selection
 from . import extraction
+from . import transformation

 FILTER_REGISTRY = {
@@ -9,6 +10,9 @@ FILTER_REGISTRY = {
     "majority_vote": selection.MajorityVoteFilter,
     "take_first_k": selection.TakeKFilter,
     "remove_whitespace": extraction.WhitespaceFilter,
+    "lowercase": transformation.LowercaseFilter,
+    "uppercase": transformation.UppercaseFilter,
+    "map": transformation.MapFilter,
     # TODO: implement this filter. either it should take in an arbitrary "scoring"/reward function
     # that takes an input and returns a scalar and then should select the max reward,
     # or should implement different filters for different ways of handling a reward model's inference.
......
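A usage sketch (not part of this diff) exercising one of the newly registered filters directly; task configs normally reach these through a `filter_list` block:

```python
lower = FILTER_REGISTRY["lowercase"]()
resps = [["YES", "No"], ["MAYBE"]]    # one inner list of responses per document
print(lower.apply(resps, docs=None))  # [['yes', 'no'], ['maybe']]
```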
from lm_eval.api.filter import Filter


class LowercaseFilter(Filter):
    """Lowercases every model response."""

    def __init__(self) -> None:
        pass

    def apply(self, resps, docs):
        def filter_set(inst):
            return [resp.lower() for resp in inst]

        return [filter_set(resp) for resp in resps]


class UppercaseFilter(Filter):
    """Uppercases every model response."""

    def __init__(self) -> None:
        pass

    def apply(self, resps, docs):
        def filter_set(inst):
            return [resp.upper() for resp in inst]

        return [filter_set(resp) for resp in resps]


class MapFilter(Filter):
    def __init__(self, mapping_dict: dict = None, default_value=None) -> None:
        """
        Initializes the MapFilter with a given mapping dictionary and default value.

        Args:
        - mapping_dict (dict): A dictionary containing the key-value mappings.
          Defaults to an empty dictionary.
        - default_value (Any): The value returned when a key is not found in
          the mapping_dict. Defaults to None.

        Example:
        mapper = MapFilter({'A': 1, 'B': 2}, default_value=0)
        """
        # Default to None and create the dict here, to avoid the
        # shared-mutable-default-argument pitfall.
        if mapping_dict is None:
            mapping_dict = {}
        assert isinstance(
            mapping_dict, dict
        ), "Provided mapping_dict is not a dictionary"
        self.mapping_dict = mapping_dict
        self.default_value = default_value

    def apply(self, resps, docs):
        def filter_set(inst):
            return [self.mapping_dict.get(resp, self.default_value) for resp in inst]

        return [filter_set(resp) for resp in resps]
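A usage sketch (not part of this diff) of `MapFilter`, mapping extracted letter answers to label ids with a fallback for unmapped values:

```python
mapper = MapFilter({"A": 0, "B": 1}, default_value=-1)
print(mapper.apply([["A", "B"], ["X"]], docs=None))  # [[0, 1], [-1]]
```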
# PolEmo 2.0
### Paper
Title: `Multi-Level Sentiment Analysis of PolEmo 2.0: Extended Corpus of Multi-Domain Consumer Reviews`
Abstract: https://aclanthology.org/K19-1092/
PolEmo 2.0 is a dataset of online consumer reviews in Polish from four domains: medicine, hotels, products, and university. It is human-annotated at the level of both full reviews and individual sentences. It comprises over 8,000 reviews, about 85% of them from the medicine and hotel domains.
The goal is to predict the sentiment of a review. There are two separate test sets, to allow for in-domain (medicine and hotels) as well as out-of-domain (products and university) validation.
Homepage: https://clarin-pl.eu/dspace/handle/11321/710
### Citation
```
@inproceedings{kocon-etal-2019-multi,
title = "Multi-Level Sentiment Analysis of {P}ol{E}mo 2.0: Extended Corpus of Multi-Domain Consumer Reviews",
author = "Koco{\'n}, Jan and
Mi{\l}kowski, Piotr and
Za{\'s}ko-Zieli{\'n}ska, Monika",
booktitle = "Proceedings of the 23rd Conference on Computational Natural Language Learning (CoNLL)",
month = nov,
year = "2019",
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/K19-1092",
doi = "10.18653/v1/K19-1092",
pages = "980--991",
abstract = "In this article we present an extended version of PolEmo {--} a corpus of consumer reviews from 4 domains: medicine, hotels, products and school. Current version (PolEmo 2.0) contains 8,216 reviews having 57,466 sentences. Each text and sentence was manually annotated with sentiment in 2+1 scheme, which gives a total of 197,046 annotations. We obtained a high value of Positive Specific Agreement, which is 0.91 for texts and 0.88 for sentences. PolEmo 2.0 is publicly available under a Creative Commons copyright license. We explored recent deep learning approaches for the recognition of sentiment, such as Bi-directional Long Short-Term Memory (BiLSTM) and Bidirectional Encoder Representations from Transformers (BERT).",
}
```
### Groups and Tasks
#### Groups
* `polemo2`: Evaluates `polemo2_in` and `polemo2_out`
#### Tasks
* `polemo2_in`: evaluates sentiment predictions of in-domain (medicine and hotels) reviews
* `polemo2_out`: evaluates sentiment predictions of out-of-domain (products and university) reviews
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation?
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
group:
  - polemo2
task: polemo2_in
dataset_path: allegro/klej-polemo2-in
dataset_name: klej-polemo2-in
output_type: greedy_until
training_split: train
validation_split: validation
test_split: test
doc_to_text: "Opinia: \"{{sentence}}\"\nOkreśl sentyment podanej opinii. Możliwe odpowiedzi:\nA - Neutralny\nB - Negatywny\nC - Pozytywny\nD - Niejednoznaczny\nPrawidłowa odpowiedź:"
doc_to_target: "{{['__label__meta_zero', '__label__meta_minus_m', '__label__meta_plus_m', '__label__meta_amb'].index(target)}}"
should_decontaminate: true
doc_to_decontamination_query: "{{sentence}}"
generation_kwargs:
  until:
    - "."
    - ","
  do_sample: false
  temperature: 0.0
  max_gen_toks: 50
filter_list:
  - name: "score-first"
    filter:
      - function: "regex"
        regex_pattern: "(\\b[ABCD]\\b)"
      - function: "take_first"
      - function: "map"
        mapping_dict:
          A: 0
          B: 1
          C: 2
          D: 3
        default_value: -1
      - function: "take_first"
metric_list:
  - metric: f1
    aggregation: mean
    higher_is_better: true
    hf_evaluate: true
    average: micro
  - metric: accuracy
    aggregation: mean
    higher_is_better: true
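A sketch (not part of this diff) of what the `score-first` pipeline above does to one hypothetical model continuation, with the registered `regex`, `take_first`, and `map` stages inlined as plain Python:

```python
import re

raw = "B - Negatywny."                           # hypothetical model output
matches = re.findall(r"(\b[ABCD]\b)", raw)       # "regex" stage
first = matches[0] if matches else "[invalid]"   # "take_first" stage
label = {"A": 0, "B": 1, "C": 2, "D": 3}.get(first, -1)  # "map" stage
print(label)  # 1 -- compared against the index produced by doc_to_target
```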
include: polemo2_in.yaml
task: polemo2_out
dataset_path: allegro/klej-polemo2-out
dataset_name: klej-polemo2-out