Unverified commit 4824a832, authored by Lintang Sutawika, committed by GitHub

Merge pull request #883 from chrisociepa/transformation-filters

Add transformation filters
parents b8faaa93 06ce7a62
@@ -214,7 +214,7 @@ metric_list:
 ```
 `aggregation` and `higher_is_better` can optionally be left out to default to the manually-set defaults if using a natively supported metric, otherwise it must be defined explicitly (for example, when using a custom metric implemented as a function).
-For a full list of natively supported metrics and aggregation functions see `docs/advanced_task_guide.md`. All metrics supported in [HuggingFace Evaluate](https://github.com/huggingface/evaluate/tree/main/metrics) can also be used, and will be loaded if a given metric name is not one natively supported in `lm-eval`.
+For a full list of natively supported metrics and aggregation functions see `docs/advanced_task_guide.md`. All metrics supported in [HuggingFace Evaluate](https://github.com/huggingface/evaluate/tree/main/metrics) can also be used, and will be loaded if a given metric name is not one natively supported in `lm-eval` or `hf_evaluate` is set to `true`.

 ### Optional, More Advanced Setup
......
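A minimal sketch (not part of this diff) of the fallback the documentation describes, assuming the `evaluate` package is installed; `bleu` stands in for any metric name that `lm-eval` does not natively register:

```python
import evaluate

# "bleu" is not a native lm-eval metric, so it is fetched from the
# HF Evaluate hub and its compute() is used as the metric function.
bleu = evaluate.load("bleu")
print(bleu.compute(predictions=["hello there"], references=[["hello there"]]))
```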
@@ -117,24 +117,23 @@ def register_metric(**args):
     return decorate


-def get_metric(name):
+def get_metric(name, hf_evaluate_metric=False):
+    if not hf_evaluate_metric:
+        if name in METRIC_REGISTRY:
+            return METRIC_REGISTRY[name]
+        else:
+            eval_logger.warning(
+                f"Could not find registered metric '{name}' in lm-eval, searching in HF Evaluate library..."
+            )
+
     try:
-        return METRIC_REGISTRY[name]
-    except KeyError:
-        # TODO: change this print to logging?
-        print(
-            f"Could not find registered metric '{name}' in lm-eval, \
-                searching in HF Evaluate library..."
-        )
-        try:
-            metric_object = evaluate.load(name)
-            return metric_object.compute
-        except Exception:
-            eval_logger.error(
-                "{} not found in the evaluate library!".format(name),
-                "Please check https://huggingface.co/evaluate-metric",
-            )
+        metric_object = evaluate.load(name)
+        return metric_object.compute
+    except Exception:
+        eval_logger.error(
+            f"{name} not found in the evaluate library! Please check https://huggingface.co/evaluate-metric",
+        )


 def register_aggregation(name):
......
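A usage sketch (not part of this diff) of the new signature, assuming `get_metric` is imported from this module; the metric names are illustrative, with `acc` assumed to be present in `METRIC_REGISTRY`:

```python
# Registry path: resolved from METRIC_REGISTRY (assuming "acc" is registered).
acc_fn = get_metric("acc")

# HF Evaluate path: skips the registry and returns evaluate.load("f1").compute.
f1_fn = get_metric("f1", hf_evaluate_metric=True)

# Extra kwargs such as `average` are forwarded to the underlying metric.
print(f1_fn(predictions=[0, 1, 1], references=[0, 1, 0], average="micro"))
```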
@@ -555,8 +555,13 @@ class ConfigurableTask(Task):
                 kwargs = {
                     key: metric_config[key]
                     for key in metric_config
-                    if key not in ["metric", "aggregation", "higher_is_better"]
+                    if key
+                    not in ["metric", "aggregation", "higher_is_better", "hf_evaluate"]
                 }
+                hf_evaluate_metric = (
+                    "hf_evaluate" in metric_config
+                    and metric_config["hf_evaluate"] is True
+                )

                 if self.config.process_results is not None:
                     self._metric_fn_list[metric_name] = None
@@ -567,7 +572,9 @@ class ConfigurableTask(Task):
                     self._metric_fn_list[metric_name] = metric_fn
                     self._metric_fn_kwargs[metric_name] = kwargs
                 else:
-                    self._metric_fn_list[metric_name] = get_metric(metric_name)
+                    self._metric_fn_list[metric_name] = get_metric(
+                        metric_name, hf_evaluate_metric
+                    )
                     self._metric_fn_kwargs[metric_name] = kwargs

                 if "aggregation" in metric_config:
@@ -1068,6 +1075,7 @@ class ConfigurableTask(Task):
         elif self.OUTPUT_TYPE == "greedy_until":
             gold = self.doc_to_target(doc)
+            result = results[0]

             if self.config.doc_to_choice is not None:
                 # If you set doc_to_choice,
                 # it assumes that doc_to_target returns a number.
@@ -1076,10 +1084,10 @@ class ConfigurableTask(Task):
             # we expect multiple_targets to be a list.
             elif self.multiple_target:
                 gold = list(gold)
-            else:
-                gold = str(gold)
+            elif type(gold) != type(result):
+                # cast gold to the same type as result
+                gold = type(result)(gold)

-            result = results[0]
             for metric in self._metric_fn_list.keys():
                 if self.multiple_target:
                     # in the case where we have multiple targets,
......
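A minimal sketch (not part of this diff) of the new casting rule: `gold` used to be unconditionally stringified, and is now cast to the type of the filtered response instead:

```python
gold, result = 2, "2"          # doc_to_target returned an int, the model a str
if type(gold) != type(result):
    gold = type(result)(gold)  # int 2 -> str "2"
assert gold == result
```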
 from lm_eval.api.filter import FilterEnsemble

 from . import selection
 from . import extraction
+from . import transformation

 FILTER_REGISTRY = {
@@ -9,6 +10,9 @@ FILTER_REGISTRY = {
     "majority_vote": selection.MajorityVoteFilter,
     "take_first_k": selection.TakeKFilter,
     "remove_whitespace": extraction.WhitespaceFilter,
+    "lowercase": transformation.LowercaseFilter,
+    "uppercase": transformation.UppercaseFilter,
+    "map": transformation.MapFilter,
     # TODO: implement this filter. either it should take in an arbitrary "scoring"/reward function
     # that takes an input and returns a scalar and then should select the max reward,
     # or should implement different filters for different ways of handling a reward model's inference.
......
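A usage sketch (not part of this diff) exercising one of the newly registered filters directly; task configs normally reach these through a `filter_list` block:

```python
lower = FILTER_REGISTRY["lowercase"]()
resps = [["YES", "No"], ["MAYBE"]]    # one inner list of responses per document
print(lower.apply(resps, docs=None))  # [['yes', 'no'], ['maybe']]
```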
from lm_eval.api.filter import Filter


class LowercaseFilter(Filter):
    """Lowercases every model response."""

    def __init__(self) -> None:
        pass

    def apply(self, resps, docs):
        def filter_set(inst):
            return [resp.lower() for resp in inst]

        return [filter_set(resp) for resp in resps]


class UppercaseFilter(Filter):
    """Uppercases every model response."""

    def __init__(self) -> None:
        pass

    def apply(self, resps, docs):
        def filter_set(inst):
            return [resp.upper() for resp in inst]

        return [filter_set(resp) for resp in resps]


class MapFilter(Filter):
    def __init__(self, mapping_dict: dict = None, default_value=None) -> None:
        """
        Initializes the MapFilter with a given mapping dictionary and default value.

        Args:
        - mapping_dict (dict): A dictionary containing the key-value mappings.
          Defaults to an empty dictionary.
        - default_value (Any): The value returned when a key is not found in
          the mapping_dict. Defaults to None.

        Example:
        mapper = MapFilter({'A': 1, 'B': 2}, default_value=0)
        """
        # Default to None and create the dict here, to avoid the
        # shared-mutable-default-argument pitfall.
        if mapping_dict is None:
            mapping_dict = {}
        assert isinstance(
            mapping_dict, dict
        ), "Provided mapping_dict is not a dictionary"
        self.mapping_dict = mapping_dict
        self.default_value = default_value

    def apply(self, resps, docs):
        def filter_set(inst):
            return [self.mapping_dict.get(resp, self.default_value) for resp in inst]

        return [filter_set(resp) for resp in resps]
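A usage sketch (not part of this diff) of `MapFilter`, mapping extracted letter answers to label ids with a fallback for unmapped values:

```python
mapper = MapFilter({"A": 0, "B": 1}, default_value=-1)
print(mapper.apply([["A", "B"], ["X"]], docs=None))  # [[0, 1], [-1]]
```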
# PolEmo 2.0
### Paper
Title: `Multi-Level Sentiment Analysis of PolEmo 2.0: Extended Corpus of Multi-Domain Consumer Reviews`
Abstract: https://aclanthology.org/K19-1092/
PolEmo 2.0 is a dataset of online consumer reviews in Polish from four domains: medicine, hotels, products, and university. It is human-annotated at the level of both full reviews and individual sentences. It comprises over 8,000 reviews, about 85% of them from the medicine and hotel domains.
The goal is to predict the sentiment of a review. There are two separate test sets, to allow for in-domain (medicine and hotels) as well as out-of-domain (products and university) validation.
Homepage: https://clarin-pl.eu/dspace/handle/11321/710
### Citation
```
@inproceedings{kocon-etal-2019-multi,
title = "Multi-Level Sentiment Analysis of {P}ol{E}mo 2.0: Extended Corpus of Multi-Domain Consumer Reviews",
author = "Koco{\'n}, Jan and
Mi{\l}kowski, Piotr and
Za{\'s}ko-Zieli{\'n}ska, Monika",
booktitle = "Proceedings of the 23rd Conference on Computational Natural Language Learning (CoNLL)",
month = nov,
year = "2019",
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/K19-1092",
doi = "10.18653/v1/K19-1092",
pages = "980--991",
abstract = "In this article we present an extended version of PolEmo {--} a corpus of consumer reviews from 4 domains: medicine, hotels, products and school. Current version (PolEmo 2.0) contains 8,216 reviews having 57,466 sentences. Each text and sentence was manually annotated with sentiment in 2+1 scheme, which gives a total of 197,046 annotations. We obtained a high value of Positive Specific Agreement, which is 0.91 for texts and 0.88 for sentences. PolEmo 2.0 is publicly available under a Creative Commons copyright license. We explored recent deep learning approaches for the recognition of sentiment, such as Bi-directional Long Short-Term Memory (BiLSTM) and Bidirectional Encoder Representations from Transformers (BERT).",
}
```
### Groups and Tasks
#### Groups
* `polemo2`: Evaluates `polemo2_in` and `polemo2_out`
#### Tasks
* `polemo2_in`: evaluates sentiment predictions of in-domain (medicine and hotels) reviews
* `polemo2_out`: evaluates sentiment predictions of out-of-domain (products and university) reviews
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation?
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
group:
  - polemo2
task: polemo2_in
dataset_path: allegro/klej-polemo2-in
dataset_name: klej-polemo2-in
output_type: greedy_until
training_split: train
validation_split: validation
test_split: test
doc_to_text: "Opinia: \"{{sentence}}\"\nOkreśl sentyment podanej opinii. Możliwe odpowiedzi:\nA - Neutralny\nB - Negatywny\nC - Pozytywny\nD - Niejednoznaczny\nPrawidłowa odpowiedź:"
doc_to_target: "{{['__label__meta_zero', '__label__meta_minus_m', '__label__meta_plus_m', '__label__meta_amb'].index(target)}}"
should_decontaminate: true
doc_to_decontamination_query: "{{sentence}}"
generation_kwargs:
  until:
    - "."
    - ","
  do_sample: false
  temperature: 0.0
  max_gen_toks: 50
filter_list:
  - name: "score-first"
    filter:
      - function: "regex"
        regex_pattern: "(\\b[ABCD]\\b)"
      - function: "take_first"
      - function: "map"
        mapping_dict:
          A: 0
          B: 1
          C: 2
          D: 3
        default_value: -1
      - function: "take_first"
metric_list:
  - metric: f1
    aggregation: mean
    higher_is_better: true
    hf_evaluate: true
    average: micro
  - metric: accuracy
    aggregation: mean
    higher_is_better: true
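A sketch (not part of this diff) of what the `score-first` pipeline above does to one hypothetical model continuation, with the registered `regex`, `take_first`, and `map` stages inlined as plain Python:

```python
import re

raw = "B - Negatywny."                           # hypothetical model output
matches = re.findall(r"(\b[ABCD]\b)", raw)       # "regex" stage
first = matches[0] if matches else "[invalid]"   # "take_first" stage
label = {"A": 0, "B": 1, "C": 2, "D": 3}.get(first, -1)  # "map" stage
print(label)  # 1 -- compared against the index produced by doc_to_target
```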
include: polemo2_in.yaml
task: polemo2_out
dataset_path: allegro/klej-polemo2-out
dataset_name: klej-polemo2-out