f'Both target_delimiter and target choice: "{choice}" do not have whitespace, ignore if the language you are evaluating on does not require/use whitespace'
)
def download(self, dataset_kwargs=None) -> None:
    self.dataset = datasets.load_dataset(
        path=self.DATASET_PATH,
...
...
@@ -1070,6 +1070,9 @@ class ConfigurableTask(Task):
            # it assumes that doc_to_target returns a number.
            choices = self.doc_to_choice(doc)
            gold = choices[gold]
        # we expect multiple_targets to be a list.
        elif self.multiple_target:
            gold = list(gold)
        else:
            gold = str(gold)
...
...
@@ -1080,6 +1083,10 @@ class ConfigurableTask(Task):
        # return true if any are true
        # TODO: this may break for multiple_target, non zero-or-1 metrics
        scores = []
        if not isinstance(gold, list):
            # sometimes, a multiple_target dataset has exceptions where one doc has only one string answer
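To make the intent of this hunk concrete, here is a small standalone sketch of the same pattern; it is not the harness's actual code, and the helper names `normalize_gold` and `score_any` are hypothetical. A lone string reference is wrapped in a list, and the per-document score is 1 if any reference matches.

```python
def normalize_gold(gold):
    """Hypothetical helper: multiple_target docs should carry a list of
    references, but some docs only provide a single string answer."""
    return gold if isinstance(gold, list) else [gold]


def score_any(prediction, gold):
    """Hypothetical helper: score each reference, then 'return true if any
    are true' by taking the max over per-reference 0/1 scores."""
    references = normalize_gold(gold)
    scores = [1.0 if prediction == ref else 0.0 for ref in references]
    return max(scores)


# A doc with a list of references and a doc with a bare string both work.
assert score_any("Paris", ["Paris", "paris"]) == 1.0
assert score_any("Paris", "Paris") == 1.0
assert score_any("Lyon", ["Paris", "paris"]) == 0.0
```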
MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is
modified from Chinese high school English listening comprehension test data.
Homepage: https://github.com/Nealcly/MuTual
### Citation
```
@inproceedings{mutual,
title = "MuTual: A Dataset for Multi-Turn Dialogue Reasoning",
author = "Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming" ,
booktitle = "Proceedings of the 58th Conference of the Association for Computational Linguistics",
year = "2020",
publisher = "Association for Computational Linguistics",
}
```
### Groups and Tasks
#### Groups
* Not part of a group yet.
#### Tasks
* `mutual`
* `mutual_plus`
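Both variants can be run through the harness's Python API. The sketch below is illustrative only: `hf` and `pretrained=gpt2` are placeholder model arguments, and the exact signature of `simple_evaluate` may differ between lm-evaluation-harness versions.

```python
import lm_eval

# Evaluate both MuTual variants zero-shot with a small placeholder model;
# substitute your own model type and checkpoint as appropriate.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=gpt2",
    tasks=["mutual", "mutual_plus"],
    num_fewshot=0,
)
print(results["results"])
```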
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?