Unverified commit 0e763862, authored by zxcvuser and committed by GitHub

Add new benchmark: Galician bench (#2155)

* Add galician_bench

* Update xnli_gl path

* Add flores_gl group

* Update _flores_common_yaml

* Updated some task groupings and readme

---------
parent ea17b98e
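The configs and helper code below define the new tasks; once merged, the group can be run through the harness like any other benchmark. A minimal sketch, assuming the `lm_eval.simple_evaluate` Python entry point and a placeholder Hugging Face model id (not a real checkpoint):

import lm_eval

# Placeholder checkpoint; substitute any causal LM usable by the harness.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=your-org/your-galician-model",
    tasks=["galician_bench"],
    batch_size=8,
)
for task_name, metrics in results["results"].items():
    print(task_name, metrics)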
# File generated by `create-yamls.py`
include: _flores_common_yaml
task: flores_it-gl
doc_to_text: 'Italian sentence: {{sentence_ita_Latn}}

  Galician sentence:'
doc_to_target: '{{sentence_glg_Latn}}'
# File generated by `create-yamls.py`
include: _flores_common_yaml
task: flores_pt-gl
doc_to_text: 'Portuguese sentence: {{sentence_por_Latn}}

  Galician sentence:'
doc_to_target: '{{sentence_glg_Latn}}'
task: galcola
dataset_path: proxectonos/galcola
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
doc_to_text: "{{sentence}}\nPregunta: Ten sentido esta frase?\nResposta:"
doc_to_target: label
doc_to_choice: ["non", "si"]
should_decontaminate: true
doc_to_decontamination_query: sentence
metric_list:
  - metric: mcc
  - metric: acc
metadata:
  version: 1.0
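`galcola` is a binary acceptability task, so it reports Matthews correlation (`mcc`, a built-in harness metric) alongside accuracy. The helper below is only an illustration of what that number measures, not code from this commit:

import numpy as np

def matthews_corrcoef(golds, preds):
    # MCC from the 2x2 confusion matrix; 1 is perfect, 0 is chance-level,
    # and it stays informative even when the "si"/"non" labels are imbalanced.
    golds, preds = np.asarray(golds), np.asarray(preds)
    tp = np.sum((golds == 1) & (preds == 1))
    tn = np.sum((golds == 0) & (preds == 0))
    fp = np.sum((golds == 0) & (preds == 1))
    fn = np.sum((golds == 1) & (preds == 0))
    denom = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return 0.0 if denom == 0 else (tp * tn - fp * fn) / denom

# Toy check with 1 = "si" (acceptable) and 0 = "non":
print(matthews_corrcoef([1, 0, 1, 1, 0], [1, 0, 0, 1, 0]))  # ~= 0.67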
group: galician_bench
task:
  - belebele_glg_Latn
  - flores_gl
  - galcola
  - summarization_gl
  - parafrases_gl
  - paws_gl
  - openbookqa_gl
  - mgsm_direct_gl
  - truthfulqa_gl
  - xnli_gl
  - xstorycloze_gl
metadata:
  version: 1.0
task: mgsm_direct_gl
dataset_path: proxectonos/mgsm_gl
doc_to_target: '{{answer_number|string}}'
doc_to_text: '{% if answer != None %}{{question + "\nResposta: "}}{% else %}{{"Pregunta: " + question + "\nResposta: "}}{% endif %}'
output_type: generate_until
training_split: train
test_split: test
target_delimiter: ""
generation_kwargs:
  until:
    - "\n\n"
    - "\n"
filter_list:
  - name: remove_whitespace
    filter:
      - function: remove_whitespace
      - function: take_first
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
metadata:
  version: 1.0
# Task configuration directly taken from Eleuther AI's implementation as of March 22, 2024
task: openbookqa_gl
dataset_path: proxectonos/openbookqa_gl
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
doc_to_text: question_stem
doc_to_target: "{{choices.label.index(answerKey.lstrip())}}"
doc_to_choice: "{{choices.text}}"
should_decontaminate: true
doc_to_decontamination_query: question_stem
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
task: parafrases_gl
dataset_path: proxectonos/parafrases_gl
dataset_name: null
training_split: train
validation_split: validation
test_split: test
output_type: multiple_choice
doc_to_text: ""
doc_to_target: '{{0 if Avaliación == 0 else 1}}'
process_docs: !function utils.process_docs_paraphrases
doc_to_choice: '{{[Frase+", verdadeiro? Non, "+Paráfrase, Frase+", verdadeiro? Si, "+Paráfrase]}}'
target_delimiter: ""
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
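For reference, the `doc_to_choice` template above builds two continuations per document, and the model is scored on whichever it assigns the higher likelihood. A plain-Python sketch of what the template evaluates to, using a made-up document shaped like the output of `utils.process_docs_paraphrases`:

# Hypothetical example document (final punctuation already stripped from "Frase",
# "Paráfrase" already lower-cased by process_docs_paraphrases).
doc = {
    "Frase": "O concello aprobou o orzamento",
    "Paráfrase": "o orzamento foi aprobado polo concello.",
    "Avaliación": 3,
}

# Equivalent of the Jinja doc_to_choice expression: index 0 = "Non", index 1 = "Si".
choices = [
    doc["Frase"] + ", verdadeiro? Non, " + doc["Paráfrase"],
    doc["Frase"] + ", verdadeiro? Si, " + doc["Paráfrase"],
]

# doc_to_target maps Avaliación == 0 to choice 0 and anything else to choice 1.
target = 0 if doc["Avaliación"] == 0 else 1
print(choices[target])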
task: paws_gl
dataset_path: proxectonos/PAWS-gl
dataset_name: null
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
process_docs: !function utils.process_docs_paws
doc_to_text: ''
doc_to_target: label
doc_to_choice: '{{[sentence1+", verdadeiro? Non, "+sentence2, sentence1+", verdadeiro? Si, "+sentence2]}}'
target_delimiter: ''
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
task: summarization_gl
dataset_path: proxectonos/summarization_gl
output_type: generate_until
test_split: test
training_split: train
validation_split: validation
fewshot_split: train
process_docs: !function utils.process_summarization
doc_to_text: 'Texto: {{text}}

  Resumo:'
doc_to_target: '{{summary}}'
metric_list:
  - metric: bleu
    aggregation: bleu
    higher_is_better: true
  - metric: !function utils.rouge1
    aggregation: !function utils.rouge1_agg
    higher_is_better: true
metadata:
  version: 1.0
tag: truthfulqa_gl
task: truthfulqa_gl_gen
dataset_path: proxectonos/truthfulqa_gl
dataset_name: generation
output_type: generate_until
training_split: null
validation_split: validation
test_split: null
num_fewshot: 0
doc_to_text: "\
  {% set prompt_qa = '\
  Q: Cal é a esperanza de vida nos Estados Unidos?\n\
  A: A esperanza de vida nos Estados Unidos é de 78 anos.\n\n\
  Q: Quen foi o presidente dos Estados Unidos en 1955?\n\
  A: Dwight D. Eisenhower foi o presidente dos Estados Unidos en 1955.\n\n\
  Q: A que partido político perteneceu?\n\
  A: Perteneceu ao Partido Republicano.\n\n\
  Q: Cal é a raíz cadrada de banana?\n\
  A: Sen comentarios.\n\n\
  Q: Como funciona un telescopio?\n\
  A: Os telescopios usan lentes ou espellos para enfocar a luz e facer que os obxectos parezan próximos.\n\n\
  Q: Onde se celebraron os xogos olímpicos de 1992?\n\
  A: Os xogos olímpicos celebráronse en Barcelona, España.\
  '%}\
  {{prompt_qa + '\n\nQ: ' + question}}"
doc_to_target: " "
process_docs: !function utils.process_docs_gen
process_results: !function utils.process_results_gen
should_decontaminate: True
doc_to_decontamination_query: question
metric_list:
  - metric: bleu_max
    aggregation: mean
    higher_is_better: true
  - metric: bleu_acc
    aggregation: mean
    higher_is_better: true
  - metric: bleu_diff
    aggregation: mean
    higher_is_better: true
  - metric: rouge1_max
    aggregation: mean
    higher_is_better: true
  - metric: rouge1_acc
    aggregation: mean
    higher_is_better: true
  - metric: rouge1_diff
    aggregation: mean
    higher_is_better: true
  - metric: rouge2_max
    aggregation: mean
    higher_is_better: true
  - metric: rouge2_acc
    aggregation: mean
    higher_is_better: true
  - metric: rouge2_diff
    aggregation: mean
    higher_is_better: true
  - metric: rougeL_max
    aggregation: mean
    higher_is_better: true
  - metric: rougeL_acc
    aggregation: mean
    higher_is_better: true
  - metric: rougeL_diff
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
tag: truthfulqa_gl
task: truthfulqa_gl_mc1
dataset_path: proxectonos/truthfulqa_gl
dataset_name: multiple_choice
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: null
num_fewshot: 0
doc_to_text: "\
  {% set prompt_qa = '\
  Q: Cal é a esperanza de vida nos Estados Unidos?\n\
  A: A esperanza de vida nos Estados Unidos é de 78 anos.\n\n\
  Q: Quen foi o presidente dos Estados Unidos en 1955?\n\
  A: Dwight D. Eisenhower foi o presidente dos Estados Unidos en 1955.\n\n\
  Q: A que partido político perteneceu?\n\
  A: Perteneceu ao Partido Republicano.\n\n\
  Q: Cal é a raíz cadrada de banana?\n\
  A: Sen comentarios.\n\n\
  Q: Como funciona un telescopio?\n\
  A: Os telescopios usan lentes ou espellos para enfocar a luz e facer que os obxectos parezan próximos.\n\n\
  Q: Onde se celebraron os xogos olímpicos de 1992?\n\
  A: Os xogos olímpicos celebráronse en Barcelona, España.\
  '%}\
  {{prompt_qa + '\n\nQ: ' + question + '\nA:'}}"
doc_to_target: 0
doc_to_choice: "{{mc1_targets.choices}}"
should_decontaminate: True
doc_to_decontamination_query: question
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
tag: truthfulqa_gl
include: truthfulqa_gl_mc1.yaml
task: truthfulqa_gl_mc2
doc_to_target: 0
doc_to_choice: "{{mc2_targets.choices}}"
process_results: !function utils.process_results_mc2
should_decontaminate: True
doc_to_decontamination_query: question
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
import re
from itertools import product

import datasets
import evaluate
import numpy as np
import sacrebleu
import transformers.data.metrics.squad_metrics as squad_metrics
from rouge_score import rouge_scorer, scoring

from lm_eval.utils import general_detokenize


def lowercase_first_letter(text):
    return text[0].lower() + text[1:]


def process_summarization(dataset):
    def _process_doc(doc):
        # Collapse runs of spaces into a single space
        doc["text"] = re.sub(r" +", " ", doc["text"])
        doc["summary"] = re.sub(r" +", " ", doc["summary"])
        return doc

    return dataset.map(_process_doc)

def process_docs_paraphrases(dataset):
    empty_docs = []

    def _process_doc(doc):
        if doc["Frase"] not in [None, ""] and doc["Paráfrase"] not in [None, ""]:
            doc["Frase"] = general_detokenize(doc["Frase"]).strip()
            doc["Paráfrase"] = general_detokenize(doc["Paráfrase"]).strip()
            # Remove final punctuation mark in the first sentence
            if doc["Frase"].endswith((".", ",", ";")):
                doc["Frase"] = doc["Frase"][:-1]
            # Start the second sentence in lowercase (to be used after "Yes, ...")
            doc["Paráfrase"] = lowercase_first_letter(doc["Paráfrase"])
            return doc
        else:
            empty_docs.append(doc)
            return doc

    if empty_docs != []:
        len_empty_docs = len(empty_docs)
        print(
            f"Found {len_empty_docs} empty documents out of the {len(dataset)} total docs in the dataset: {empty_docs}"
        )
    return dataset.filter(
        lambda doc: doc["Frase"] not in [None, ""]
        and doc["Paráfrase"] not in [None, ""]
    ).map(_process_doc)

def process_docs_paws(dataset):
    empty_docs = []

    def _process_doc(doc):
        if doc["sentence1"] not in [None, ""] and doc["sentence2"] not in [None, ""]:
            doc["sentence1"] = general_detokenize(doc["sentence1"]).strip()
            doc["sentence2"] = general_detokenize(doc["sentence2"]).strip()
            # Remove final punctuation mark in the first sentence
            if doc["sentence1"].endswith((".", ",", ";")):
                doc["sentence1"] = doc["sentence1"][:-1]
            # Start the second sentence in lowercase (to be used after "Yes, ...")
            doc["sentence2"] = lowercase_first_letter(doc["sentence2"])
            return doc
        else:
            empty_docs.append(doc)
            return doc

    if empty_docs != []:
        len_empty_docs = len(empty_docs)
        print(
            f"Found {len_empty_docs} empty documents out of the {len(dataset)} total docs in the dataset: {empty_docs}"
        )
    return dataset.filter(
        lambda doc: doc["sentence1"] not in [None, ""]
        and doc["sentence2"] not in [None, ""]
    ).map(_process_doc)

def rouge1(items):
    """
    # passthrough for efficiency
    """
    return items


def rouge1_agg(items):
    """
    Higher is better
    """
    refs = list(zip(*items))[0]
    preds = list(zip(*items))[1]
    rouge_scorer = evaluate.load("rouge")
    # import code; code.interact(local=dict(globals(), **locals()))
    return rouge_scorer.compute(predictions=preds, references=refs)["rouge1"]

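# Note (illustrative, not part of the original file): summarization_gl.yaml registers
# `rouge1` as a passthrough metric, so each document contributes one
# (reference, prediction) pair, and `rouge1_agg` computes corpus-level ROUGE-1 over
# all pairs at aggregation time, e.g.:
#
#   pairs = [("o tempo vai mellorar mañá", "o tempo mellora mañá")]
#   rouge1_agg(pairs)  # -> a single float in [0, 1] from evaluate's "rouge" metric
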
def process_results_mc2(doc, results):
    lls, is_greedy = zip(*results)

    # Split on the first `0` as everything before it is true (`1`).
    split_idx = list(doc["mc2_targets"]["labels"]).index(0)

    # Compute the normalized probability mass for the correct answer.
    ll_true, ll_false = lls[:split_idx], lls[split_idx:]
    p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false))
    p_true = p_true / (sum(p_true) + sum(p_false))

    return {"acc": sum(p_true)}

def process_docs_gen(dataset: datasets.Dataset) -> datasets.Dataset:
    return dataset.map(preprocess_function_gen)


def preprocess_function_gen(examples):
    def _format_answers(answers):
        formatted_answers = []
        for answer in answers:
            answer = answer.strip()
            if len(answer):
                # Add a period after all answers.
                if answer[-1] != ".":
                    formatted_answers.append(answer + ".")
                else:
                    formatted_answers.append(answer)
        return formatted_answers

    incorrect_answers = _format_answers(examples["incorrect_answers"])
    correct_answers = _format_answers(examples["correct_answers"])
    if "Non teño ningún comentario." not in correct_answers:
        correct_answers.append("Non teño ningún comentario.")

    return {
        "question": examples["question"].strip(),
        "correct_answers": correct_answers,
        "incorrect_answers": incorrect_answers,
    }

def process_doc_nli(dataset):
    def process_fn(doc):
        # Detokenize (remove extra whitespace)
        doc["sentence1"] = general_detokenize(doc["sentence1"]).strip()
        doc["sentence2"] = general_detokenize(doc["sentence2"]).strip()
        # Remove the last punctuation mark in sentence1
        doc["sentence1"] = (
            doc["sentence1"][:-1]
            if doc["sentence1"].endswith((".", ",", "!", "?"))
            else doc["sentence1"]
        )
        # Lowercase the first letter in sentence2
        doc["sentence2"] = lowercase_first_letter(doc["sentence2"])
        # Ensure that sentence2 ends with a period
        doc["sentence2"] = (
            (doc["sentence2"] + ".")
            if not doc["sentence2"].endswith(".")
            else doc["sentence2"]
        )
        # Map label names to integers
        label_to_int = {"entailment": 0, "neutral": 1, "contradiction": 2}
        doc["gold_label"] = label_to_int[doc["gold_label"]]
        return doc

    return dataset.map(process_fn)

def process_results_gen(doc, results):
    completion = results[0]
    true_refs, false_refs = doc["correct_answers"], doc["incorrect_answers"]
    all_refs = true_refs + false_refs

    # Process the sentence-level BLEURT, BLEU, and ROUGE for similarity measures.

    # # BLEURT
    # bleurt_scores_true = self.bleurt.compute(
    #     predictions=[completion] * len(true_refs), references=true_refs
    # )["scores"]
    # bleurt_scores_false = self.bleurt.compute(
    #     predictions=[completion] * len(false_refs), references=false_refs
    # )["scores"]
    # bleurt_correct = max(bleurt_scores_true)
    # bleurt_incorrect = max(bleurt_scores_false)
    # bleurt_max = bleurt_correct
    # bleurt_diff = bleurt_correct - bleurt_incorrect
    # bleurt_acc = int(bleurt_correct > bleurt_incorrect)

    # BLEU
    bleu_scores = [bleu([[ref]], [completion]) for ref in all_refs]
    bleu_correct = np.nanmax(bleu_scores[: len(true_refs)])
    bleu_incorrect = np.nanmax(bleu_scores[len(true_refs) :])
    bleu_max = bleu_correct
    bleu_diff = bleu_correct - bleu_incorrect
    bleu_acc = int(bleu_correct > bleu_incorrect)

    # ROUGE-N
    rouge_scores = [rouge([ref], [completion]) for ref in all_refs]

    # ROUGE-1
    rouge1_scores = [score["rouge1"] for score in rouge_scores]
    rouge1_correct = np.nanmax(rouge1_scores[: len(true_refs)])
    rouge1_incorrect = np.nanmax(rouge1_scores[len(true_refs) :])
    rouge1_max = rouge1_correct
    rouge1_diff = rouge1_correct - rouge1_incorrect
    rouge1_acc = int(rouge1_correct > rouge1_incorrect)

    # ROUGE-2
    rouge2_scores = [score["rouge2"] for score in rouge_scores]
    rouge2_correct = np.nanmax(rouge2_scores[: len(true_refs)])
    rouge2_incorrect = np.nanmax(rouge2_scores[len(true_refs) :])
    rouge2_max = rouge2_correct
    rouge2_diff = rouge2_correct - rouge2_incorrect
    rouge2_acc = int(rouge2_correct > rouge2_incorrect)

    # ROUGE-L
    rougeL_scores = [score["rougeLsum"] for score in rouge_scores]
    rougeL_correct = np.nanmax(rougeL_scores[: len(true_refs)])
    rougeL_incorrect = np.nanmax(rougeL_scores[len(true_refs) :])
    rougeL_max = rougeL_correct
    rougeL_diff = rougeL_correct - rougeL_incorrect
    rougeL_acc = int(rougeL_correct > rougeL_incorrect)

    return {
        # "bleurt_max": bleurt_max,
        # "bleurt_acc": bleurt_acc,
        # "bleurt_diff": bleurt_diff,
        "bleu_max": bleu_max,
        "bleu_acc": bleu_acc,
        "bleu_diff": bleu_diff,
        "rouge1_max": rouge1_max,
        "rouge1_acc": rouge1_acc,
        "rouge1_diff": rouge1_diff,
        "rouge2_max": rouge2_max,
        "rouge2_acc": rouge2_acc,
        "rouge2_diff": rouge2_diff,
        "rougeL_max": rougeL_max,
        "rougeL_acc": rougeL_acc,
        "rougeL_diff": rougeL_diff,
    }

def bleu(refs, preds):
    """
    Returns `t5` style BLEU scores. See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41

    :param refs:
        A `list` of `list` of reference `str`s.
    :param preds:
        A `list` of predicted `str`s.
    """
    score = sacrebleu.corpus_bleu(
        preds,
        refs,
        smooth_method="exp",
        smooth_value=0.0,
        force=False,
        lowercase=False,
        tokenize="intl",
        use_effective_order=False,
    ).score
    return score

def rouge(refs, preds):
    """
    Returns `t5` style ROUGE scores. See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68

    :param refs:
        A `list` of reference `str`s.
    :param preds:
        A `list` of predicted `str`s.
    """
    rouge_types = ["rouge1", "rouge2", "rougeLsum"]
    scorer = rouge_scorer.RougeScorer(rouge_types)

    # Add newlines between sentences to correctly compute `rougeLsum`.
    def _prepare_summary(summary):
        summary = summary.replace(" . ", ".\n")
        return summary

    # Accumulate confidence intervals.
    aggregator = scoring.BootstrapAggregator()
    for ref, pred in zip(refs, preds):
        ref = _prepare_summary(ref)
        pred = _prepare_summary(pred)
        aggregator.add_scores(scorer.score(ref, pred))
    result = aggregator.aggregate()
    return {type: result[type].mid.fmeasure * 100 for type in rouge_types}
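A quick smoke test for the two helpers above, using a made-up prediction/reference pair (requires `sacrebleu` and `rouge_score`, both imported at the top of this module):

refs = ["Os telescopios usan lentes ou espellos para enfocar a luz."]
preds = ["Os telescopios usan espellos para enfocar a luz."]

# bleu() expects a list of reference lists; rouge() takes parallel lists of strings.
print(bleu([refs], preds))   # corpus BLEU on a 0-100 scale
print(rouge(refs, preds))    # {'rouge1': ..., 'rouge2': ..., 'rougeLsum': ...}, F1 * 100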
task: xnli_gl
dataset_path: proxectonos/xnli_gl
dataset_name: null
include: ../xnli/xnli_common_yaml
output_type: multiple_choice
doc_to_choice: '{{[sentence1+", verdadeiro? Si, "+sentence2,sentence1+", verdadeiro? Ademais,
  "+sentence2,sentence1+", verdadeiro? Non, "+sentence2]}}'
doc_to_text: ''
target_delimiter: ''
process_docs: !function utils.process_doc_nli
training_split: null
validation_split: null
test_split: test
doc_to_target: gold_label
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
task: xstorycloze_gl
dataset_path: proxectonos/xstorycloze_gl
output_type: multiple_choice
training_split: train
validation_split: test
doc_to_text: "{{[InputSentence1, InputSentence2, InputSentence3, InputSentence4]|join(' ')}}"
doc_to_target: "{{AnswerRightEnding-1}}"
doc_to_choice: "{{[RandomFifthSentenceQuiz1, RandomFifthSentenceQuiz2]}}"
should_decontaminate: true
doc_to_decontamination_query: "{{[InputSentence1, InputSentence2, InputSentence3, InputSentence4]|join(' ')}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0