Unverified commit 0256c682, authored by Uanu and committed by GitHub
Browse files

Add multilingual ARC task (#1419)

parent 1ff84897
# Task "arc_ne": reads dataset alexandrainst/m_arc with dataset_name "ne"
# and names its train / validation / test splits.
# NOTE(review): all other settings presumably come from the included _arc_yaml — confirm.
include: _arc_yaml
task: arc_ne
dataset_path: alexandrainst/m_arc
dataset_name: ne
training_split: train
validation_split: validation
test_split: test
# Task "arc_nl": reads dataset alexandrainst/m_arc with dataset_name "nl"
# and names its train / validation / test splits.
# NOTE(review): all other settings presumably come from the included _arc_yaml — confirm.
include: _arc_yaml
task: arc_nl
dataset_path: alexandrainst/m_arc
dataset_name: nl
training_split: train
validation_split: validation
test_split: test
# Task "arc_pt": reads dataset alexandrainst/m_arc with dataset_name "pt"
# and names its train / validation / test splits.
# NOTE(review): all other settings presumably come from the included _arc_yaml — confirm.
include: _arc_yaml
task: arc_pt
dataset_path: alexandrainst/m_arc
dataset_name: pt
training_split: train
validation_split: validation
test_split: test
# Task "arc_ro": reads dataset alexandrainst/m_arc with dataset_name "ro"
# and names its train / validation / test splits.
# NOTE(review): all other settings presumably come from the included _arc_yaml — confirm.
include: _arc_yaml
task: arc_ro
dataset_path: alexandrainst/m_arc
dataset_name: ro
training_split: train
validation_split: validation
test_split: test
# Task "arc_ru": reads dataset alexandrainst/m_arc with dataset_name "ru"
# and names its train / validation / test splits.
# NOTE(review): all other settings presumably come from the included _arc_yaml — confirm.
include: _arc_yaml
task: arc_ru
dataset_path: alexandrainst/m_arc
dataset_name: ru
training_split: train
validation_split: validation
test_split: test
# Task "arc_sk": reads dataset alexandrainst/m_arc with dataset_name "sk"
# and names its train / validation / test splits.
# NOTE(review): all other settings presumably come from the included _arc_yaml — confirm.
include: _arc_yaml
task: arc_sk
dataset_path: alexandrainst/m_arc
dataset_name: sk
training_split: train
validation_split: validation
test_split: test
# Task "arc_sr": reads dataset alexandrainst/m_arc with dataset_name "sr"
# and names its train / validation / test splits.
# NOTE(review): all other settings presumably come from the included _arc_yaml — confirm.
include: _arc_yaml
task: arc_sr
dataset_path: alexandrainst/m_arc
dataset_name: sr
training_split: train
validation_split: validation
test_split: test
# Task "arc_sv": reads dataset alexandrainst/m_arc with dataset_name "sv"
# and names its train / validation / test splits.
# NOTE(review): all other settings presumably come from the included _arc_yaml — confirm.
include: _arc_yaml
task: arc_sv
dataset_path: alexandrainst/m_arc
dataset_name: sv
training_split: train
validation_split: validation
test_split: test
# Task "arc_ta": reads dataset alexandrainst/m_arc with dataset_name "ta"
# and names its train / validation / test splits.
# NOTE(review): all other settings presumably come from the included _arc_yaml — confirm.
include: _arc_yaml
task: arc_ta
dataset_path: alexandrainst/m_arc
dataset_name: ta
training_split: train
validation_split: validation
test_split: test
# Task "arc_te": reads dataset alexandrainst/m_arc with dataset_name "te"
# and names its train / validation / test splits.
# NOTE(review): all other settings presumably come from the included _arc_yaml — confirm.
include: _arc_yaml
task: arc_te
dataset_path: alexandrainst/m_arc
dataset_name: te
training_split: train
validation_split: validation
test_split: test
# Task "arc_uk": reads dataset alexandrainst/m_arc with dataset_name "uk"
# and names its train / validation / test splits.
# NOTE(review): all other settings presumably come from the included _arc_yaml — confirm.
include: _arc_yaml
task: arc_uk
dataset_path: alexandrainst/m_arc
dataset_name: uk
training_split: train
validation_split: validation
test_split: test
# Task "arc_vi": reads dataset alexandrainst/m_arc with dataset_name "vi"
# and names its train / validation / test splits.
# NOTE(review): all other settings presumably come from the included _arc_yaml — confirm.
include: _arc_yaml
task: arc_vi
dataset_path: alexandrainst/m_arc
dataset_name: vi
training_split: train
validation_split: validation
test_split: test
# Task "arc_zh": reads dataset alexandrainst/m_arc with dataset_name "zh"
# and names its train / validation / test splits.
# NOTE(review): all other settings presumably come from the included _arc_yaml — confirm.
include: _arc_yaml
task: arc_zh
dataset_path: alexandrainst/m_arc
dataset_name: zh
training_split: train
validation_split: validation
test_split: test
import datasets
import re
def preprocess(text):
    """Normalize a raw question or answer-option string.

    Args:
        text: The raw string, or ``None`` when the field is absent.

    Returns:
        A single space when ``text`` is ``None``. Otherwise the text with
        surrounding whitespace stripped, every `` [title]`` marker replaced
        by ``". "``, every remaining ``[...]`` span removed, and each pair
        of adjacent spaces squeezed to one (single pass).
    """
    if text is None:
        # Absent field: hand back a one-space placeholder instead of failing.
        return " "
    text = text.strip()
    text = text.replace(" [title]", ". ")
    # Non-greedy match, so each bracket pair is removed on its own.
    text = re.sub(r"\[.*?\]", "", text)
    # The two substitutions above leave doubled spaces behind
    # ("a [x] b" -> "a  b"); squeeze each pair back to a single space.
    text = text.replace("  ", " ")
    return text
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    """Map every row of ``dataset`` through a per-row conversion.

    For each row the mapped function returns a dict with:

    * ``id``: the row's ``id`` value, unchanged.
    * ``query``: ``"Question: "`` + the preprocessed ``instruction`` +
      ``"\\nAnswer:"``.
    * ``choices``: the preprocessed ``option_a`` .. ``option_e`` values,
      in that order.
    * ``gold``: the position of the row's ``answer`` within
      ``["A", "B", "C", "D", "E"]``; ``list.index`` raises ``ValueError``
      for any other value.

    Returns:
        The result of ``dataset.map`` applied with that conversion.
    """
    option_fields = ("option_a", "option_b", "option_c", "option_d", "option_e")
    answer_letters = ["A", "B", "C", "D", "E"]

    def _convert_row(row):
        return {
            "id": row["id"],
            "query": "Question: " + preprocess(row["instruction"]) + "\nAnswer:",
            "choices": [preprocess(row[field]) for field in option_fields],
            "gold": answer_letters.index(row["answer"]),
        }

    return dataset.map(_convert_row)
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment