Commit bf2517cc authored by lintangsutawika

update latest

parents 8bca751c 7397b965
@@ -45,6 +45,7 @@ git clone https://github.com/EleutherAI/lm-evaluation-harness
cd lm-evaluation-harness
pip install -e .
```
We also provide a number of optional dependencies for extended functionality. A detailed table is available at the end of this document.
## Basic Usage
@@ -174,6 +175,7 @@ Note that for externally hosted models, configs such as `--device` and `--batch_
| vLLM | :heavy_check_mark: | `vllm` | [Most HF Causal Language Models](https://docs.vllm.ai/en/latest/models/supported_models.html) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Mamba | :heavy_check_mark: | `mamba_ssm` | [Mamba architecture Language Models via the `mamba_ssm` package](https://huggingface.co/state-spaces) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Huggingface Optimum (Causal LMs) | :heavy_check_mark: | `openvino` | Any decoder-only AutoModelForCausalLM converted with Huggingface Optimum into OpenVINO™ Intermediate Representation (IR) format | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Neuron via AWS Inf2 (Causal LMs) | :heavy_check_mark: | `neuronx` | Any decoder-only AutoModelForCausalLM supported to run on [huggingface-ami image for inferentia2](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Your local inference server! | :heavy_check_mark: | `local-completions` or `local-chat-completions` (using `openai-chat-completions` model type) | Any server address that accepts requests using HF models and mirrors OpenAI's Completions or ChatCompletions interface | `generate_until` |
Models which do not supply logits or logprobs can be used with tasks of type `generate_until` only, while local models, or APIs that supply logprobs/logits of their prompts, can be run on all task types: `generate_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`.
@@ -196,7 +198,7 @@ If you have a Metal compatible Mac, you can run the eval harness using the MPS b
> You can inspect what the LM inputs look like by running the following command:
> ```bash
> python write_out.py \
> --tasks all_tasks \
> --tasks <task1,task2,...> \
> --num_fewshot 5 \
> --num_examples 10 \
> --output_base_path /path/to/output/folder
@@ -312,7 +314,9 @@ Extras dependencies can be installed via `pip install -e ".[NAME]"`
| anthropic | For using Anthropic's models |
| dev | For linting PRs and contributions |
| gptq | For loading models with GPTQ |
| hf_transfer | For speeding up HF Hub file downloads |
| ifeval | For running the IFEval task |
| neuronx | For running on AWS inf2 instances |
| mamba | For loading Mamba SSM models |
| math | For running math task answer checking |
| multilingual | For multilingual tokenizers |
......
@@ -237,7 +237,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
results = evaluator.simple_evaluate(
model=args.model,
model_args=args.model_args,
tasks=all_tasks,
tasks=task_names,
num_fewshot=args.num_fewshot,
batch_size=args.batch_size,
max_batch_size=args.max_batch_size,
......
@@ -3,6 +3,7 @@ import math
import random
from collections.abc import Iterable
from collections import defaultdict
from typing import List
import evaluate
import numpy as np
@@ -459,3 +460,64 @@ def stderr_for_metric(metric, bootstrap_iters):
stderr = {mean: mean_stderr, acc_all: acc_all_stderr}
return stderr.get(metric, None)
def pooled_sample_stderr(stderrs: List[float], sizes: List[int]):
# Used to aggregate bootstrapped stderrs across subtasks in a group,
# when we are weighting by the size of each subtask.
#
assert len(stderrs) == len(sizes)
# formula source: https://en.wikipedia.org/wiki/Pooled_variance
# this empirically matches running `stderr_for_metric` on all instances
# from the subtasks concatenated with each other.
pooled_sample_var = (
sum([(size - 1) * stderr**2 for size, stderr in zip(sizes, stderrs)])
) / (sum(sizes) - len(sizes))
return np.sqrt(pooled_sample_var)
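For clarity, this is the pooled-variance expression the function computes, with $s_i$ the bootstrapped stderr of subtask $i$, $n_i$ its sample count, and $k$ the number of subtasks:

$$s_{\mathrm{pooled}} = \sqrt{\frac{\sum_{i=1}^{k} (n_i - 1)\, s_i^2}{\sum_{i=1}^{k} n_i \,-\, k}}$$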
def combined_sample_stderr(stderrs: List[float], sizes: List[int], metrics=None):
assert (
metrics is not None
), "Need to pass a list of each subtask's metric for this stderr aggregation"
assert len(stderrs) == len(sizes) and len(sizes) == len(metrics)
# See https://github.com/EleutherAI/lm-evaluation-harness/pull/1390 for more documentation.
# This formula depends on sample means.
# removed because it seems to give erroneously huge stderrs for groupings of tasks
# and does not seem to match up with bootstrap-calculated stderrs for groups.
### don't use this unless a statistician has told you it's the right thing to do ###
# accumulators: we'll aggregate pairwise N - 1 times
variance = stderrs[0] ** 2
curr_size = sizes[0]
curr_score = metrics[0]
for stderr, size, score in zip(stderrs[1:], sizes[1:], metrics[1:]):
curr_score = ((curr_score * curr_size) + (score * size)) / (
curr_size + size
) # NOTE: this assumes our aggregation fn is "mean"
variance = ((curr_size - 1) * variance + (size - 1) * (stderr**2)) / (
curr_size + size - 1
) + curr_size * size / ((curr_size + size) * (curr_size + size - 1)) * (
curr_score - score
) ** 2
return np.sqrt(variance)
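For reference, the loop above applies, pairwise and N - 1 times, the two-sample combination rule also quoted as a comment in the old group-aggregation code later in this diff, with subgroup sizes $n, m$, means $\bar x, \bar y$, and variances $s_x^2, s_y^2$:

$$s_z^2 = \frac{(n-1)\,s_x^2 + (m-1)\,s_y^2}{n+m-1} + \frac{nm\,(\bar x - \bar y)^2}{(n+m)(n+m-1)}$$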
def aggregate_subtask_metrics(metrics, sizes, weight_by_size=True):
# A helper function that is used to aggregate
# subtask scores cross-task.
# TODO: does not hold for non-mean aggregations
if not weight_by_size:
sizes = [1] * len(sizes)
assert len(metrics) == len(sizes)
return sum([metric * size for metric, size in zip(metrics, sizes)]) / sum(sizes)
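A minimal usage sketch (the numbers are made up, and the import path assumes these helpers live in `lm_eval.api.metrics`, as the calls in `evaluate()` below suggest) of how a group score and its pooled stderr are derived from per-subtask results:

```python
from lm_eval.api.metrics import aggregate_subtask_metrics, pooled_sample_stderr

# per-subtask aggregated scores, bootstrapped stderrs, and sample counts (illustrative values)
metrics = [0.62, 0.58]
stderrs = [0.015, 0.020]
sizes = [1172, 299]

# size-weighted group score and its pooled standard error
group_score = aggregate_subtask_metrics(metrics, sizes, weight_by_size=True)
group_stderr = pooled_sample_stderr(stderrs, sizes)
print(f"group: {group_score:.4f} +/- {group_stderr:.4f}")
```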
import logging
from typing import Callable, Dict
import evaluate
@@ -75,7 +76,7 @@ def register_group(name):
OUTPUT_TYPE_REGISTRY = {}
METRIC_REGISTRY = {}
METRIC_AGGREGATION_REGISTRY = {}
AGGREGATION_REGISTRY = {}
AGGREGATION_REGISTRY: Dict[str, Callable[[], Dict[str, Callable]]] = {}
HIGHER_IS_BETTER_REGISTRY = {}
DEFAULT_METRIC_REGISTRY = {
@@ -118,7 +119,7 @@ def register_metric(**args):
return decorate
def get_metric(name, hf_evaluate_metric=False):
def get_metric(name: str, hf_evaluate_metric=False) -> Callable:
if not hf_evaluate_metric:
if name in METRIC_REGISTRY:
return METRIC_REGISTRY[name]
@@ -136,7 +137,7 @@ def get_metric(name, hf_evaluate_metric=False):
)
def register_aggregation(name):
def register_aggregation(name: str):
def decorate(fn):
assert (
name not in AGGREGATION_REGISTRY
@@ -148,21 +149,21 @@ def register_aggregation(name):
return decorate
def get_aggregation(name):
def get_aggregation(name: str) -> Callable[[], Dict[str, Callable]]:
try:
return AGGREGATION_REGISTRY[name]
except KeyError:
eval_logger.warning(f"{name} not a registered aggregation metric!")
def get_metric_aggregation(name):
def get_metric_aggregation(name: str) -> Callable[[], Dict[str, Callable]]:
try:
return METRIC_AGGREGATION_REGISTRY[name]
except KeyError:
eval_logger.warning(f"{name} metric is not assigned a default aggregation!")
def is_higher_better(metric_name):
def is_higher_better(metric_name) -> bool:
try:
return HIGHER_IS_BETTER_REGISTRY[metric_name]
except KeyError:
......
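A hedged usage sketch of the registry helpers in this hunk; the aggregation name and function are hypothetical, and the import path assumes this module is `lm_eval.api.registry`:

```python
from lm_eval.api.registry import get_aggregation, register_aggregation

# register a hypothetical aggregation under a name that is not already taken
@register_aggregation("trimmed_mean_10")
def trimmed_mean_10(items):
    items = sorted(items)
    k = len(items) // 10  # drop the lowest and highest 10% of scores
    trimmed = items[k : len(items) - k] or items
    return sum(trimmed) / len(trimmed)

# resolve it by name later, e.g. while building a task from its config
agg_fn = get_aggregation("trimmed_mean_10")
assert agg_fn([0, 1, 1, 1]) == 0.75
```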
@@ -373,7 +373,7 @@ class Task(abc.ABC):
else:
assert False, f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
eval_logger.info(f"Building contexts for task on rank {rank}...")
eval_logger.info(f"Building contexts for {self.config.task} on rank {rank}...")
instances = []
for doc_id, doc in utils.create_iterator(
@@ -527,6 +527,7 @@ class Task(abc.ABC):
return description + labeled_examples + example
def apply_filters(self):
"""Iterates over FilterEnsembles and applies them to instances"""
if hasattr(self, "_filters"):
for f in self._filters:
f.apply(self._instances)
@@ -535,15 +536,51 @@ class Task(abc.ABC):
return self._instances
def dump_config(self) -> dict:
"""Returns a dictionary representing the task's config.
:returns: str
The fewshot context.
"""
"""Returns the config as a dictionary."""
# TODO: this should only return the overrides applied to a non-YAML task's configuration.
# (num_fewshot)
return self.config.to_dict()
def set_config(self, key: str, value: Any, update: bool = False) -> None:
"""Set or update the configuration for a given key."""
if key is None:
raise ValueError("Key must be provided.")
if update:
current_value = getattr(self._config, key, {})
if not isinstance(current_value, dict):
raise TypeError(
f"Expected a dict for key '{key}', got {type(current_value).__name__} instead."
)
current_value.update(value)
else:
setattr(self._config, key, value)
def override_metric(self, metric_name: str) -> None:
"""
Override the default metrics used for evaluation with custom metrics.
Parameters:
- metric_name (str): The name of the custom metric to override. Should be registered in api.metrics.
"""
(
self._metric_fn_list,
self._aggregation_list,
self._metric_fn_kwargs,
self._higher_is_better,
) = ({}, {}, {}, {})
self._metric_fn_list[metric_name] = get_metric(metric_name)
self._aggregation_list[metric_name] = get_metric_aggregation(metric_name)
self._higher_is_better[metric_name] = is_higher_better(metric_name)
self._metric_fn_kwargs[metric_name] = {}
if not isinstance(self, ConfigurableTask):
self.process_results = lambda x, y: {metric_name: get_metric(metric_name)}
self.aggregation = lambda: {
metric_name: get_metric_aggregation(metric_name)
}
setattr(self._config, "metric_list", [{"metric": metric_name}])
setattr(self._config, "process_results", None)
class ConfigurableTask(Task):
VERSION = "Yaml"
@@ -849,6 +886,7 @@ class ConfigurableTask(Task):
return labeled_examples + str(example)
def apply_filters(self):
"""Iterates over FilterEnsembles and applies them to instances"""
if hasattr(self, "_filters"):
for f in self._filters:
f.apply(self._instances)
@@ -1255,37 +1293,6 @@ class ConfigurableTask(Task):
def get_config(self, key: str) -> Any:
return getattr(self._config, key, None)
def override_metric(self, metric_name: str) -> None:
"""
Override the default metrics used for evaluation with custom metrics.
Parameters:
- metric_name (str): The name of the custom metric to override. Should be registered in api.metrics.
"""
(
self._metric_fn_list,
self._aggregation_list,
self._metric_fn_kwargs,
self._higher_is_better,
) = ({}, {}, {}, {})
self._metric_fn_list[metric_name] = get_metric(metric_name)
self._aggregation_list[metric_name] = get_metric_aggregation(metric_name)
self._higher_is_better[metric_name] = is_higher_better(metric_name)
self._metric_fn_kwargs[metric_name] = {}
setattr(self._config, "metric_list", [{"metric": metric_name}])
setattr(self._config, "process_results", None)
def override_config(
self, key: str = None, value: Any = None, update: bool = False
) -> None:
if update:
current_value = getattr(self._config, key)
assert isinstance(current_value, dict)
current_value.update(value)
setattr(self._config, key, current_value)
else:
setattr(self._config, key, value)
class MultipleChoiceTask(Task):
OUTPUT_TYPE: str = "loglikelihood"
......
import random
import itertools
import collections
import torch
import itertools
import logging
import random
from typing import Optional, Union
import numpy as np
import torch
import lm_eval.api
import lm_eval.models
import lm_eval.api.metrics
import lm_eval.api.registry
from lm_eval.tasks import (
get_task_dict,
TaskManager
)
import lm_eval.models
from lm_eval.tasks import TaskManager, get_task_dict
from lm_eval.utils import (
eval_logger,
get_git_commit_hash,
positional_deprecated,
run_task_tests,
get_git_commit_hash,
simple_parse_args_string,
eval_logger
)
@positional_deprecated
def simple_evaluate(
model,
model_args=None,
model_args: Optional[str] = None,
tasks=None,
num_fewshot=None,
batch_size=None,
max_batch_size=None,
device=None,
use_cache=None,
limit=None,
num_fewshot: Optional[int] = None,
batch_size: Optional[int] = None,
max_batch_size: Optional[int] = None,
device: Optional[str] = None,
use_cache: Optional[str] = None,
limit: Optional[Union[int, float]] = None,
bootstrap_iters: int = 100000,
check_integrity: bool = False,
decontamination_ngrams_path=None,
@@ -138,8 +133,8 @@ def simple_evaluate(
eval_logger.info(
"get_task_dict has been updated to accept an optional argument, `task_manager`"
"Read more here: https://github.com/EleutherAI/lm-evaluation-harness/blob/recursive-groups/docs/interface.md#external-library-usage"
)
"Read more here:https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
)
task_dict = get_task_dict(tasks, task_manager)
for task_name in task_dict.keys():
task_obj = task_dict[task_name]
@@ -150,7 +145,7 @@ def simple_evaluate(
if task_obj.get_config("output_type") == "generate_until":
if gen_kwargs is not None:
task_obj.override_config(
task_obj.set_config(
key="generation_kwargs", value=gen_kwargs, update=True
)
@@ -171,7 +166,7 @@ def simple_evaluate(
eval_logger.warning(
f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
)
task_obj.override_config(key="num_fewshot", value=num_fewshot)
task_obj.set_config(key="num_fewshot", value=num_fewshot)
if check_integrity:
run_task_tests(task_list=tasks)
@@ -222,8 +217,8 @@ decontaminate_suffix = "_decontaminate"
def evaluate(
lm,
task_dict,
limit=None,
bootstrap_iters: int = 100000,
limit: Optional[int] = None,
bootstrap_iters: Optional[int] = 100000,
decontamination_ngrams_path=None,
write_out: bool = False,
log_samples: bool = True,
@@ -297,13 +292,9 @@ def evaluate(
versions[task_name] = task.VERSION
configs[task_name] = dict(task.dump_config())
if "num_fewshot" in configs[task_name]:
if configs[task_name]["metadata"]:
n_shot = configs[task_name]["metadata"].get("num_fewshot", None)
if not n_shot:
n_shot = configs[task_name]["num_fewshot"]
else:
n_shot = 0 # TODO: is this always right?
# Number of few-shots for printing.
if (n_shot := configs[task_name].get("num_fewshot")) == 0:
n_shot = configs[task_name].get("metadata", {}).get("num_fewshot", 0)
num_fewshot[task_name] = n_shot
if "task_alias" in configs[task_name]:
@@ -483,97 +474,70 @@ def evaluate(
vals = vals_torch
if lm.rank == 0:
### Aggregate results over all datapoints ###
# aggregate results ; run bootstrap CIs
for (task_name, key, metric), items in vals.items():
task = task_dict[task_name]
metric_key = metric + "," + key
if isinstance(task, tuple):
group_name, task = task
else:
group_name = None
group_name, task = task if isinstance(task, tuple) else (None, task)
metric_key = f"{metric},{key}"
agg_fn = task.aggregation()[metric]
results[task_name][metric_key] = agg_fn(items)
results[task_name]["samples"] = len(items)
# hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
# so we run them for fewer iterations. still looking for a cleaner way to do this
if bootstrap_iters > 0:
stderr = lm_eval.api.metrics.stderr_for_metric(
metric=task.aggregation()[metric],
stderr_fn = lm_eval.api.metrics.stderr_for_metric(
metric=agg_fn,
bootstrap_iters=min(bootstrap_iters, 100)
if metric in ["bleu", "chrf", "ter"]
else bootstrap_iters,
)
if stderr is not None and len(items) > 1:
results[task_name][metric + "_stderr" + "," + key] = stderr(items)
else:
results[task_name][metric + "_stderr" + "," + key] = "N/A"
results[task_name][f"{metric}_stderr,{key}"] = (
stderr_fn(items) if (stderr_fn and len(items) > 1) else "N/A"
)
if bool(results):
for group, task_list in reversed(task_hierarchy.items()):
if task_list == []:
# TODO: No samples when bypass
total_size = results[group].get("samples", 999)
else:
total_size = 0
for task in task_list:
metrics = results[task].copy()
if "alias" in metrics:
metrics.pop("alias")
current_size = metrics.pop("samples")
all_stderr = []
for metric in [
key for key in metrics.keys() if "_stderr" not in key
]:
stderr = "_stderr,".join(metric.split(","))
stderr_score = results[task][stderr]
if stderr_score == "N/A":
var_score = "N/A"
else:
var_score = stderr_score**2
all_stderr.append(stderr)
metric_score = results[task][metric]
if metric in results[group]:
results[group][metric] = (
results[group][metric] * total_size
+ metric_score * current_size
) / (total_size + current_size)
# $$s_z^2 = \frac{(n-1) s_x^2 + (m-1) s_y^2}{n+m-1} + \frac{nm(\bar x - \bar y)^2}{(n+m)(n+m-1)}.$$
if var_score == "N/A" or results[group][stderr] == "N/A":
results[group][stderr] = "N/A"
else:
results[group][stderr] = (
(total_size - 1) * results[group][stderr]
+ (current_size - 1) * var_score
) / (
total_size + current_size - 1
) + total_size * current_size / (
(total_size + current_size)
* (total_size + current_size - 1)
) * (
results[group][metric] - metric_score
) ** 2
else:
results[group][metric] = metric_score
results[group][stderr] = var_score
total_size += current_size
for stderr in all_stderr:
results[group][stderr] = np.sqrt(results[group][stderr])
results[group]["samples"] = total_size
if len(task_list) == 0:
# task_hierarchy entries are either
# `group_name: [subtask1, subtask2, ...]`
# or `task_name: []`.
# we only want to operate on groups here.
continue
for metric in [
key
for key in results[task_list[0]].keys()
if "_stderr" not in key and key not in ["alias", "samples"]
]: # TODO: what if tasks don't all share the same metrics
stderr = "_stderr,".join(metric.split(","))
# gather metrics, sizes, and stderrs from subtasks
metrics = [
results[task][metric] for task in task_list
] # TODO: copy?
stderrs = [results[task][stderr] for task in task_list]
sizes = [results[task]["samples"] for task in task_list]
# compute group's pooled metric and stderr
results[group][
metric
] = lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
# TODO: calculate grouped metric using aggregation fn
if "N/A" in stderrs:
results[group][stderr] = "N/A"
else:
results[group][
stderr
] = lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
# TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
# To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
# results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)
results[group]["samples"] = sum(sizes)
def print_tasks(task_hierarchy, results, tab=0):
results_agg = collections.defaultdict(dict)
@@ -648,8 +612,10 @@ def evaluate(
groups_agg = {**groups_agg, **_groups_agg}
for group_name, task_list in task_hierarchy.items():
if task_list != []:
num_fewshot[group_name] = num_fewshot[task_list[0]] # TODO: validate this
if task_list:
num_fewshot[group_name] = num_fewshot[
task_list[0]
] # TODO: validate this
results_dict = {
"results": dict(results_agg.items()),
......
@@ -7,5 +7,16 @@ from . import gguf
from . import vllm_causallms
from . import mamba_lm
from . import optimum_lm
from . import neuron_optimum
# TODO: implement __all__
import os
try:
# enabling faster model download
import hf_transfer
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
except ImportError:
pass
import copy
import os
from datetime import timedelta
from pathlib import Path
from typing import List, Literal, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import transformers
from accelerate import Accelerator, DistributedType, find_executable_batch_size
from accelerate import (
Accelerator,
DistributedType,
InitProcessGroupKwargs,
find_executable_batch_size,
)
from packaging import version
from peft import PeftModel
from peft import __version__ as PEFT_VERSION
@@ -132,7 +138,8 @@ class HFLM(LM):
assert isinstance(batch_size, (int, str))
gpus = torch.cuda.device_count()
accelerator = Accelerator()
accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
if accelerator.num_processes > 1:
self.accelerator = accelerator
@@ -617,7 +624,13 @@ class HFLM(LM):
return batch_size
batch_size = forward_batch()
try:
batch_size = forward_batch()
except RuntimeError as e:
if "No executable batch size found" in str(e):
batch_size = 1
else:
raise
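For context, a stand-alone sketch of the `accelerate` utility this code path relies on (the probe body here is a placeholder): the decorated function is retried with a halved batch size whenever it raises a CUDA OOM error, and once the size reaches zero `accelerate` raises a RuntimeError containing "No executable batch size found", which is the message the new `try/except` above checks for.

```python
from accelerate import find_executable_batch_size

@find_executable_batch_size(starting_batch_size=512)
def forward_batch(batch_size):
    # placeholder probe: in the harness this runs a dummy forward pass at `batch_size`.
    # A CUDA OOM here makes accelerate retry with batch_size // 2; at zero it raises
    # RuntimeError("No executable batch size found ..."), the message handled above.
    return batch_size

try:
    batch_size = forward_batch()
except RuntimeError as e:
    if "No executable batch size found" in str(e):
        batch_size = 1  # fall back to the smallest batch rather than crashing
    else:
        raise
```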
if self.world_size > 1:
# if multi-GPU, always take minimum over all selected batch sizes
@@ -721,6 +734,11 @@ class HFLM(LM):
# and we don't want a warning from HF
generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
do_sample = generation_kwargs.get("do_sample", None)
# The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
generation_kwargs["do_sample"] = do_sample = False
if do_sample is False and generation_kwargs.get("temperature") == 0.0:
generation_kwargs.pop("temperature")
# build stopping criteria
......
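To make the temperature/`do_sample` handling at the end of the hunk above easier to follow, here is the same normalization copied into a hypothetical standalone function, with the resulting kwargs spelled out:

```python
def normalize_gen_kwargs(generation_kwargs: dict) -> dict:
    """Hypothetical standalone copy of the temperature/do_sample handling above."""
    generation_kwargs = dict(generation_kwargs)
    generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
    do_sample = generation_kwargs.get("do_sample", None)
    # temperature == 0.0 with do_sample unset implies greedy decoding
    if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
        generation_kwargs["do_sample"] = do_sample = False
    # HF warns on temperature=0.0, so drop it once greedy decoding is chosen
    if do_sample is False and generation_kwargs.get("temperature") == 0.0:
        generation_kwargs.pop("temperature")
    return generation_kwargs

assert normalize_gen_kwargs({}) == {"do_sample": False}
assert normalize_gen_kwargs({"temperature": 0.7, "do_sample": True}) == {
    "temperature": 0.7,
    "do_sample": True,
}
```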
# Multilingual ARC
### Paper
Title: `Okapi: Instruction-tuned Large Language Models in Multiple Languages with Reinforcement Learning from Human Feedback`
Abstract: https://arxiv.org/abs/2307.16039
A key technology for the development of large language models (LLMs) involves instruction tuning that helps align the models' responses with human expectations to realize impressive learning abilities. Two major approaches for instruction tuning characterize supervised fine-tuning (SFT) and reinforcement learning from human feedback (RLHF), which are currently applied to produce the best commercial LLMs (e.g., ChatGPT). To improve the accessibility of LLMs for research and development efforts, various instruction-tuned open-source LLMs have also been introduced recently, e.g., Alpaca, Vicuna, to name a few. However, existing open-source LLMs have only been instruction-tuned for English and a few popular languages, thus hindering their impacts and accessibility to many other languages in the world. Among a few very recent work to explore instruction tuning for LLMs in multiple languages, SFT has been used as the only approach to instruction-tune LLMs for multiple languages. This has left a significant gap for fine-tuned LLMs based on RLHF in diverse languages and raised important questions on how RLHF can boost the performance of multilingual instruction tuning. To overcome this issue, we present Okapi, the first system with instruction-tuned LLMs based on RLHF for multiple languages. Okapi introduces instruction and response-ranked data in 26 diverse languages to facilitate the experiments and development of future multilingual LLM research. We also present benchmark datasets to enable the evaluation of generative LLMs in multiple languages. Our experiments demonstrate the advantages of RLHF for multilingual instruction over SFT for different base models and datasets. Our framework and resources are released at this https URL.
Homepage: `https://github.com/nlp-uoregon/Okapi`
### Citation
```
@article{dac2023okapi,
title={Okapi: Instruction-tuned Large Language Models in Multiple Languages with Reinforcement Learning from Human Feedback},
author={Dac Lai, Viet and Van Nguyen, Chien and Ngo, Nghia Trung and Nguyen, Thuat and Dernoncourt, Franck and Rossi, Ryan A and Nguyen, Thien Huu},
journal={arXiv e-prints},
pages={arXiv--2307},
year={2023}
}
```
### Groups and Tasks
#### Groups
- arc_multilingual
#### Tasks
- `arc_{ar,bn,ca,da,de,es,eu,fr,gu,hi,hr,hu,hy,id,it,kn,ml,mr,ne,nl,pt,ro,ru,sk,sr,sv,ta,te,uk,vi,zh}`
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
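A hedged sketch of running one of these tasks programmatically, using the `simple_evaluate` entry point whose signature appears earlier in this diff; the model name, task choice, few-shot count, and batch size here are illustrative, not prescribed by this README:

```python
from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="hf",  # Hugging Face backend; any registered model type should work
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["arc_de"],
    num_fewshot=0,
    batch_size=8,
)
print(results["results"]["arc_de"])
```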
group:
- arc_multilingual
dataset_path: null
dataset_name: null
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "query"
doc_to_target: "gold"
doc_to_choice: "choices"
should_decontaminate: true
doc_to_decontamination_query: "query"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
include: _arc_yaml
task: arc_ar
dataset_path: alexandrainst/m_arc
dataset_name: ar
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_bn
dataset_path: alexandrainst/m_arc
dataset_name: bn
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_ca
dataset_path: alexandrainst/m_arc
dataset_name: ca
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_da
dataset_path: alexandrainst/m_arc
dataset_name: da
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_de
dataset_path: alexandrainst/m_arc
dataset_name: de
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_es
dataset_path: alexandrainst/m_arc
dataset_name: es
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_eu
dataset_path: alexandrainst/m_arc
dataset_name: eu
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_fr
dataset_path: alexandrainst/m_arc
dataset_name: fr
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_gu
dataset_path: alexandrainst/m_arc
dataset_name: gu
training_split: train
validation_split: validation
test_split: test