Commit bf2517cc authored by lintangsutawika

update latest

parents 8bca751c 7397b965
......@@ -45,6 +45,7 @@ git clone https://github.com/EleutherAI/lm-evaluation-harness
cd lm-evaluation-harness
pip install -e .
```
We also provide a number of optional dependencies for extended functionality. A detailed table is available at the end of this document.
## Basic Usage
......@@ -174,6 +175,7 @@ Note that for externally hosted models, configs such as `--device` and `--batch_
| vLLM | :heavy_check_mark: | `vllm` | [Most HF Causal Language Models](https://docs.vllm.ai/en/latest/models/supported_models.html) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Mamba | :heavy_check_mark: | `mamba_ssm` | [Mamba architecture Language Models via the `mamba_ssm` package](https://huggingface.co/state-spaces) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Huggingface Optimum (Causal LMs) | :heavy_check_mark: | `openvino` | Any decoder-only AutoModelForCausalLM converted with Huggingface Optimum into OpenVINO™ Intermediate Representation (IR) format | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Neuron via AWS Inf2 (Causal LMs) | :heavy_check_mark: | `neuronx` | Any decoder-only AutoModelForCausalLM supported to run on [huggingface-ami image for inferentia2](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Your local inference server! | :heavy_check_mark: | `local-completions` or `local-chat-completions` (using `openai-chat-completions` model type) | Any server address that accepts GET requests using HF models and mirrors OpenAI's Completions or ChatCompletions interface | `generate_until` |
Models that do not supply logits or logprobs can only be used with tasks of type `generate_until`, while local models and APIs that supply logprobs/logits for their prompts can be run on all task types: `generate_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`.
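For reference, a minimal sketch of driving the evaluator from Python (the model string and task name below are illustrative, not prescriptive):
```python
from lm_eval import evaluator

# A local HF causal LM returns logits, so it can run loglikelihood-style
# tasks as well as `generate_until` tasks.
results = evaluator.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["lambada_openai"],
    batch_size=8,
)
print(results["results"])
```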
......@@ -196,7 +198,7 @@ If you have a Metal compatible Mac, you can run the eval harness using the MPS b
> You can inspect what the LM inputs look like by running the following command:
> ```bash
> python write_out.py \
> --tasks all_tasks \
> --tasks <task1,task2,...> \
> --num_fewshot 5 \
> --num_examples 10 \
> --output_base_path /path/to/output/folder
......@@ -312,7 +314,9 @@ Extras dependencies can be installed via `pip install -e ".[NAME]"`
| anthropic | For using Anthropic's models |
| dev | For linting PRs and contributions |
| gptq | For loading models with GPTQ |
| hf_transfer | For speeding up HF Hub file downloads |
| ifeval | For running the IFEval task |
| neuronx | For running on AWS inf2 instances |
| mamba | For loading Mamba SSM models |
| math | For running math task answer checking |
| multilingual | For multilingual tokenizers |
......
......@@ -237,7 +237,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
results = evaluator.simple_evaluate(
model=args.model,
model_args=args.model_args,
tasks=all_tasks,
tasks=task_names,
num_fewshot=args.num_fewshot,
batch_size=args.batch_size,
max_batch_size=args.max_batch_size,
......
......@@ -3,6 +3,7 @@ import math
import random
from collections.abc import Iterable
from collections import defaultdict
from typing import List
import evaluate
import numpy as np
......@@ -459,3 +460,64 @@ def stderr_for_metric(metric, bootstrap_iters):
stderr = {mean: mean_stderr, acc_all: acc_all_stderr}
return stderr.get(metric, None)
def pooled_sample_stderr(stderrs: List[float], sizes: List[int]):
# Used to aggregate bootstrapped stderrs across subtasks in a group,
# when we are weighting by the size of each subtask.
#
assert len(stderrs) == len(sizes)
# formula source: https://en.wikipedia.org/wiki/Pooled_variance
# this empirically matches running `stderr_for_metric` on all instances
# from the subtasks concatenated with each other.
pooled_sample_var = (
sum([(size - 1) * stderr**2 for size, stderr in zip(sizes, stderrs)])
) / (sum(sizes) - len(sizes))
return np.sqrt(pooled_sample_var)
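# Illustrative check (made-up numbers, not part of this change): pooling two
# subtask stderrs of 0.04 (n=100) and 0.02 (n=300) gives
#   pooled_sample_stderr([0.04, 0.02], [100, 300]) ~= 0.026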
def combined_sample_stderr(stderrs: List[float], sizes: List[int], metrics=None):
assert (
metrics is not None
), "Need to pass a list of each subtask's metric for this stderr aggregation"
assert len(stderrs) == len(sizes) and len(sizes) == len(metrics)
# See https://github.com/EleutherAI/lm-evaluation-harness/pull/1390 for more documentation.
# This formula depends on sample means.
# removed because it seems to give erroneously huge stderrs for groupings of tasks
# and does not seem to match up with bootstrap-calculated stderrs for groups.
### don't use this unless a statistician has told you it's the right thing to do ###
# accumulators: we'll aggregate pairwise N - 1 times
variance = stderrs[0] ** 2
curr_size = sizes[0]
curr_score = metrics[0]
for stderr, size, score in zip(stderrs[1:], sizes[1:], metrics[1:]):
curr_score = ((curr_score * curr_size) + (score * size)) / (
curr_size + size
) # NOTE: this assumes our aggregation fn is "mean"
variance = ((curr_size - 1) * variance + (size - 1) * (stderr**2)) / (
curr_size + size - 1
) + curr_size * size / ((curr_size + size) * (curr_size + size - 1)) * (
curr_score - score
) ** 2
return np.sqrt(variance)
def aggregate_subtask_metrics(metrics, sizes, weight_by_size=True):
# A helper function that is used to aggregate
# subtask scores cross-task.
# TODO: does not hold for non-mean aggregations
if not weight_by_size:
sizes = [1] * len(sizes)
assert len(metrics) == len(sizes)
return sum([metric * size for metric, size in zip(metrics, sizes)]) / sum(sizes)
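# Illustrative check (made-up numbers): a size-weighted mean over two subtasks,
#   aggregate_subtask_metrics([0.70, 0.60], [100, 300]) == 0.625
# i.e. (0.70 * 100 + 0.60 * 300) / 400; with weight_by_size=False each subtask
# counts equally and the result would be 0.65.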
import logging
from typing import Callable, Dict
import evaluate
......@@ -75,7 +76,7 @@ def register_group(name):
OUTPUT_TYPE_REGISTRY = {}
METRIC_REGISTRY = {}
METRIC_AGGREGATION_REGISTRY = {}
AGGREGATION_REGISTRY = {}
AGGREGATION_REGISTRY: Dict[str, Callable[[], Dict[str, Callable]]] = {}
HIGHER_IS_BETTER_REGISTRY = {}
DEFAULT_METRIC_REGISTRY = {
......@@ -118,7 +119,7 @@ def register_metric(**args):
return decorate
def get_metric(name, hf_evaluate_metric=False):
def get_metric(name: str, hf_evaluate_metric=False) -> Callable:
if not hf_evaluate_metric:
if name in METRIC_REGISTRY:
return METRIC_REGISTRY[name]
......@@ -136,7 +137,7 @@ def get_metric(name, hf_evaluate_metric=False):
)
def register_aggregation(name):
def register_aggregation(name: str):
def decorate(fn):
assert (
name not in AGGREGATION_REGISTRY
......@@ -148,21 +149,21 @@ def register_aggregation(name):
return decorate
def get_aggregation(name):
def get_aggregation(name: str) -> Callable[[], Dict[str, Callable]]:
try:
return AGGREGATION_REGISTRY[name]
except KeyError:
eval_logger.warning(f"{name} not a registered aggregation metric!")
def get_metric_aggregation(name):
def get_metric_aggregation(name: str) -> Callable[[], Dict[str, Callable]]:
try:
return METRIC_AGGREGATION_REGISTRY[name]
except KeyError:
eval_logger.warning(f"{name} metric is not assigned a default aggregation!")
def is_higher_better(metric_name):
def is_higher_better(metric_name) -> bool:
try:
return HIGHER_IS_BETTER_REGISTRY[metric_name]
except KeyError:
......
......@@ -373,7 +373,7 @@ class Task(abc.ABC):
else:
assert False, f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
eval_logger.info(f"Building contexts for task on rank {rank}...")
eval_logger.info(f"Building contexts for {self.config.task} on rank {rank}...")
instances = []
for doc_id, doc in utils.create_iterator(
......@@ -527,6 +527,7 @@ class Task(abc.ABC):
return description + labeled_examples + example
def apply_filters(self):
"""Iterates over FilterEnsembles and applies them to instances"""
if hasattr(self, "_filters"):
for f in self._filters:
f.apply(self._instances)
......@@ -535,15 +536,51 @@ class Task(abc.ABC):
return self._instances
def dump_config(self) -> dict:
"""Returns a dictionary representing the task's config.
:returns: str
The fewshot context.
"""
"""Returns the config as a dictionary."""
# TODO: this should only return the overrides applied to a non-YAML task's configuration.
# (num_fewshot)
return self.config.to_dict()
def set_config(self, key: str, value: Any, update: bool = False) -> None:
"""Set or update the configuration for a given key."""
if key is None:
raise ValueError("Key must be provided.")
if update:
current_value = getattr(self._config, key, {})
if not isinstance(current_value, dict):
raise TypeError(
f"Expected a dict for key '{key}', got {type(current_value).__name__} instead."
)
current_value.update(value)
else:
setattr(self._config, key, value)
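# Usage note (hedged): the evaluator calls this to merge CLI overrides into a
# task, e.g. task.set_config(key="generation_kwargs", value=gen_kwargs, update=True)
# or task.set_config(key="num_fewshot", value=5).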
def override_metric(self, metric_name: str) -> None:
"""
Override the default metrics used for evaluation with custom metrics.
Parameters:
- metric_name (str): The name of the custom metric to override. Should be registered in api.metrics.
"""
(
self._metric_fn_list,
self._aggregation_list,
self._metric_fn_kwargs,
self._higher_is_better,
) = ({}, {}, {}, {})
self._metric_fn_list[metric_name] = get_metric(metric_name)
self._aggregation_list[metric_name] = get_metric_aggregation(metric_name)
self._higher_is_better[metric_name] = is_higher_better(metric_name)
self._metric_fn_kwargs[metric_name] = {}
if not isinstance(self, ConfigurableTask):
self.process_results = lambda x, y: {metric_name: get_metric(metric_name)}
self.aggregation = lambda: {
metric_name: get_metric_aggregation(metric_name)
}
setattr(self._config, "metric_list", [{"metric": metric_name}])
setattr(self._config, "process_results", None)
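# Usage note (hedged): e.g. task.override_metric("acc") swaps the task's
# configured metrics for the single registered metric "acc" and its default
# aggregation; the name must already be registered in api.metrics.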
class ConfigurableTask(Task):
VERSION = "Yaml"
......@@ -849,6 +886,7 @@ class ConfigurableTask(Task):
return labeled_examples + str(example)
def apply_filters(self):
"""Iterates over FilterEnsembles and applies them to instances"""
if hasattr(self, "_filters"):
for f in self._filters:
f.apply(self._instances)
......@@ -1255,37 +1293,6 @@ class ConfigurableTask(Task):
def get_config(self, key: str) -> Any:
return getattr(self._config, key, None)
def override_metric(self, metric_name: str) -> None:
"""
Override the default metrics used for evaluation with custom metrics.
Parameters:
- metric_name (str): The name of the custom metric to override. Should be registered in api.metrics.
"""
(
self._metric_fn_list,
self._aggregation_list,
self._metric_fn_kwargs,
self._higher_is_better,
) = ({}, {}, {}, {})
self._metric_fn_list[metric_name] = get_metric(metric_name)
self._aggregation_list[metric_name] = get_metric_aggregation(metric_name)
self._higher_is_better[metric_name] = is_higher_better(metric_name)
self._metric_fn_kwargs[metric_name] = {}
setattr(self._config, "metric_list", [{"metric": metric_name}])
setattr(self._config, "process_results", None)
def override_config(
self, key: str = None, value: Any = None, update: bool = False
) -> None:
if update:
current_value = getattr(self._config, key)
assert isinstance(current_value, dict)
current_value.update(value)
setattr(self._config, key, current_value)
else:
setattr(self._config, key, value)
class MultipleChoiceTask(Task):
OUTPUT_TYPE: str = "loglikelihood"
......
import random
import itertools
import collections
import torch
import itertools
import logging
import random
from typing import Optional, Union
import numpy as np
import torch
import lm_eval.api
import lm_eval.models
import lm_eval.api.metrics
import lm_eval.api.registry
from lm_eval.tasks import (
get_task_dict,
TaskManager
)
import lm_eval.models
from lm_eval.tasks import TaskManager, get_task_dict
from lm_eval.utils import (
eval_logger,
get_git_commit_hash,
positional_deprecated,
run_task_tests,
get_git_commit_hash,
simple_parse_args_string,
eval_logger
)
@positional_deprecated
def simple_evaluate(
model,
model_args=None,
model_args: Optional[str] = None,
tasks=None,
num_fewshot=None,
batch_size=None,
max_batch_size=None,
device=None,
use_cache=None,
limit=None,
num_fewshot: Optional[int] = None,
batch_size: Optional[int] = None,
max_batch_size: Optional[int] = None,
device: Optional[str] = None,
use_cache: Optional[str] = None,
limit: Optional[Union[int, float]] = None,
bootstrap_iters: int = 100000,
check_integrity: bool = False,
decontamination_ngrams_path=None,
......@@ -138,8 +133,8 @@ def simple_evaluate(
eval_logger.info(
"get_task_dict has been updated to accept an optional argument, `task_manager`"
"Read more here: https://github.com/EleutherAI/lm-evaluation-harness/blob/recursive-groups/docs/interface.md#external-library-usage"
)
"Read more here:https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
)
task_dict = get_task_dict(tasks, task_manager)
for task_name in task_dict.keys():
task_obj = task_dict[task_name]
......@@ -150,7 +145,7 @@ def simple_evaluate(
if task_obj.get_config("output_type") == "generate_until":
if gen_kwargs is not None:
task_obj.override_config(
task_obj.set_config(
key="generation_kwargs", value=gen_kwargs, update=True
)
......@@ -171,7 +166,7 @@ def simple_evaluate(
eval_logger.warning(
f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
)
task_obj.override_config(key="num_fewshot", value=num_fewshot)
task_obj.set_config(key="num_fewshot", value=num_fewshot)
if check_integrity:
run_task_tests(task_list=tasks)
......@@ -222,8 +217,8 @@ decontaminate_suffix = "_decontaminate"
def evaluate(
lm,
task_dict,
limit=None,
bootstrap_iters: int = 100000,
limit: Optional[int] = None,
bootstrap_iters: Optional[int] = 100000,
decontamination_ngrams_path=None,
write_out: bool = False,
log_samples: bool = True,
......@@ -297,13 +292,9 @@ def evaluate(
versions[task_name] = task.VERSION
configs[task_name] = dict(task.dump_config())
if "num_fewshot" in configs[task_name]:
if configs[task_name]["metadata"]:
n_shot = configs[task_name]["metadata"].get("num_fewshot", None)
if not n_shot:
n_shot = configs[task_name]["num_fewshot"]
else:
n_shot = 0 # TODO: is this always right?
# Number of few-shots for printing.
if (n_shot := configs[task_name].get("num_fewshot")) == 0:
n_shot = configs[task_name].get("metadata", {}).get("num_fewshot", 0)
num_fewshot[task_name] = n_shot
if "task_alias" in configs[task_name]:
......@@ -483,97 +474,70 @@ def evaluate(
vals = vals_torch
if lm.rank == 0:
### Aggregate results over all datapoints ###
# aggregate results ; run bootstrap CIs
for (task_name, key, metric), items in vals.items():
task = task_dict[task_name]
metric_key = metric + "," + key
if isinstance(task, tuple):
group_name, task = task
else:
group_name = None
group_name, task = task if isinstance(task, tuple) else (None, task)
metric_key = f"{metric},{key}"
agg_fn = task.aggregation()[metric]
results[task_name][metric_key] = agg_fn(items)
results[task_name]["samples"] = len(items)
# hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
# so we run them less iterations. still looking for a cleaner way to do this
if bootstrap_iters > 0:
stderr = lm_eval.api.metrics.stderr_for_metric(
metric=task.aggregation()[metric],
stderr_fn = lm_eval.api.metrics.stderr_for_metric(
metric=agg_fn,
bootstrap_iters=min(bootstrap_iters, 100)
if metric in ["bleu", "chrf", "ter"]
else bootstrap_iters,
)
if stderr is not None and len(items) > 1:
results[task_name][metric + "_stderr" + "," + key] = stderr(items)
else:
results[task_name][metric + "_stderr" + "," + key] = "N/A"
results[task_name][f"{metric}_stderr,{key}"] = (
stderr_fn(items) if (stderr_fn and len(items) > 1) else "N/A"
)
if bool(results):
for group, task_list in reversed(task_hierarchy.items()):
if task_list == []:
# TODO: No samples when bypass
total_size = results[group].get("samples", 999)
else:
total_size = 0
for task in task_list:
metrics = results[task].copy()
if "alias" in metrics:
metrics.pop("alias")
current_size = metrics.pop("samples")
all_stderr = []
for metric in [
key for key in metrics.keys() if "_stderr" not in key
]:
stderr = "_stderr,".join(metric.split(","))
stderr_score = results[task][stderr]
if stderr_score == "N/A":
var_score = "N/A"
else:
var_score = stderr_score**2
all_stderr.append(stderr)
metric_score = results[task][metric]
if metric in results[group]:
results[group][metric] = (
results[group][metric] * total_size
+ metric_score * current_size
) / (total_size + current_size)
# $$s_z^2 = \frac{(n-1) s_x^2 + (m-1) s_y^2}{n+m-1} + \frac{nm(\bar x - \bar y)^2}{(n+m)(n+m-1)}.$$
if var_score == "N/A" or results[group][stderr] == "N/A":
results[group][stderr] = "N/A"
else:
results[group][stderr] = (
(total_size - 1) * results[group][stderr]
+ (current_size - 1) * var_score
) / (
total_size + current_size - 1
) + total_size * current_size / (
(total_size + current_size)
* (total_size + current_size - 1)
) * (
results[group][metric] - metric_score
) ** 2
else:
results[group][metric] = metric_score
results[group][stderr] = var_score
total_size += current_size
for stderr in all_stderr:
results[group][stderr] = np.sqrt(results[group][stderr])
results[group]["samples"] = total_size
if len(task_list) == 0:
# task_hierarchy entries are either
# `group_name: [subtask1, subtask2, ...]`
# or `task_name: []`.
# we only want to operate on groups here.
continue
for metric in [
key
for key in results[task_list[0]].keys()
if "_stderr" not in key and key not in ["alias", "samples"]
]: # TODO: what if tasks don't all share the same metrics
stderr = "_stderr,".join(metric.split(","))
# gather metrics, sizes, and stderrs from subtasks
metrics = [
results[task][metric] for task in task_list
] # TODO: copy?
stderrs = [results[task][stderr] for task in task_list]
sizes = [results[task]["samples"] for task in task_list]
# compute group's pooled metric and stderr
results[group][
metric
] = lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
# TODO: calculate grouped metric using aggregation fn
if "N/A" in stderrs:
results[group][stderr] = "N/A"
else:
results[group][
stderr
] = lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
# TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
# To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
# results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)
results[group]["samples"] = sum(sizes)
def print_tasks(task_hierarchy, results, tab=0):
results_agg = collections.defaultdict(dict)
......@@ -648,8 +612,10 @@ def evaluate(
groups_agg = {**groups_agg, **_groups_agg}
for group_name, task_list in task_hierarchy.items():
if task_list != []:
num_fewshot[group_name] = num_fewshot[task_list[0]] # TODO: validate this
if task_list:
num_fewshot[group_name] = num_fewshot[
task_list[0]
] # TODO: validate this
results_dict = {
"results": dict(results_agg.items()),
......
......@@ -7,5 +7,16 @@ from . import gguf
from . import vllm_causallms
from . import mamba_lm
from . import optimum_lm
from . import neuron_optimum
# TODO: implement __all__
import os
try:
# enabling faster model download
import hf_transfer
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
except ImportError:
pass
import copy
import os
from datetime import timedelta
from pathlib import Path
from typing import List, Literal, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import transformers
from accelerate import Accelerator, DistributedType, find_executable_batch_size
from accelerate import (
Accelerator,
DistributedType,
InitProcessGroupKwargs,
find_executable_batch_size,
)
from packaging import version
from peft import PeftModel
from peft import __version__ as PEFT_VERSION
......@@ -132,7 +138,8 @@ class HFLM(LM):
assert isinstance(batch_size, (int, str))
gpus = torch.cuda.device_count()
accelerator = Accelerator()
accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
if accelerator.num_processes > 1:
self.accelerator = accelerator
......@@ -617,7 +624,13 @@ class HFLM(LM):
return batch_size
batch_size = forward_batch()
try:
batch_size = forward_batch()
except RuntimeError as e:
if "No executable batch size found" in str(e):
batch_size = 1
else:
raise
if self.world_size > 1:
# if multi-GPU, always take minimum over all selected batch sizes
......@@ -721,6 +734,11 @@ class HFLM(LM):
# and we don't want a warning from HF
generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
do_sample = generation_kwargs.get("do_sample", None)
# The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
generation_kwargs["do_sample"] = do_sample = False
if do_sample is False and generation_kwargs.get("temperature") == 0.0:
generation_kwargs.pop("temperature")
# build stopping criteria
......
import copy
import json
import logging
import subprocess
from collections import defaultdict
from typing import List, Optional, Union
import torch
import torch.nn.functional as F
import transformers
from packaging import version
from tqdm import tqdm
from transformers import GenerationConfig
from transformers.generation import StoppingCriteriaList
from lm_eval import utils
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.utils import stop_sequences_criteria
try:
NEURON_AVAILABLE = True
from optimum.neuron import NeuronModelForCausalLM
from optimum.neuron.generation import TokenSelector
from optimum.neuron.version import __version__ as optimum_neuron_version
except ImportError:
NeuronModelForCausalLM = object
NEURON_AVAILABLE = False
logger = logging.getLogger(__name__)
def get_nc_count() -> Union[int, None]:
"""Returns the number of neuron cores on the current instance."""
try:
cmd = "neuron-ls --json-output"
result = subprocess.run(cmd, shell=True, capture_output=True)
print(f"inferring nc_count from `neuron-ls` {result.stdout}")
json_output = json.loads(result.stdout)
count = sum([x["nc_count"] for x in json_output])
print(f"nc_count={count}")
return count
except Exception:
return None
def wrap_constant_batch_size(func):
def _decorator(self, input_ids):
"""input_ids a 2D array with batch_size on dim=0
makes sure the func runs with self.batch_size
"""
# Neuron-compiled models expect a fixed batch size; pad smaller batches up to it
batch_size = input_ids.shape[0]
if batch_size < self.batch_size:
# handle the event of input_ids.shape[0] != batch_size
# Neuron cores expect constant batch_size
input_ids = torch.concat(
(
input_ids,
# add missing_batch_size dummy
torch.zeros(
[self.batch_size - batch_size, *input_ids.size()[1:]],
dtype=input_ids.dtype,
device=input_ids.device,
),
),
dim=0,
)
elif batch_size > self.batch_size:
raise ValueError(
f"The specified batch_size ({batch_size}) exceeds the model static batch size ({self.batch_size})"
)
# return the forward pass that requires constant batch size
return func(self, input_ids)[:batch_size]
return _decorator
class CustomNeuronModelForCausalLM(NeuronModelForCausalLM):
"""NeuronModelForCausalLM with `stopping_criteria` in `generate`"""
def generate(
self,
input_ids: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
stopping_criteria: Optional["StoppingCriteriaList"] = None,
generation_config: Optional["GenerationConfig"] = None,
**kwargs,
) -> torch.LongTensor:
r"""
A streamlined generate() method overriding the transformers.GenerationMixin.generate() method.
This method uses the same logits processors/warpers and stopping criteria as the transformers library
`generate()` method but restricts the generation to greedy search and sampling.
It does not support transformers `generate()` advanced options.
Please refer to https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate
for details on generation configuration.
Parameters:
input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`):
The sequence used as a prompt for the generation.
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices.
generation_config (`~transformers.generation.GenerationConfig`, *optional*):
The generation configuration to be used as base parametrization for the generation call. `**kwargs`
passed to generate matching the attributes of `generation_config` will override them. If
`generation_config` is not provided, default will be used, which had the following loading
priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
configuration. Please note that unspecified parameters will inherit [`~transformers.generation.GenerationConfig`]'s
default values, whose documentation should be checked to parameterize generation.
Returns:
`torch.Tensor`: A `torch.FloatTensor`.
"""
# The actual generation configuration is a combination of config and parameters
generation_config = copy.deepcopy(
self.generation_config if generation_config is None else generation_config
)
model_kwargs = generation_config.update(
**kwargs
) # All unused kwargs must be model kwargs
# Check model kwargs are actually used by either prepare_inputs_for_generation or forward
self._validate_model_kwargs(model_kwargs)
# Instantiate a TokenSelector for the specified configuration
selector = TokenSelector.create(
input_ids, generation_config, self, self.max_length
)
selector.stopping_criteria.append(stopping_criteria)
# Verify that the inputs are compatible with the model static input dimensions
batch_size, sequence_length = input_ids.shape
if sequence_length > self.max_length:
raise ValueError(
f"The input sequence length ({sequence_length}) exceeds the model static sequence length ({self.max_length})"
)
padded_input_ids = input_ids
padded_attention_mask = attention_mask
if batch_size > self.batch_size:
raise ValueError(
f"The specified batch_size ({batch_size}) exceeds the model static batch size ({self.batch_size})"
)
elif batch_size < self.batch_size:
logger.warning(
"Inputs will be padded to match the model static batch size. This will increase latency."
)
padding_shape = [self.batch_size - batch_size, sequence_length]
padding = torch.full(
padding_shape, fill_value=self.config.eos_token_id, dtype=torch.int64
)
padded_input_ids = torch.cat([input_ids, padding])
if attention_mask is not None:
padding = torch.zeros(padding_shape, dtype=torch.int64)
padded_attention_mask = torch.cat([attention_mask, padding])
# Drop the current generation context and clear the Key/Value cache
self.reset_generation()
output_ids = self.generate_tokens(
padded_input_ids,
selector,
batch_size,
attention_mask=padded_attention_mask,
**model_kwargs,
)
return output_ids[:batch_size, :]
@register_model("neuronx")
class NEURON_HF(LM):
"""
Enables usage on AWS Neuron
using the HuggingFace Transformers + Transformers neuronx library.
Tested with neuron 2.17.0
"""
_DEFAULT_MAX_LENGTH = 2048
def __init__(
self,
pretrained: Optional[str] = "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
revision: Optional[str] = "main",
tp_degree: Optional[int] = None,
subfolder: Optional[str] = None,
tokenizer: Optional[str] = None,
truncation: Optional[bool] = False,
max_length: Optional[int] = None,
dtype: Optional[Union[str, torch.dtype]] = "auto",
batch_size: Optional[int] = 1,
low_cpu_mem_usage: Optional[bool] = True,
trust_remote_code: Optional[bool] = False,
use_fast_tokenizer: Optional[bool] = True,
# arguments used for splitting a model across GPUs naively.
# only used if `parallelize=True`.
) -> None:
if not NEURON_AVAILABLE:
raise Exception(
"Tried to load neuron model, but neuron is not installed ",
"please install neuron via pip install transformers-neuron ",
"also make sure you are running on an AWS inf2 instance",
)
if version.parse(optimum_neuron_version) != version.parse("0.0.17"):
logger.warning(
'`optimum-neuron` model requires `pip install "optimum[neuronx]>=0.0.17" '
"preferably using the Hugging Face Neuron Deep Learning AMI (Ubuntu 22.04) "
"https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2 "
f"You are using optimum-neuron={optimum_neuron_version}"
)
super().__init__()
assert isinstance(pretrained, str)
assert isinstance(batch_size, (int, str))
self.batch_size_per_gpu = int(batch_size)
batch_size = int(batch_size)
if tp_degree is None:
# execute `neuron-ls --json-output | jq '.[0].nc_count'``
# to get the number of neuron cores on your instance
tp_degree = get_nc_count()
assert isinstance(tp_degree, int), (
f"model_args must include tp_degree. tp_degree must be set to an integer,"
f" but is tp_degree=`{tp_degree}` with type=`{type(tp_degree)}`."
"Set it to number of neuron cores on your instance."
" For inf2.xlarge and inf2.8xlarge, set it to `2`."
" For inf2.24xlarge, set it to `12`."
" For inf2.48xlarge, set it to `24`."
)
# TODO: update this to be less of a hack once subfolder is fixed in HF
revision = revision + ("/" + subfolder if subfolder is not None else "")
self._config = transformers.AutoConfig.from_pretrained(
pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
)
torch_dtype = utils.get_dtype(dtype)
assert torch_dtype in [
torch.float16,
torch.bfloat16,
], "Only float16 and bfloat16 are supported"
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
pretrained if tokenizer is None else tokenizer,
revision=revision,
trust_remote_code=trust_remote_code,
use_fast=use_fast_tokenizer,
)
# Neuron specific code
if torch_dtype == torch.float16:
self.amp_dtype = "f16"
elif torch_dtype == torch.bfloat16:
self.amp_dtype = "bf16"
elif torch_dtype == torch.float32:
self.amp_dtype = "f32"
else:
raise NotImplementedError("Only float16 and bfloat16 are implemented.")
compiler_args = {"num_cores": tp_degree, "auto_cast_type": self.amp_dtype}
input_shapes = {
"batch_size": batch_size,
"sequence_length": self._DEFAULT_MAX_LENGTH,
}
print(
f"{'='*20} \n loading model to neuron with"
f" {compiler_args}, {input_shapes}..."
)
self.model = CustomNeuronModelForCausalLM.from_pretrained(
pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
low_cpu_mem_usage=low_cpu_mem_usage,
export=True,
**compiler_args,
**input_shapes,
)
print(f"SUCCESS: neuron model compiled. \n {'='*20}")
self.truncation = truncation
self.vocab_size = self.tokenizer.vocab_size
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
self._max_length = max_length
self.batch_schedule = 1
self.batch_sizes = {}
@property
def config(self):
# return the associated transformers.AutoConfig for the given pretrained model.
return self._config
@property
def eot_token_id(self):
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
@property
def max_length(self):
if self._max_length: # if max length manually set, return it
return self._max_length
seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
for attr in seqlen_config_attrs:
if hasattr(self.model.config, attr):
return getattr(self.model.config, attr)
if hasattr(self.tokenizer, "model_max_length"):
if self.tokenizer.model_max_length == 1000000000000000019884624838656:
return self._DEFAULT_MAX_LENGTH
return self.tokenizer.model_max_length
return self._DEFAULT_MAX_LENGTH
@property
def max_gen_toks(self) -> int:
return 256
@property
def batch_size(self):
return self.batch_size_per_gpu
@property
def device(self):
"""device are neuron cores, but the created tensors are on CPU."""
return "cpu"
@property
def rank(self):
return 0
@property
def world_size(self):
return 1
def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None):
""" """
if add_special_tokens is None:
add_special_tokens = False
encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens)
# left-truncate the encoded context to be at most `left_truncate_len` tokens long
if left_truncate_len:
encoding = encoding[-left_truncate_len:]
return encoding
def tok_batch_encode(
self,
strings: List[str],
padding_side: str = "left",
left_truncate_len: int = None,
truncation: bool = False,
):
# encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
old_padding_side = self.tokenizer.padding_side
self.tokenizer.padding_side = padding_side
add_special_tokens = False
encoding = self.tokenizer(
strings,
truncation=truncation,
padding="longest",
return_tensors="pt",
add_special_tokens=add_special_tokens,
)
if left_truncate_len:
encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
encoding["attention_mask"] = encoding["attention_mask"][
:, -left_truncate_len:
]
self.tokenizer.padding_side = old_padding_side
return encoding["input_ids"], encoding["attention_mask"]
def tok_decode(self, tokens):
return self.tokenizer.decode(tokens)
@wrap_constant_batch_size
def _model_call(self, input_ids: torch.Tensor):
"""
get logits for the entire sequence
:param input_ids: torch.Tensor
A torch tensor of shape [batch, sequence_cont]
the size of sequence may vary from call to call
:return
A torch tensor of shape [batch, sequence, vocab] with the
logits returned from the model's decoder-lm head
"""
_, sequence_length = input_ids.shape
with torch.inference_mode():
cache_ids = torch.arange(0, sequence_length, dtype=torch.int32).split(1)
input_ids_split = input_ids.split(1, dim=1)
return torch.concat(
[
self.model.forward(
input_ids=input_id, cache_ids=cache_id, return_dict=False
)[0]
for input_id, cache_id in zip(input_ids_split, cache_ids)
],
dim=1,
)
def _model_generate(self, context, max_length, stop, **generation_kwargs):
# we require users to pass do_sample=True explicitly
# for non-greedy gen. This should be reevaluated when considering beam search.
with torch.inference_mode():
if "do_sample" not in generation_kwargs.keys():
generation_kwargs["do_sample"] = False
stopping_criteria = stop_sequences_criteria(
self.tokenizer,
stop + [self.tokenizer.decode([self.config.eos_token_id])],
1,
context.shape[0],
)
return self.model.generate(
input_ids=context,
max_length=max_length,
stopping_criteria=stopping_criteria,
pad_token_id=self.eot_token_id,
use_cache=True,
**generation_kwargs,
)
def _select_cont_toks(self, logits, contlen=None, inplen=None):
assert (
contlen and inplen
), "Must pass input len and cont. len to select scored logits for causal LM"
# discard right-padding.
# also discard the input/context tokens. we'll only score continuations.
logits = logits[inplen - contlen : inplen]
return logits
def _encode_pair(self, context, continuation):
n_spaces = len(context) - len(context.rstrip())
if n_spaces > 0:
continuation = context[-n_spaces:] + continuation
context = context[:-n_spaces]
whole_enc = self.tok_encode(context + continuation, add_special_tokens=False)
context_enc = self.tok_encode(context, add_special_tokens=False)
# whole_enc = self.tok_encode(context + continuation)
# context_enc = self.tok_encode(context, add_special_tokens=False)
context_enc_len = len(context_enc)
continuation_enc = whole_enc[context_enc_len:]
return context_enc, continuation_enc
def loglikelihood(self, requests):
new_reqs = []
for context, continuation in [req.args for req in requests]:
if context == "":
# end of text as context
context_enc, continuation_enc = (
[self.eot_token_id],
self.tok_encode(continuation),
)
else:
context_enc, continuation_enc = self._encode_pair(context, continuation)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
def loglikelihood_rolling(self, requests):
loglikelihoods = []
adaptive_batch_size = None
for (string,) in tqdm([req.args for req in requests], disable=(self.rank != 0)):
rolling_token_windows = list(
map(
utils.make_disjoint_window,
utils.get_rolling_token_windows(
token_list=self.tok_encode(string),
prefix_token=self.eot_token_id,
max_seq_len=self.max_length,
context_len=1,
),
)
)
# TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case
rolling_token_windows = [(None,) + x for x in rolling_token_windows]
pad_amnt = 0
if self.world_size > 1:
# We pad out the external document-level iterator so the inner iterator doesn't hang
mytensor = torch.tensor(len(rolling_token_windows), device=self.device)
gathered = (
self.accelerator.gather(mytensor).cpu().detach().numpy().tolist()
)
pad_amnt = max(gathered) - gathered[self.rank]
if pad_amnt > 0:
rolling_token_windows += pad_amnt * [rolling_token_windows[0]]
string_nll = self._loglikelihood_tokens(
rolling_token_windows,
disable_tqdm=True,
override_bs=adaptive_batch_size,
)
if (self.world_size > 1) and (pad_amnt > 0):
string_nll = [x[0] for x in string_nll[:-pad_amnt]]
else:
# discard is_greedy
string_nll = [x[0] for x in string_nll]
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
return loglikelihoods
def _loglikelihood_tokens(
self, requests, disable_tqdm: bool = False, override_bs=None
):
# TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
res = []
def _collate(x):
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
# padded context length. this is useful to simplify the batching logic and more importantly to make
# automatic adaptive batches much much easier to implement
# - any OOMs will happen right away rather than near the end
toks = x[1] + x[2]
return -len(toks), tuple(toks)
re_ord = utils.Reorderer(requests, _collate)
n_reordered_requests = len(re_ord.get_reordered()) # noqa
# automatic (variable) batch size detection for vectorization
# pull longest context sample from request
chunks = utils.chunks(
re_ord.get_reordered(),
n=self.batch_size,
fn=None,
)
for chunk in tqdm(chunks, disable=(disable_tqdm or (self.rank != 0))):
inps = []
cont_toks_list = []
inplens = []
conts = [] # noqa
encoder_attns = [] # noqa
padding_len_inp = None
padding_len_cont = None # noqa
# because vectorizing is annoying, we first convert each (context, continuation) pair to padded
# tensors, then we pack them together into a batch, call the model, and then pick it all apart
# again because vectorizing is annoying
for _, context_enc, continuation_enc in chunk:
# sanity check
assert len(context_enc) > 0
assert len(continuation_enc) > 0
assert len(continuation_enc) <= self.max_length
# how this all works (illustrated on a causal decoder-only setup):
# CTX CONT
# inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1]
# model \ \
# logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the
# cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice
# when too long to fit in context, truncate from the left
inp = torch.tensor(
(context_enc + continuation_enc)[-(self.max_length + 1) :][:-1],
dtype=torch.long,
device=self.device,
)
(inplen,) = inp.shape
padding_len_inp = (
max(padding_len_inp, inplen)
if padding_len_inp is not None
else inplen
)
inps.append(inp) # [1, inp_length]
cont_toks_list.append(continuation_enc)
inplens.append(inplen)
# create encoder attn mask and batched conts, if seq2seq
call_kwargs = {}
batched_inps = utils.pad_and_concat(
padding_len_inp, inps, padding_side="right"
) # [batch, padding_len_inp]
multi_logits = F.log_softmax(
self._model_call(batched_inps, **call_kwargs), dim=-1
) # [batch, padding_length (inp or cont), vocab]
for (cache_key, _, _), logits, inplen, cont_toks in zip(
chunk, multi_logits, inplens, cont_toks_list
):
# Slice to original seq length
contlen = len(cont_toks)
# take only logits in the continuation
# (discard context toks if decoder-only ; discard right-padding)
# also discards + checks for "virtual tokens" in the causal LM's input window
# from prompt/prefix tuning tokens, if applicable
ctx_len = inplen + (logits.shape[0] - padding_len_inp)
logits = self._select_cont_toks(logits, contlen=contlen, inplen=ctx_len)
logits = logits.unsqueeze(0) # [1, seq, vocab]
# Check if per-token argmax is exactly equal to continuation
greedy_tokens = logits.argmax(dim=-1)
cont_toks = torch.tensor(
cont_toks, dtype=torch.long, device=self.device
).unsqueeze(0) # [1, seq]
max_equal = (greedy_tokens == cont_toks).all()
# Obtain log-probs at the corresponding continuation token indices
# last_token_slice = logits[:, -1, :].squeeze(0).tolist()
logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(
-1
) # [1, seq]
# Answer: (log prob, is-exact-match)
answer = (float(logits.sum()), bool(max_equal))
res.append(answer)
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
return re_ord.get_original(res)
def generate_until(self, requests):
res = defaultdict(list)
re_ords = {}
def _collate(x):
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
# padded context length. this is useful to simplify the batching logic and more importantly to make
# automatic adaptive batches much much easier to implement
# - any OOMs will happen right away rather than near the end
toks = self.tok_encode(x[0])
return -len(toks), x[0]
# we group requests by their generation_kwargs,
# so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
# in the same batch.
grouper = utils.Grouper(requests, lambda x: str(x.args[1]))
for key, reqs in grouper.get_grouped().items():
# within each set of reqs for given kwargs, we reorder by token length, descending.
re_ords[key] = utils.Reorderer([req.args for req in reqs], _collate)
pbar = tqdm(total=len(requests), disable=(self.rank != 0))
# for each different set of kwargs, we execute all requests, by batch.
for key, re_ord in re_ords.items():
chunks = utils.chunks(re_ord.get_reordered(), n=self.batch_size)
for chunk in tqdm(chunks, disable=self.rank != 0):
contexts, all_gen_kwargs = zip(*chunk)
# we assume all gen kwargs in the batch are the same
# this is safe to assume because the `grouper` object ensures it.
gen_kwargs = all_gen_kwargs[0]
# unpack our keyword arguments.
until = None
if isinstance(gen_kwargs, dict):
kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1
if "until" in kwargs.keys():
until = kwargs.pop("until")
if isinstance(until, str):
until = [until]
elif not isinstance(until, list):
raise ValueError(
f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
)
else:
raise ValueError(
f"Expected `kwargs` to be of type `dict` but got {kwargs}"
)
if not until:
until = [self.tok_decode(self.eot_token_id)]
if "max_gen_toks" in kwargs.keys():
max_gen_toks = kwargs.pop("max_gen_toks")
else:
max_gen_toks = self.max_gen_toks
# first stop sequence is used to halt generation upon encountering
primary_until = [until[0]]
max_ctx_len = self.max_length - max_gen_toks
# encode, pad, and truncate contexts for this batch
context_enc, attn_masks = self.tok_batch_encode(
contexts,
left_truncate_len=max_ctx_len,
truncation=self.truncation,
)
context_enc = context_enc.to(self.device)
attn_masks = attn_masks.to(self.device)
if "max_length" not in kwargs:
kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
# perform batched generation
cont = self._model_generate(
context=context_enc,
attention_mask=attn_masks,
stop=primary_until,
**kwargs,
)
cont_toks_list = cont.tolist()
for cont_toks, context in zip(cont_toks_list, contexts):
# discard context + left-padding toks if using causal decoder-only LM
cont_toks = cont_toks[context_enc.shape[1] :]
s = self.tok_decode(cont_toks)
# use secondary stop seqs to cut off should-have-been-stopped content post-hoc
for term in until:
if len(term) > 0:
# ignore '' separator,
# for seq2seq case where self.tok_decode(self.eot_token_id) = ''
s = s.split(term)[0]
res[key].append(s)
self.cache_hook.add_partial(
"generate_until", (context, gen_kwargs), s
)
pbar.update(1)
# reorder this group of results back to original unsorted form
res[key] = re_ord.get_original(res[key])
pbar.close()
return grouper.get_original(res)
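# Usage note (untested sketch): on an inf2 host this class can be selected with
# `--model neuronx`, or instantiated directly, e.g.
#   lm = NEURON_HF(pretrained="TinyLlama/TinyLlama-1.1B-Chat-v1.0", tp_degree=2, batch_size=1)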
# Multilingual ARC
### Paper
Title: `Okapi: Instruction-tuned Large Language Models in Multiple Languages with Reinforcement Learning from Human Feedback`
Abstract: https://arxiv.org/abs/2307.16039
A key technology for the development of large language models (LLMs) involves instruction tuning that helps align the models' responses with human expectations to realize impressive learning abilities. Two major approaches for instruction tuning characterize supervised fine-tuning (SFT) and reinforcement learning from human feedback (RLHF), which are currently applied to produce the best commercial LLMs (e.g., ChatGPT). To improve the accessibility of LLMs for research and development efforts, various instruction-tuned open-source LLMs have also been introduced recently, e.g., Alpaca, Vicuna, to name a few. However, existing open-source LLMs have only been instruction-tuned for English and a few popular languages, thus hindering their impacts and accessibility to many other languages in the world. Among a few very recent work to explore instruction tuning for LLMs in multiple languages, SFT has been used as the only approach to instruction-tune LLMs for multiple languages. This has left a significant gap for fine-tuned LLMs based on RLHF in diverse languages and raised important questions on how RLHF can boost the performance of multilingual instruction tuning. To overcome this issue, we present Okapi, the first system with instruction-tuned LLMs based on RLHF for multiple languages. Okapi introduces instruction and response-ranked data in 26 diverse languages to facilitate the experiments and development of future multilingual LLM research. We also present benchmark datasets to enable the evaluation of generative LLMs in multiple languages. Our experiments demonstrate the advantages of RLHF for multilingual instruction over SFT for different base models and datasets. Our framework and resources are released at this https URL.
Homepage: `https://github.com/nlp-uoregon/Okapi`
### Citation
```
@article{dac2023okapi,
title={Okapi: Instruction-tuned Large Language Models in Multiple Languages with Reinforcement Learning from Human Feedback},
author={Dac Lai, Viet and Van Nguyen, Chien and Ngo, Nghia Trung and Nguyen, Thuat and Dernoncourt, Franck and Rossi, Ryan A and Nguyen, Thien Huu},
journal={arXiv e-prints},
pages={arXiv--2307},
year={2023}
}
```
### Groups and Tasks
#### Groups
- arc_multilingual
#### Tasks
- `arc_{ar,bn,ca,da,de,es,eu,fr,gu,hi,hr,hu,hy,id,it,kn,ml,mr,ne,nl,pt,ro,ru,sk,sr,sv,ta,te,uk,vi,zh}`
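A hedged sketch of loading one of these task configs programmatically (the `arc_de` choice below is just an example):
```python
from lm_eval.tasks import TaskManager, get_task_dict

# TaskManager indexes the bundled YAML task configs; get_task_dict resolves
# task names (and groups) into instantiated Task objects.
task_manager = TaskManager()
task_dict = get_task_dict(["arc_de"], task_manager)
print(task_dict.keys())
```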
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group:
- arc_multilingual
dataset_path: null
dataset_name: null
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "query"
doc_to_target: "gold"
doc_to_choice: "choices"
should_decontaminate: true
doc_to_decontamination_query: "query"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
include: _arc_yaml
task: arc_ar
dataset_path: alexandrainst/m_arc
dataset_name: ar
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_bn
dataset_path: alexandrainst/m_arc
dataset_name: bn
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_ca
dataset_path: alexandrainst/m_arc
dataset_name: ca
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_da
dataset_path: alexandrainst/m_arc
dataset_name: da
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_de
dataset_path: alexandrainst/m_arc
dataset_name: de
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_es
dataset_path: alexandrainst/m_arc
dataset_name: es
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_eu
dataset_path: alexandrainst/m_arc
dataset_name: eu
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_fr
dataset_path: alexandrainst/m_arc
dataset_name: fr
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_gu
dataset_path: alexandrainst/m_arc
dataset_name: gu
training_split: train
validation_split: validation
test_split: test