Commit e6b798f9 authored by Baber

Merge branch 'main' into metrics

# Conflicts:
#	.pre-commit-config.yaml
#	lm_eval/api/task.py
#	lm_eval/models/huggingface.py
#	lm_eval/models/vllm_causallms.py
#	pyproject.toml
parents 14a29ade 4f8195f1
@@ -34,7 +34,6 @@ repos:
 # Run the linter.
 - id: ruff-check
 args: [ --fix]
-# Run the formatter.
 - id: ruff-format
 - repo: https://github.com/codespell-project/codespell
 rev: v2.4.1
@@ -42,8 +41,10 @@ repos:
 - id: codespell
 exclude: >
 (?x)^(
 .*\.json|ignore.txt|lm_eval/tasks/.*|.*yaml|.*\.ipynb
 )$
 args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
 - repo: https://github.com/jackdewinter/pymarkdown
 rev: v0.9.30
@@ -51,9 +52,3 @@ repos:
 - id: pymarkdown
 exclude: ^(lm_eval/tasks/.*|docs/footguns\.md)$
 args: [fix, -r]
-# - repo: https://github.com/pre-commit/mirrors-mypy
-#   rev: v1.5.1
-#   hooks:
-#     - id: mypy
-#       additional_dependencies: [".[sentencepiece,multilingual,promptsource,gptq]", "types-PyYAML", "types-requests"]
-#       exclude: ^tests/.*$
@@ -17,7 +17,7 @@ def try_parse_json(value: str) -> Union[str, dict, None]:
 if "{" in value:
 raise argparse.ArgumentTypeError(
 f"Invalid JSON: {value}. Hint: Use double quotes for JSON strings."
-)
+) from None
 return value
@@ -30,8 +30,8 @@ def _int_or_none_list_arg_type(
 return None
 try:
 return int(item)
-except ValueError:
-raise argparse.ArgumentTypeError(f"{item} is not an integer or None")
+except ValueError as e:
+raise argparse.ArgumentTypeError(f"{item} is not an integer or None") from e
 items = [parse_value(v) for v in value.split(split_char)]
 num_items = len(items)
@@ -433,8 +433,10 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
 # because it's already been determined based on the prior env var before launching our
 # script--`datasets` gets imported by lm_eval internally before these lines can update the env.
 import datasets
+from packaging.version import parse as vparse
-datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
+if vparse(datasets.__version__) < vparse("4.0.0"):
+datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
 if isinstance(args.model_args, dict):
 args.model_args["trust_remote_code"] = True
@@ -510,7 +512,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
 )
 if args.log_samples:
-for task_name, config in results["configs"].items():
+for task_name, _config in results["configs"].items():
 evaluation_tracker.save_results_samples(
 task_name=task_name, samples=samples[task_name]
 )
...
@@ -663,6 +663,11 @@ class ConfigurableTask(Task):
 print("hello")
 def download(self, dataset_kwargs: dict[str, Any] | None = None, **kwargs) -> None:
+from packaging.version import parse as vparse
+if dataset_kwargs and vparse(datasets.__version__) >= vparse("4.0.0"):
+dataset_kwargs.pop("trust_remote_code", None)
 self.config.dataset_kwargs, self.config.metadata = (
 self.config.dataset_kwargs or {},
 self.config.metadata or {},
...
@@ -7,7 +7,7 @@ import os
 import random
 import time
 from collections import defaultdict
-from typing import TYPE_CHECKING, Any, List, Optional, Union
+from typing import TYPE_CHECKING, Any
 import numpy as np
 import torch
@@ -37,6 +37,7 @@ from lm_eval.utils import (
 positional_deprecated,
 setup_logging,
 simple_parse_args_string,
+wrap_text,
 )
@@ -50,28 +51,28 @@ eval_logger = logging.getLogger(__name__)
 @positional_deprecated
 def simple_evaluate(
 model,
-model_args: Optional[Union[str, dict[str, Any]]] = None,
+model_args: str | dict[str, Any] | None = None,
-tasks: Optional[List[Union[str, dict, object]]] = None,
+tasks: list[str | dict | object] | None = None,
-num_fewshot: Optional[int] = None,
+num_fewshot: int | None = None,
-batch_size: Optional[Union[int, str]] = None,
+batch_size: int | str | None = None,
-max_batch_size: Optional[int] = None,
+max_batch_size: int | None = None,
-device: Optional[str] = None,
+device: str | None = None,
-use_cache: Optional[str] = None,
+use_cache: str | None = None,
 cache_requests: bool = False,
 rewrite_requests_cache: bool = False,
 delete_requests_cache: bool = False,
-limit: Optional[Union[int, float]] = None,
+limit: int | float | None = None,
-samples: Optional[dict] = None,
+samples: dict | None = None,
 bootstrap_iters: int = 100000,
 check_integrity: bool = False,
 write_out: bool = False,
 log_samples: bool = True,
-evaluation_tracker: Optional[EvaluationTracker] = None,
+evaluation_tracker: EvaluationTracker | None = None,
-system_instruction: Optional[str] = None,
+system_instruction: str | None = None,
-apply_chat_template: Union[bool, str] = False,
+apply_chat_template: bool | str = False,
 fewshot_as_multiturn: bool = False,
-gen_kwargs: Union[str, dict, None] = None,
+gen_kwargs: str | dict | None = None,
-task_manager: Optional[TaskManager] = None,
+task_manager: TaskManager | None = None,
 verbosity=None,
 predict_only: bool = False,
 random_seed: int = 0,
@@ -79,7 +80,7 @@ def simple_evaluate(
 torch_random_seed: int = 1234,
 fewshot_random_seed: int = 1234,
 confirm_run_unsafe_code: bool = False,
-metadata: Optional[dict] = None,
+metadata: dict | None = None,
 ):
 """Instantiate and evaluate a model on a list of tasks.
@@ -171,8 +172,11 @@ def simple_evaluate(
 )
 ) and not apply_chat_template:
 eval_logger.warning(
-"Model appears to be an instruct or chat variant but chat template is not applied. "
-"Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`)."
+wrap_text(
+f"""pretrained={model_args.get("pretrained") if isinstance(model_args, dict) else model_args} appears to be an
+instruct or chat variant but chat template is not applied.
+Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`).""",
+)
 )
 if delete_requests_cache:
@@ -236,7 +240,9 @@ def simple_evaluate(
 else:
 eval_logger.info(
-f"Initializing {model} model, with arguments: {simple_parse_args_string(model_args)}"
+wrap_text(
+f"Initializing {model} model, with arguments: {simple_parse_args_string(model_args)}"
+)
 )
 lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
 model_args,
@@ -283,7 +289,7 @@ def simple_evaluate(
 # helper function to recursively apply config overrides to leaf subtasks, skipping their constituent groups.
 # (setting of num_fewshot ; bypassing metric calculation ; setting fewshot seed)
-def _adjust_config(task_dict: dict[str, "Task"]) -> dict[str, "Task"]:
+def _adjust_config(task_dict: dict[str, Task]) -> dict[str, Task]:
 adjusted_task_dict = {}
 for task_name, task_obj in task_dict.items():
 if isinstance(task_obj, dict):
@@ -414,17 +420,17 @@ def simple_evaluate(
 @positional_deprecated
 def evaluate(
-lm: "LM",
+lm: LM,
 task_dict,
 limit: int | float | None = None,
-samples: Optional[dict] = None,
+samples: dict | None = None,
 cache_requests: bool = False,
 rewrite_requests_cache: bool = False,
-bootstrap_iters: Optional[int] = 100000,
+bootstrap_iters: int | None = 100000,
 write_out: bool = False,
 log_samples: bool = True,
-system_instruction: Optional[str] = None,
+system_instruction: str | None = None,
-apply_chat_template: Union[bool, str] = False,
+apply_chat_template: bool | str = False,
 fewshot_as_multiturn: bool = False,
 verbosity: str = "INFO",
 confirm_run_unsafe_code: bool = False,
@@ -484,12 +490,11 @@ def evaluate(
 # get lists of group hierarchy and each type of request
 eval_tasks = get_task_list(task_dict)
-if not log_samples:
-if not all(
-"bypass" not in getattr(task_output.task, "_metric_fn_list", {}).keys()
-for task_output in eval_tasks
-):
-raise ValueError("log_samples must be True for 'bypass' metric-only tasks")
+if not log_samples and not all(
+"bypass" not in getattr(task_output.task, "_metric_fn_list", {})
+for task_output in eval_tasks
+):
+raise ValueError("log_samples must be True for 'bypass' metric-only tasks")
 # validation checks:
 # 1.are we running multimodal task <-> non-multimodal model class, or vice-versa.
@@ -504,11 +509,10 @@ def evaluate(
 raise ValueError(
 f"Attempted to run task: {task_output.task_name} which is marked as unsafe. Set confirm_run_unsafe_code=True to run this task."
 )
-if len(incompatible_tasks) > 0:
-if not getattr(lm, "MULTIMODAL", False):
-raise ValueError(
-f"Attempted to run tasks: {incompatible_tasks} which require multimodal input, but the selected model type does not currently implement this. Multimodal support is currently restricted to the ['hf-multimodal', 'vllm-vlm'] model type."
-)
+if len(incompatible_tasks) > 0 and not getattr(lm, "MULTIMODAL", False):
+raise ValueError(
+f"Attempted to run tasks: {incompatible_tasks} which require multimodal input, but the selected model type does not currently implement this. Multimodal support is currently restricted to the ['hf-multimodal', 'vllm-vlm'] model type."
+)
 # end validation check
 # Cache the limit arg.
@@ -531,9 +535,7 @@ def evaluate(
 system_instruction=system_instruction,
 apply_chat_template=bool(apply_chat_template),
 fewshot_as_multiturn=fewshot_as_multiturn,
-chat_template=getattr(lm, "apply_chat_template")
-if apply_chat_template
-else None,
+chat_template=getattr(lm, "apply_chat_template", None),
 tokenizer_name=getattr(lm, "tokenizer_name", "")
 if apply_chat_template
 else "",
@@ -606,7 +608,7 @@ def evaluate(
 for instances in instances_by_doc_id.values():
 instances.sort(key=lambda x: x.idx)
 # iterate over different filters used
-for filter_key in task.instances[0].filtered_resps.keys():
+for filter_key in task.instances[0].filtered_resps:
 indices = (
 samples.get(task_output.task_name, None)
 if samples is not None
@@ -619,10 +621,7 @@ def evaluate(
 samples=indices,
 )
 for doc_id, doc in doc_iterator:
-if indices:
-doc_id_true = indices[doc_id]
-else:
-doc_id_true = doc_id
+doc_id_true = indices[doc_id] if indices else doc_id
 requests = instances_by_doc_id[doc_id]
 metrics = task.process_results(
 doc, [req.filtered_resps[filter_key] for req in requests]
@@ -720,7 +719,7 @@ def evaluate(
 ): # subtask list will list "task_name": [] for solo tasks
 for task in task_list:
 for m, h in higher_is_better[task].items():
-if m not in _higher_is_better.keys():
+if m not in _higher_is_better:
 _higher_is_better[m] = h
 if (
...
@@ -3,9 +3,10 @@ from __future__ import annotations
 import copy
 import logging
 import os
+from collections.abc import Iterator, Sequence
 from datetime import timedelta
 from pathlib import Path
-from typing import TYPE_CHECKING, Literal
+from typing import TYPE_CHECKING, Any, Literal
 import jinja2
 import torch
@@ -19,6 +20,7 @@ from accelerate import (
 from accelerate.utils import get_max_memory
 from huggingface_hub import HfApi
 from packaging import version
+from packaging.version import parse as vparse
 from tqdm import tqdm
 from transformers.models.auto.modeling_auto import (
 MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
@@ -26,7 +28,6 @@ from transformers.models.auto.modeling_auto import (
 )
 from lm_eval import utils
-from lm_eval.api.instance import Instance
 from lm_eval.api.model import TemplateLM
 from lm_eval.api.registry import register_model
 from lm_eval.models.utils import (
@@ -44,13 +45,15 @@ from lm_eval.models.utils import (
 if TYPE_CHECKING:
 from transformers.quantizers.auto import AutoQuantizationConfig
+from lm_eval.api.instance import Instance
 eval_logger = logging.getLogger(__name__)
+TOKENIZER_INFINITY = 1000000000000000019884624838656
 @register_model("hf-auto", "hf", "huggingface")
 class HFLM(TemplateLM):
-"""
-An abstracted Huggingface model class. Enables usage with both models of
+"""An abstracted Huggingface model class. Enables usage with both models of
 `transformers.AutoModelForCausalLM` and `transformers.AutoModelForSeq2SeqLM` classes.
 Supports data-parallel multi-GPU with HF Accelerate.
@@ -98,6 +101,8 @@ class HFLM(TemplateLM):
 # end token for thinking, either the string or int token id.
 # splits to get response after this token (if provided).
 think_end_token: str | int | None = None,
+enable_thinking: bool | None = None,
+chat_template_args: dict[str, Any] | None = None,
 **kwargs,
 ) -> None:
 super().__init__()
@@ -237,6 +242,11 @@ class HFLM(TemplateLM):
 self.vocab_size = self.tokenizer.vocab_size
 # select (or create) a pad token to use
 self.tokenizer = configure_pad_token(self.tokenizer, model_config=self.config)
+self.chat_template_args = (
+chat_template_args or {} | dict(enable_thinking=enable_thinking)
+if enable_thinking is not None
+else {}
+)
 self.add_bos_token = add_bos_token
 if "gemma" in getattr(self.config, "model_type", ""):
@@ -370,13 +380,8 @@ class HFLM(TemplateLM):
 }
 else: # Estimating the possible memory requirements
 max_memory_all_gpus = get_max_memory()
-if "cpu" in max_memory_all_gpus:
-del max_memory_all_gpus["cpu"]
-if not hasattr(self, "accelerator"):
-max_memory_per_gpu_map = {
-k: v for k, v in max_memory_all_gpus.items()
-}
-else:
+max_memory_all_gpus.pop("cpu", None)
+if hasattr(self, "accelerator"):
 # use only 1 / num_processes of the GPUs if we are running under accelerate launch
 max_memory_per_gpu_map = {
 k: v
@@ -384,6 +389,9 @@ class HFLM(TemplateLM):
 if k % num_local_processes
 == (self.accelerator.process_index % num_local_processes)
 }
+else:
+max_memory_per_gpu_map = max_memory_all_gpus
 args["max_memory"] = max_memory_per_gpu_map
 args["device_map"] = "auto" if device_map is None else device_map
 eval_logger.info(
@@ -427,12 +435,12 @@ class HFLM(TemplateLM):
 return self._model
 @property
-def eot_token_id(self):
+def eot_token_id(self) -> int:
 # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
 return self.tokenizer.eos_token_id
 @property
-def prefix_token_id(self):
+def prefix_token_id(self) -> int:
 # it is used as prefix for loglikelihood
 if self.custom_prefix_token_id is not None:
 return self.custom_prefix_token_id
@@ -441,7 +449,7 @@ class HFLM(TemplateLM):
 return self.tokenizer.eos_token_id
 @property
-def max_length(self):
+def max_length(self) -> int:
 if self._max_length: # if max length manually set, return it
 return self._max_length
 seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
@@ -449,7 +457,7 @@ class HFLM(TemplateLM):
 if hasattr(self.model.config, attr):
 return getattr(self.model.config, attr)
 if hasattr(self.tokenizer, "model_max_length"):
-if self.tokenizer.model_max_length == 1000000000000000019884624838656:
+if self.tokenizer.model_max_length == TOKENIZER_INFINITY:
 return self._DEFAULT_MAX_LENGTH
 return self.tokenizer.model_max_length
 return self._DEFAULT_MAX_LENGTH
@@ -484,8 +492,8 @@ class HFLM(TemplateLM):
 backend: Literal["default", "causal", "seq2seq"] = "default",
 trust_remote_code: bool | None = False,
 ) -> None:
-"""
-Helper method during initialization.
+"""Helper method during initialization.
 Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder)) model type to be used.
 sets `self.AUTO_MODEL_CLASS` appropriately if not already set.
@@ -504,13 +512,18 @@ class HFLM(TemplateLM):
 )
 else:
 # determine and use the default HF backend for this model, based on its config + metadata.
-if self.config.model_type in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES:
+if (
+getattr(config, "model_type", None)
+in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
+):
 # first check if model type is listed under seq2seq models, since some
 # models like MBart are listed in both seq2seq and causal mistakenly in HF transformers.
 # these special cases should be treated as seq2seq models.
 self.backend = "seq2seq"
 eval_logger.debug(f"Using model type '{self.backend}'")
-elif self.config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
+elif (
+getattr(config, "model_type", None) in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+):
 self.backend = "causal"
 eval_logger.debug(f"Using model type '{self.backend}'")
 else:
@@ -541,7 +554,7 @@ class HFLM(TemplateLM):
 gguf_file: str | None = None,
 subfolder: str = "",
 ) -> None:
-"""Return the model config for HuggingFace models"""
+"""Return the model config for HuggingFace models."""
 self._config = transformers.AutoConfig.from_pretrained(
 pretrained,
 revision=revision,
@@ -574,8 +587,7 @@ class HFLM(TemplateLM):
 subfolder: str = "",
 **kwargs,
 ) -> None:
-"""
-Initializes an HF or HF-compatible PreTrainedModel from scratch
+"""Initializes an HF or HF-compatible PreTrainedModel from scratch
 inside HFLM, using the kwargs passed into self.__init__().
 Also handles functionality such as AutoGPTQ usage and PEFT wrapping.
@@ -586,7 +598,7 @@ class HFLM(TemplateLM):
 please consider subclassing HFLM and overriding this and other methods as needed.
 """
-model_kwargs = kwargs if kwargs else {}
+model_kwargs = kwargs or {}
 model_kwargs.update(
 self._get_accelerate_args(
@@ -600,15 +612,12 @@ class HFLM(TemplateLM):
 )
 if not autogptq and not gptqmodel:
-if model_kwargs.get("load_in_4bit", None):
+if model_kwargs.get("load_in_4bit"):
-assert transformers.__version__ >= "4.30.0", (
+assert vparse(transformers.__version__) >= vparse("4.30.0"), (
 "load_in_4bit requires transformers >= 4.30.0"
 )
-if transformers.__version__ >= "4.30.0" and (
-model_kwargs.get("load_in_4bit")
-and (compute_dtype := model_kwargs.get("bnb_4bit_compute_dtype"))
-):
-model_kwargs["bnb_4bit_compute_dtype"] = get_dtype(compute_dtype)
+if compute_dtype := model_kwargs.get("bnb_4bit_compute_dtype"):
+model_kwargs["bnb_4bit_compute_dtype"] = get_dtype(compute_dtype)
 self._model = self.AUTO_MODEL_CLASS.from_pretrained(
 pretrained,
@@ -666,9 +675,9 @@ class HFLM(TemplateLM):
 if peft:
 from peft import PeftModel, __version__ as PEFT_VERSION
-if model_kwargs.get("load_in_4bit") and version.parse(
-PEFT_VERSION
-) < version.parse("0.4.0"):
+if model_kwargs.get("load_in_4bit") and vparse(PEFT_VERSION) < vparse(
+"0.4.0"
+):
 raise AssertionError("load_in_4bit requires peft >= 0.4.0")
 if self._model.config.vocab_size != len(self.tokenizer):
 # resize model for LoRAs with added tokens
@@ -694,10 +703,10 @@ class HFLM(TemplateLM):
 for name, param in self._model.state_dict().items():
 try:
 param.data += _model_delta.state_dict()[name]
-except KeyError:
+except KeyError as e:
 raise KeyError(
 f"Delta model is missing weights for layer: {name}"
-) from None
+) from e
 except Exception as e:
 raise RuntimeError(
 f"Failed to add delta weights to layer {name}. Error: {e}"
@@ -705,8 +714,6 @@ class HFLM(TemplateLM):
 del _model_delta
-return None
 def _create_tokenizer(
 self,
 pretrained: str | transformers.PreTrainedModel,
@@ -721,8 +728,7 @@ class HFLM(TemplateLM):
 add_bos_token: bool | None = False,
 subfolder: str | None = "",
 ) -> None:
-"""
-Helper method during initialization.
+"""Helper method during initialization.
 Create a tokenizer object corresponding to the correct
 tokenizer for value of `pretrained`, or use the pre-initialized tokenizer passed.
@@ -768,9 +774,8 @@ class HFLM(TemplateLM):
 self.tokenizer = transformers.AutoTokenizer.from_pretrained(
 model_name, **kwargs
 )
-return None
-def _detect_batch_size(self, requests=None, pos: int = 0):
+def _detect_batch_size(self, requests: Sequence | None = None, pos: int = 0):
 if requests:
 _, context_enc, continuation_enc = requests[pos]
 max_length = len(
@@ -785,7 +790,7 @@ class HFLM(TemplateLM):
 # if OOM, then halves batch_size and tries again
 @find_executable_batch_size(starting_batch_size=self.max_batch_size)
-def forward_batch(batch_size):
+def forward_batch(batch_size: int):
 if self.backend == "seq2seq":
 length = max(max_context_enc, max_cont_enc)
 batched_conts = torch.ones(
@@ -832,7 +837,10 @@ class HFLM(TemplateLM):
 return batch_size
 def tok_encode(
-self, string: str, left_truncate_len=None, add_special_tokens=None
+self,
+string: str,
+left_truncate_len: int | None = None,
+add_special_tokens: bool | None = None,
 ) -> list[int]:
 """ """
 # default for None - empty dict, use predefined tokenizer param
@@ -861,7 +869,7 @@ class HFLM(TemplateLM):
 self,
 strings: list[str],
 padding_side: str = "left",
-left_truncate_len: int = None,
+left_truncate_len: int | None = None,
 truncation: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor]:
 # encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
@@ -882,7 +890,7 @@ class HFLM(TemplateLM):
 if left_truncate_len:
 original_lengths = encoding["input_ids"].size(1)
 if original_lengths > left_truncate_len:
-eval_logger.warn(
+eval_logger.warning(
 f"Left truncation applied. Original sequence length was {original_lengths}, "
 f"truncating to last {left_truncate_len} tokens. Some content will be lost.",
 )
@@ -894,11 +902,17 @@ class HFLM(TemplateLM):
 return encoding["input_ids"], encoding["attention_mask"]
-def tok_decode(self, tokens, skip_special_tokens=True):
+def tok_decode(self, tokens: Iterator[list[str]], skip_special_tokens: bool = True):
 return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens)
-def _model_call(self, inps, attn_mask=None, labels=None):
+def _model_call(
+self,
+inps: torch.Tensor,
+attn_mask: torch.Tensor | None = None,
+labels: torch.Tensor | None = None,
+) -> torch.Tensor:
 """
 :param inps: torch.Tensor
 A torch tensor of shape [batch, (sequence_ctx + sequence_cont)] or of shape
 [batch, sequence_ctx]. the size of sequence may vary from call to call
@@ -926,14 +940,20 @@ class HFLM(TemplateLM):
 return self.model(
 input_ids=inps, attention_mask=attn_mask, labels=labels
 ).logits
-else:
 assert self.AUTO_MODEL_CLASS in (
 transformers.AutoModelForCausalLM,
 transformers.AutoModelForVision2Seq,
 )
 return self.model(inps).logits
-def _model_generate(self, context, max_length, stop, **generation_kwargs):
+def _model_generate(
+self,
+context,
+max_length: int,
+stop: list[str],
+**generation_kwargs: dict[str, Any],
+) -> torch.Tensor:
 # temperature = 0.0 if not set
 # if do_sample is false and temp==0.0:
 # remove temperature, as do_sample=False takes care of this
@@ -966,7 +986,10 @@ class HFLM(TemplateLM):
 )
 def _select_cont_toks(
-self, logits: torch.Tensor, contlen: int = None, inplen: int = None
+self,
+logits: torch.Tensor,
+contlen: int | None = None,
+inplen: int | None = None,
 ) -> torch.Tensor:
 if self.backend == "causal":
 assert contlen and inplen, (
@@ -1092,13 +1115,13 @@ class HFLM(TemplateLM):
 self,
 requests: list[tuple[tuple[str, str], list[int], list[int]]],
 disable_tqdm: bool = False,
-override_bs: int = None,
+override_bs: int | None = None,
 ) -> list[tuple[float, bool]]:
 # TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
 res = []
 def _collate(req: tuple[tuple[str, str], list[int], list[int]]):
-"""Defines the key for the sorted method"""
+"""Defines the key for the sorted method."""
 # the negative sign on len(toks) sorts descending - this has a few advantages:
 # - time estimates will always be over not underestimates, which is more useful for planning
 # - to know the size of a batch when going through the list, you know the first one is always the batch
@@ -1110,7 +1133,7 @@ class HFLM(TemplateLM):
 return -len(toks), tuple(toks)
 def _lookup_one_token_cont(req: tuple[tuple[str, str], list[int], list[int]]):
-"""Defines the key to group and lookup one-token continuations"""
+"""Defines the key to group and lookup one-token continuations."""
 # Use with group_by="contexts" (optional)"
 # allows for the creation of a lookup, so we can reuse logits in case of one-token continuations.
 # speeds up some multiple-choice tasks proportionally to the number of choices.
@@ -1388,7 +1411,7 @@ class HFLM(TemplateLM):
 # add EOS token to stop sequences
 until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
 else:
-raise ValueError(
+raise TypeError(
 f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
 )
 if "max_gen_toks" in kwargs:
@@ -1471,15 +1494,14 @@ class HFLM(TemplateLM):
 def apply_chat_template(
 self, chat_history: list[dict[str, str]], add_generation_prompt: bool = True
 ) -> str:
-"""
-Method to apply a chat template to a list of chat history between user and model.
-"""
+"""Method to apply a chat template to a list of chat history between user and model."""
 try:
 chat_templated = self.tokenizer.apply_chat_template(
 chat_history,
 tokenize=False,
 add_generation_prompt=add_generation_prompt,
 continue_final_message=not add_generation_prompt,
+**self.chat_template_args,
 )
 except jinja2.exceptions.TemplateError:
 eval_logger.warning(
@@ -1491,14 +1513,13 @@ class HFLM(TemplateLM):
 tokenize=False,
 add_generation_prompt=add_generation_prompt,
 continue_final_message=not add_generation_prompt,
+**self.chat_template_args,
 )
 return chat_templated
 def get_model_info(self) -> dict:
-"""
-Method to get Hugging Face model information for experiment reproducibility.
-"""
+"""Method to get Hugging Face model information for experiment reproducibility."""
 def get_model_num_params(model) -> int:
 if hasattr(model, "num_parameters"):
...
@@ -133,11 +133,11 @@ class VLLM(TemplateLM):
 max_model_len: int | None = None,
 seed: int = 1234,
 gpu_memory_utilization: float = 0.9,
-device: str = "cuda",
 data_parallel_size: int = 1,
 lora_local_path: str | None = None,
 # VLLM: enable thinking tags in the prompt.
 enable_thinking: bool = True,
+chat_template_args: dict | None = None,
 # End marker for thinking tags - splits to get response after this token (if provided).
 think_end_token: str | None = None,
 max_lora_rank: int = 16,
@@ -154,6 +154,7 @@ class VLLM(TemplateLM):
 assert max_length is None or max_model_len is None, (
 "Either max_length or max_model_len may be provided, but not both"
 )
+kwargs.pop("device", None)
 self.think_end_token = think_end_token
 self.V1 = os.environ.get("VLLM_USE_V1", "1") != "0"
 self._max_length = max_model_len if max_model_len is not None else max_length
@@ -174,7 +175,6 @@ class VLLM(TemplateLM):
 "swap_space": int(swap_space),
 "quantization": quantization,
 "seed": int(seed),
-"device": str(device),
 "enable_lora": bool(lora_local_path),
 "max_lora_rank": int(max_lora_rank),
 }
@@ -211,7 +211,10 @@ class VLLM(TemplateLM):
 add_bos_token=add_bos_token,
 )
 self.tokenizer = configure_pad_token(self.tokenizer, model_config=self._config)
-self.enable_thinking = enable_thinking
+self.chat_template_args = chat_template_args or {}
+self.enable_thinking = self.chat_template_args.pop(
+"enable_thinking", enable_thinking
+)
 self.add_bos_token = add_bos_token
 if "gemma" in pretrained.lower():
 self.add_bos_token = True
@@ -319,6 +322,7 @@ class VLLM(TemplateLM):
 continue_final_message=not add_generation_prompt,
 chat_template=self.hf_chat_template,
 enable_thinking=self.enable_thinking,
+**self.chat_template_args,
 )
 except jinja2.exceptions.TemplateError:
 eval_logger.warning(
@@ -331,6 +335,7 @@ class VLLM(TemplateLM):
 continue_final_message=not add_generation_prompt,
 chat_template=self.hf_chat_template,
 enable_thinking=self.enable_thinking,
+**self.chat_template_args,
 )
 return chat_templated
...
@@ -85,6 +85,7 @@
 | [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese |
 | [leaderboard](leaderboard/README.md) | Task group used by Hugging Face's [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). Those tasks are static and will not change through time | English |
 | [lingoly](lingoly/README.md) | Challenging logical reasoning benchmark in low-resource languages with controls for memorization | English, Multilingual |
+| [libra](libra/README.md) | Evaluates long-context understanding in Russian across four complexity levels | Russian (MT) |
 | [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese |
 | [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese |
 | [mastermind](mastermind/README.md) | Reasoning benchmark based on the board game of Mastermind. | English |
...
@@ -4,9 +4,9 @@ include: _boolq_cot_2shot_yaml
 fewshot_config:
 sampler: first_n
 samples:
-- context: 'This is a ferry domain, where the task is to transport cars from their start to their goal locations, using a ferry. Each location is accessible by ferry from each other location. The cars can be debarked or boarded, and the ferry can carry only one car at a time. There are 2 locations and 5 cars, numbered consecutively. Currently, the ferry is at l1, with the car c4 on board. The cars are at locations as follows: c0 and c3 are at l1; c1 and c2 are at l0.'
+- context: "This is a ferry domain, where the task is to transport cars from their start to their goal locations, using a ferry. Each location is accessible by ferry from each other location. The cars can be debarked or boarded, and the ferry can carry only one car at a time. There are 2 locations and 5 cars, numbered consecutively. Currently, the ferry is at l1, with the car c4 on board. The cars are at locations as follows: c0 and c3 are at l1; c1 and c2 are at l0."
 question: 'Is it possible to transition to a state where the action "travel by sea from location l0 to location l1" can be applied?'
 answer: "Let's think step by step. Step 1: Verify if there is a sequence of actions which transforms the current state into a state where the precondition of the action \"travel by sea from location l0 to location l1\" hold. Step 2: The following sequence of actions would transition to such a state: sail from location l1 to location l0, unload the car c4 from the ferry to location l0, board car c1 at location l0. **Final Answer**: Yes."
-- context: 'There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. There are 2 trucks and 1 airplane, as well as 4 packages. There are 6 locations across 2 cities. The locations are in cities as follows: l0-0, l0-1, and l0-2 are in c0; l1-1, l1-2, and l1-0 are in c1. Currently, a0 is at l1-0, t1 is at l1-1, t0 is at l0-0, p2 and p1 are in t1, p0 and p3 are in a0.'
+- context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. There are 2 trucks and 1 airplane, as well as 4 packages. There are 6 locations across 2 cities. The locations are in cities as follows: l0-0, l0-1, and l0-2 are in c0; l1-1, l1-2, and l1-0 are in c1. Currently, a0 is at l1-0, t1 is at l1-1, t0 is at l0-0, p2 and p1 are in t1, p0 and p3 are in a0."
 question: 'Is it possible to transition to a state where the action "offload the object p0 from the truck p0 at location p1" can be applied?'
 answer: "Let's think step by step. Step 1: Verify if there is a sequence of actions which transforms the current state into a state where the precondition of the action \"offload the object p0 from the truck p0 at location p1\" hold. Step 2: Action preconditions are \"p0 is in p0 and p0 is at p1\". Step 3: These facts are not reachable together, as they include mutually exclusive facts \"p0 is in p0 and p0 is at p1\". **Final Answer**: No."
@@ -67,7 +67,7 @@ def span_f1_agg(items):
 def remove_blank_spaces(text):
 text = re.sub(pattern=get_blank_spaces_pattern(), repl="", string=text)
-text = re.sub("\s+", " ", text)
+text = re.sub(r"\s+", " ", text)
 return text
 def remove_punctuation(text):
...
@@ -67,7 +67,7 @@ def span_f1_agg(items):
 def remove_blank_spaces(text):
 text = re.sub(pattern=get_blank_spaces_pattern(), repl="", string=text)
-text = re.sub("\s+", " ", text)
+text = re.sub(r"\s+", " ", text)
 return text
 def remove_punctuation(text):
...
@@ -67,7 +67,7 @@ def span_f1_agg(items):
 def remove_blank_spaces(text):
 text = re.sub(pattern=get_blank_spaces_pattern(), repl="", string=text)
-text = re.sub("\s+", " ", text)
+text = re.sub(r"\s+", " ", text)
 return text
 def remove_punctuation(text):
...
@@ -67,7 +67,7 @@ def span_f1_agg(items):
 def remove_blank_spaces(text):
 text = re.sub(pattern=get_blank_spaces_pattern(), repl="", string=text)
-text = re.sub("\s+", " ", text)
+text = re.sub(r"\s+", " ", text)
 return text
 def remove_punctuation(text):
...
@@ -67,7 +67,7 @@ def span_f1_agg(items):
 def remove_blank_spaces(text):
 text = re.sub(pattern=get_blank_spaces_pattern(), repl="", string=text)
-text = re.sub("\s+", " ", text)
+text = re.sub(r"\s+", " ", text)
 return text
 def remove_punctuation(text):
...
@@ -12,9 +12,9 @@ def prompt_func(mode, lang):
 "prompt_3": f"You are an assistant able to classify topics in texts. \n\n"
 f"Given the categories technology, religion, politics, sports, health, entertainment, or business; what is "
 f"the topic of the {lang} statement below? Return only the category. "
-"\n\ntext: {{headline}} \category:\n\n",
+"\n\ntext: {{headline}} \\category:\n\n",
 "prompt_4": "Label the following text as technology, religion, politics, sports, health, entertainment, or geography. Provide only the category as your "
-"response. \n\ntext: {{headline}} \category: \n\n",
+"response. \n\ntext: {{headline}} \\category: \n\n",
 "prompt_5": f"You are tasked with performing topic classification on the following {lang} text. "
 f"For each input, classify the topic as technology, business, politics, sports, health, entertainment, or religion. "
 f"Use the following guidelines: \n\n "
@@ -27,7 +27,7 @@ def prompt_func(mode, lang):
 f"business: The text covers economy, business, or related topics. \n\n"
 f"If the text contains multiple topics, choose the dominant topic. "
 f"For ambiguous or unclear topics, select the category that best reflects the overall content. "
-"Please provide a single classification for each input.\n\ntext: {{headline}} \category: \n\n",
+"Please provide a single classification for each input.\n\ntext: {{headline}} \\category: \n\n",
 }
 return prompt_map[mode]
...
@@ -17,9 +17,9 @@ def prompt_func(mode, lang):
 "prompt_3": f"You are an assistant able to classify topics in texts. \n\n"
 f"Given the categories science/technology, travel, politics, sports, health, entertainment, or geography; what is "
 f"the topic of the {lang} statement below? Return only the category. "
-"\n\ntext: {{text}} \category:\n\n",
+"\n\ntext: {{text}} \\category:\n\n",
 "prompt_4": "Label the following text as science/technology, travel, politics, sports, health, entertainment, or geography. Provide only the category as your "
-"response. \n\ntext: {{text}} \category: \n\n",
+"response. \n\ntext: {{text}} \\category: \n\n",
 "prompt_5": f"You are tasked with performing topic classification on the following {lang} text. "
 f"For each input, classify the topic as science/technology, travel, politics, sports, health, entertainment, or geography. "
 f"Use the following guidelines: \n\n "
@@ -32,7 +32,7 @@ def prompt_func(mode, lang):
 f"geography: The text involves geographical information, locations, or related topics. \n\n"
 f"If the text contains multiple topics, choose the dominant topic. "
 f"For ambiguous or unclear topics, select the category that best reflects the overall content. "
-"Please provide a single classification for each input.\n\ntext: {{text}} \category: \n\n",
+"Please provide a single classification for each input.\n\ntext: {{text}} \\category: \n\n",
 }
 return prompt_map[mode]
...
@@ -4,8 +4,6 @@ tag:
 task: null
 dataset_path: csebuetnlp/xlsum
 dataset_name: null
-dataset_kwargs:
-trust_remote_code: true
 output_type: generate_until
 generation_kwargs:
 until:
...
@@ -4,8 +4,6 @@ tag:
 task: null
 dataset_path: csebuetnlp/xlsum
 dataset_name: null
-dataset_kwargs:
-trust_remote_code: true
 output_type: generate_until
 generation_kwargs:
 until:
...
@@ -4,8 +4,6 @@ tag:
 task: null
 dataset_path: csebuetnlp/xlsum
 dataset_name: null
-dataset_kwargs:
-trust_remote_code: true
 output_type: generate_until
 generation_kwargs:
 until:
...
@@ -47,7 +47,7 @@ def parse_math_answer(raw_string):
 return retval
 def get_answer_with_dollar_sign(s):
-first_pattern = "\$(.*)\$"
+first_pattern = r"\$(.*)\$"
 last_match = None
 matches = re.findall(first_pattern, s)
 if matches:
@@ -63,7 +63,7 @@ def parse_math_answer(raw_string):
 if "\\n" in last_match:
 last_match = last_match.split("\\n")[0]
 else:
-pattern = "(?:\\$)?\d+(?:\.\d+)?(?![\w\d])"
+pattern = "(?:\\$)?\\d+(?:\\.\\d+)?(?![\\w\\d])"
 matches = re.findall(pattern, s)
 if matches:
 last_match = matches[-1]
@@ -186,7 +186,7 @@ def _strip_string(string):
 # remove percentage
 string = string.replace("\\%", "")
-string = string.replace("\%", "")
+string = string.replace(r"\%", "")
 # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
 string = string.replace(" .", " 0.")
...
@@ -15,5 +15,3 @@ metric_list:
 higher_is_better: true
 metadata:
 version: 2.0
-dataset_kwargs:
-trust_remote_code: true