Commit e6b798f9 authored by Baber

Merge branch 'main' into metrics

# Conflicts:
#	.pre-commit-config.yaml
#	lm_eval/api/task.py
#	lm_eval/models/huggingface.py
#	lm_eval/models/vllm_causallms.py
#	pyproject.toml
parents 14a29ade 4f8195f1
......@@ -34,7 +34,6 @@ repos:
# Run the linter.
- id: ruff-check
args: [ --fix]
# Run the formatter.
- id: ruff-format
- repo: https://github.com/codespell-project/codespell
rev: v2.4.1
......@@ -42,8 +41,10 @@ repos:
- id: codespell
exclude: >
(?x)^(
.*\.json|ignore.txt|lm_eval/tasks/.*|.*yaml|.*\.ipynb
)$
args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
- repo: https://github.com/jackdewinter/pymarkdown
rev: v0.9.30
......@@ -51,9 +52,3 @@ repos:
- id: pymarkdown
exclude: ^(lm_eval/tasks/.*|docs/footguns\.md)$
args: [fix, -r]
# - repo: https://github.com/pre-commit/mirrors-mypy
# rev: v1.5.1
# hooks:
# - id: mypy
# additional_dependencies: [".[sentencepiece,multilingual,promptsource,gptq]", "types-PyYAML", "types-requests"]
# exclude: ^tests/.*$
......@@ -17,7 +17,7 @@ def try_parse_json(value: str) -> Union[str, dict, None]:
if "{" in value:
raise argparse.ArgumentTypeError(
f"Invalid JSON: {value}. Hint: Use double quotes for JSON strings."
)
) from None
return value
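The hunk above only shows the error branch of `try_parse_json`; a self-contained reconstruction of such a parser (an assumption about the surrounding lines, not the project's exact helper) could look like:

```python
import argparse
import json
from typing import Union


def try_parse_json(value: str) -> Union[str, dict, None]:
    """Return a dict if `value` parses as JSON, otherwise the raw string."""
    if value is None:
        return None
    try:
        return json.loads(value)
    except json.JSONDecodeError:
        if "{" in value:
            raise argparse.ArgumentTypeError(
                f"Invalid JSON: {value}. Hint: Use double quotes for JSON strings."
            ) from None
        return value
```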
......@@ -30,8 +30,8 @@ def _int_or_none_list_arg_type(
return None
try:
return int(item)
except ValueError:
raise argparse.ArgumentTypeError(f"{item} is not an integer or None")
except ValueError as e:
raise argparse.ArgumentTypeError(f"{item} is not an integer or None") from e
items = [parse_value(v) for v in value.split(split_char)]
num_items = len(items)
......@@ -433,8 +433,10 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
# because it's already been determined based on the prior env var before launching our
# script--`datasets` gets imported by lm_eval internally before these lines can update the env.
import datasets
from packaging.version import parse as vparse
datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
if vparse(datasets.__version__) < vparse("4.0.0"):
datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
if isinstance(args.model_args, dict):
args.model_args["trust_remote_code"] = True
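The comment above explains why `datasets.config` has to be patched directly, and the merged branch now gates that patch on the installed `datasets` version, since 4.0 removed the `trust_remote_code` path. A standalone sketch combining both guards touched in this merge (illustrative, with a hypothetical helper name):

```python
# Illustrative sketch, not the committed lines; `adjust_trust_remote_code` is hypothetical.
import datasets
from packaging.version import parse as vparse

DATASETS_V4 = vparse(datasets.__version__) >= vparse("4.0.0")


def adjust_trust_remote_code(dataset_kwargs: dict | None) -> dict | None:
    """datasets >= 4.0 dropped trust_remote_code, so strip it; older versions keep the flag."""
    if DATASETS_V4 and dataset_kwargs:
        dataset_kwargs.pop("trust_remote_code", None)
    elif not DATASETS_V4:
        datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
    return dataset_kwargs
```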
......@@ -510,7 +512,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
)
if args.log_samples:
for task_name, config in results["configs"].items():
for task_name, _config in results["configs"].items():
evaluation_tracker.save_results_samples(
task_name=task_name, samples=samples[task_name]
)
......
......@@ -663,6 +663,11 @@ class ConfigurableTask(Task):
print("hello")
def download(self, dataset_kwargs: dict[str, Any] | None = None, **kwargs) -> None:
from packaging.version import parse as vparse
if dataset_kwargs and vparse(datasets.__version__) >= vparse("4.0.0"):
dataset_kwargs.pop("trust_remote_code", None)
self.config.dataset_kwargs, self.config.metadata = (
self.config.dataset_kwargs or {},
self.config.metadata or {},
......
......@@ -7,7 +7,7 @@ import os
import random
import time
from collections import defaultdict
from typing import TYPE_CHECKING, Any, List, Optional, Union
from typing import TYPE_CHECKING, Any
import numpy as np
import torch
......@@ -37,6 +37,7 @@ from lm_eval.utils import (
positional_deprecated,
setup_logging,
simple_parse_args_string,
wrap_text,
)
......@@ -50,28 +51,28 @@ eval_logger = logging.getLogger(__name__)
@positional_deprecated
def simple_evaluate(
model,
model_args: Optional[Union[str, dict[str, Any]]] = None,
tasks: Optional[List[Union[str, dict, object]]] = None,
num_fewshot: Optional[int] = None,
batch_size: Optional[Union[int, str]] = None,
max_batch_size: Optional[int] = None,
device: Optional[str] = None,
use_cache: Optional[str] = None,
model_args: str | dict[str, Any] | None = None,
tasks: list[str | dict | object] | None = None,
num_fewshot: int | None = None,
batch_size: int | str | None = None,
max_batch_size: int | None = None,
device: str | None = None,
use_cache: str | None = None,
cache_requests: bool = False,
rewrite_requests_cache: bool = False,
delete_requests_cache: bool = False,
limit: Optional[Union[int, float]] = None,
samples: Optional[dict] = None,
limit: int | float | None = None,
samples: dict | None = None,
bootstrap_iters: int = 100000,
check_integrity: bool = False,
write_out: bool = False,
log_samples: bool = True,
evaluation_tracker: Optional[EvaluationTracker] = None,
system_instruction: Optional[str] = None,
apply_chat_template: Union[bool, str] = False,
evaluation_tracker: EvaluationTracker | None = None,
system_instruction: str | None = None,
apply_chat_template: bool | str = False,
fewshot_as_multiturn: bool = False,
gen_kwargs: Union[str, dict, None] = None,
task_manager: Optional[TaskManager] = None,
gen_kwargs: str | dict | None = None,
task_manager: TaskManager | None = None,
verbosity=None,
predict_only: bool = False,
random_seed: int = 0,
......@@ -79,7 +80,7 @@ def simple_evaluate(
torch_random_seed: int = 1234,
fewshot_random_seed: int = 1234,
confirm_run_unsafe_code: bool = False,
metadata: Optional[dict] = None,
metadata: dict | None = None,
):
"""Instantiate and evaluate a model on a list of tasks.
......@@ -171,8 +172,11 @@ def simple_evaluate(
)
) and not apply_chat_template:
eval_logger.warning(
"Model appears to be an instruct or chat variant but chat template is not applied. "
"Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`)."
wrap_text(
f"""pretrained={model_args.get("pretrained") if isinstance(model_args, dict) else model_args} appears to be an
instruct or chat variant but chat template is not applied.
Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`).""",
)
)
if delete_requests_cache:
......@@ -236,7 +240,9 @@ def simple_evaluate(
else:
eval_logger.info(
f"Initializing {model} model, with arguments: {simple_parse_args_string(model_args)}"
wrap_text(
f"Initializing {model} model, with arguments: {simple_parse_args_string(model_args)}"
)
)
lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
model_args,
......@@ -283,7 +289,7 @@ def simple_evaluate(
# helper function to recursively apply config overrides to leaf subtasks, skipping their constituent groups.
# (setting of num_fewshot ; bypassing metric calculation ; setting fewshot seed)
def _adjust_config(task_dict: dict[str, "Task"]) -> dict[str, "Task"]:
def _adjust_config(task_dict: dict[str, Task]) -> dict[str, Task]:
adjusted_task_dict = {}
for task_name, task_obj in task_dict.items():
if isinstance(task_obj, dict):
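The comment above describes `_adjust_config` as a recursive walk that applies overrides (fewshot count, fewshot seed, metric bypass) only to leaf tasks while passing groups through. A minimal sketch of that recursion pattern (hypothetical names, not the project's implementation):

```python
# Minimal sketch of the recursive override pattern described above
# (hypothetical function; the real code mutates Task objects with richer logic).
def adjust_config(task_dict: dict, overrides: dict) -> dict:
    adjusted = {}
    for name, obj in task_dict.items():
        if isinstance(obj, dict):            # group node: recurse into its subtasks
            adjusted[name] = adjust_config(obj, overrides)
        else:                                # leaf task: apply the overrides here
            for key, value in overrides.items():
                setattr(obj, key, value)
            adjusted[name] = obj
    return adjusted
```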
......@@ -414,17 +420,17 @@ def simple_evaluate(
@positional_deprecated
def evaluate(
lm: "LM",
lm: LM,
task_dict,
limit: int | float | None = None,
samples: Optional[dict] = None,
samples: dict | None = None,
cache_requests: bool = False,
rewrite_requests_cache: bool = False,
bootstrap_iters: Optional[int] = 100000,
bootstrap_iters: int | None = 100000,
write_out: bool = False,
log_samples: bool = True,
system_instruction: Optional[str] = None,
apply_chat_template: Union[bool, str] = False,
system_instruction: str | None = None,
apply_chat_template: bool | str = False,
fewshot_as_multiturn: bool = False,
verbosity: str = "INFO",
confirm_run_unsafe_code: bool = False,
......@@ -484,12 +490,11 @@ def evaluate(
# get lists of group hierarchy and each type of request
eval_tasks = get_task_list(task_dict)
if not log_samples:
if not all(
"bypass" not in getattr(task_output.task, "_metric_fn_list", {}).keys()
for task_output in eval_tasks
):
raise ValueError("log_samples must be True for 'bypass' metric-only tasks")
if not log_samples and not all(
"bypass" not in getattr(task_output.task, "_metric_fn_list", {})
for task_output in eval_tasks
):
raise ValueError("log_samples must be True for 'bypass' metric-only tasks")
# validation checks:
# 1.are we running multimodal task <-> non-multimodal model class, or vice-versa.
......@@ -504,11 +509,10 @@ def evaluate(
raise ValueError(
f"Attempted to run task: {task_output.task_name} which is marked as unsafe. Set confirm_run_unsafe_code=True to run this task."
)
if len(incompatible_tasks) > 0:
if not getattr(lm, "MULTIMODAL", False):
raise ValueError(
f"Attempted to run tasks: {incompatible_tasks} which require multimodal input, but the selected model type does not currently implement this. Multimodal support is currently restricted to the ['hf-multimodal', 'vllm-vlm'] model type."
)
if len(incompatible_tasks) > 0 and not getattr(lm, "MULTIMODAL", False):
raise ValueError(
f"Attempted to run tasks: {incompatible_tasks} which require multimodal input, but the selected model type does not currently implement this. Multimodal support is currently restricted to the ['hf-multimodal', 'vllm-vlm'] model type."
)
# end validation check
# Cache the limit arg.
......@@ -531,9 +535,7 @@ def evaluate(
system_instruction=system_instruction,
apply_chat_template=bool(apply_chat_template),
fewshot_as_multiturn=fewshot_as_multiturn,
chat_template=getattr(lm, "apply_chat_template")
if apply_chat_template
else None,
chat_template=getattr(lm, "apply_chat_template", None),
tokenizer_name=getattr(lm, "tokenizer_name", "")
if apply_chat_template
else "",
......@@ -606,7 +608,7 @@ def evaluate(
for instances in instances_by_doc_id.values():
instances.sort(key=lambda x: x.idx)
# iterate over different filters used
for filter_key in task.instances[0].filtered_resps.keys():
for filter_key in task.instances[0].filtered_resps:
indices = (
samples.get(task_output.task_name, None)
if samples is not None
......@@ -619,10 +621,7 @@ def evaluate(
samples=indices,
)
for doc_id, doc in doc_iterator:
if indices:
doc_id_true = indices[doc_id]
else:
doc_id_true = doc_id
doc_id_true = indices[doc_id] if indices else doc_id
requests = instances_by_doc_id[doc_id]
metrics = task.process_results(
doc, [req.filtered_resps[filter_key] for req in requests]
......@@ -720,7 +719,7 @@ def evaluate(
): # subtask list will list "task_name": [] for solo tasks
for task in task_list:
for m, h in higher_is_better[task].items():
if m not in _higher_is_better.keys():
if m not in _higher_is_better:
_higher_is_better[m] = h
if (
......
......@@ -3,9 +3,10 @@ from __future__ import annotations
import copy
import logging
import os
from collections.abc import Iterator, Sequence
from datetime import timedelta
from pathlib import Path
from typing import TYPE_CHECKING, Literal
from typing import TYPE_CHECKING, Any, Literal
import jinja2
import torch
......@@ -19,6 +20,7 @@ from accelerate import (
from accelerate.utils import get_max_memory
from huggingface_hub import HfApi
from packaging import version
from packaging.version import parse as vparse
from tqdm import tqdm
from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
......@@ -26,7 +28,6 @@ from transformers.models.auto.modeling_auto import (
)
from lm_eval import utils
from lm_eval.api.instance import Instance
from lm_eval.api.model import TemplateLM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import (
......@@ -44,13 +45,15 @@ from lm_eval.models.utils import (
if TYPE_CHECKING:
from transformers.quantizers.auto import AutoQuantizationConfig
from lm_eval.api.instance import Instance
eval_logger = logging.getLogger(__name__)
TOKENIZER_INFINITY = 1000000000000000019884624838656
@register_model("hf-auto", "hf", "huggingface")
class HFLM(TemplateLM):
"""
An abstracted Huggingface model class. Enables usage with both models of
"""An abstracted Huggingface model class. Enables usage with both models of
`transformers.AutoModelForCausalLM` and `transformers.AutoModelForSeq2SeqLM` classes.
Supports data-parallel multi-GPU with HF Accelerate.
......@@ -98,6 +101,8 @@ class HFLM(TemplateLM):
# end token for thinking, either the string or int token id.
# splits to get response after this token (if provided).
think_end_token: str | int | None = None,
enable_thinking: bool | None = None,
chat_template_args: dict[str, Any] | None = None,
**kwargs,
) -> None:
super().__init__()
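The new `think_end_token` argument is documented above as the marker after which the real response starts. A hedged sketch of that splitting behavior (assumed, not the committed helper):

```python
# Hedged sketch of the assumed splitting behavior; not the committed helper.
def strip_thinking(text: str, think_end_token: str | None) -> str:
    """Keep only the text after the think-end marker when one is configured."""
    if think_end_token and think_end_token in text:
        return text.split(think_end_token, 1)[-1]
    return text


strip_thinking("<think>plan...</think> final answer", "</think>")  # -> " final answer"
```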
......@@ -237,6 +242,11 @@ class HFLM(TemplateLM):
self.vocab_size = self.tokenizer.vocab_size
# select (or create) a pad token to use
self.tokenizer = configure_pad_token(self.tokenizer, model_config=self.config)
self.chat_template_args = (
chat_template_args or {} | dict(enable_thinking=enable_thinking)
if enable_thinking is not None
else {}
)
self.add_bos_token = add_bos_token
if "gemma" in getattr(self.config, "model_type", ""):
......@@ -370,13 +380,8 @@ class HFLM(TemplateLM):
}
else: # Estimating the possible memory requirements
max_memory_all_gpus = get_max_memory()
if "cpu" in max_memory_all_gpus:
del max_memory_all_gpus["cpu"]
if not hasattr(self, "accelerator"):
max_memory_per_gpu_map = {
k: v for k, v in max_memory_all_gpus.items()
}
else:
max_memory_all_gpus.pop("cpu", None)
if hasattr(self, "accelerator"):
# use only 1 / num_processes of the GPUs if we are running under accelerate launch
max_memory_per_gpu_map = {
k: v
......@@ -384,6 +389,9 @@ class HFLM(TemplateLM):
if k % num_local_processes
== (self.accelerator.process_index % num_local_processes)
}
else:
max_memory_per_gpu_map = max_memory_all_gpus
args["max_memory"] = max_memory_per_gpu_map
args["device_map"] = "auto" if device_map is None else device_map
eval_logger.info(
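For reference, the accelerate-launch branch above keeps only the GPUs whose index maps to the current process; a toy illustration of that partition (hypothetical values, not the committed lines):

```python
# Toy illustration: each process keeps the devices whose index matches its
# process index modulo the local process count.
max_memory_all_gpus = {0: 80, 1: 80, 2: 80, 3: 80}  # GPU id -> memory budget (toy numbers)
max_memory_all_gpus.pop("cpu", None)                 # CPU entry is never used here
num_local_processes = 2
process_index = 1
max_memory_per_gpu_map = {
    k: v
    for k, v in max_memory_all_gpus.items()
    if k % num_local_processes == (process_index % num_local_processes)
}
# -> {1: 80, 3: 80} for process 1; process 0 would keep {0: 80, 2: 80}
```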
......@@ -427,12 +435,12 @@ class HFLM(TemplateLM):
return self._model
@property
def eot_token_id(self):
def eot_token_id(self) -> int:
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
return self.tokenizer.eos_token_id
@property
def prefix_token_id(self):
def prefix_token_id(self) -> int:
# it is used as prefix for loglikelihood
if self.custom_prefix_token_id is not None:
return self.custom_prefix_token_id
......@@ -441,7 +449,7 @@ class HFLM(TemplateLM):
return self.tokenizer.eos_token_id
@property
def max_length(self):
def max_length(self) -> int:
if self._max_length: # if max length manually set, return it
return self._max_length
seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
......@@ -449,7 +457,7 @@ class HFLM(TemplateLM):
if hasattr(self.model.config, attr):
return getattr(self.model.config, attr)
if hasattr(self.tokenizer, "model_max_length"):
if self.tokenizer.model_max_length == 1000000000000000019884624838656:
if self.tokenizer.model_max_length == TOKENIZER_INFINITY:
return self._DEFAULT_MAX_LENGTH
return self.tokenizer.model_max_length
return self._DEFAULT_MAX_LENGTH
......@@ -484,8 +492,8 @@ class HFLM(TemplateLM):
backend: Literal["default", "causal", "seq2seq"] = "default",
trust_remote_code: bool | None = False,
) -> None:
"""
Helper method during initialization.
"""Helper method during initialization.
Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder)) model type to be used.
sets `self.AUTO_MODEL_CLASS` appropriately if not already set.
......@@ -504,13 +512,18 @@ class HFLM(TemplateLM):
)
else:
# determine and use the default HF backend for this model, based on its config + metadata.
if self.config.model_type in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES:
if (
getattr(config, "model_type", None)
in MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
):
# first check if model type is listed under seq2seq models, since some
# models like MBart are listed in both seq2seq and causal mistakenly in HF transformers.
# these special cases should be treated as seq2seq models.
self.backend = "seq2seq"
eval_logger.debug(f"Using model type '{self.backend}'")
elif self.config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
elif (
getattr(config, "model_type", None) in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
):
self.backend = "causal"
eval_logger.debug(f"Using model type '{self.backend}'")
else:
......@@ -541,7 +554,7 @@ class HFLM(TemplateLM):
gguf_file: str | None = None,
subfolder: str = "",
) -> None:
"""Return the model config for HuggingFace models"""
"""Return the model config for HuggingFace models."""
self._config = transformers.AutoConfig.from_pretrained(
pretrained,
revision=revision,
......@@ -574,8 +587,7 @@ class HFLM(TemplateLM):
subfolder: str = "",
**kwargs,
) -> None:
"""
Initializes an HF or HF-compatible PreTrainedModel from scratch
"""Initializes an HF or HF-compatible PreTrainedModel from scratch
inside HFLM, using the kwargs passed into self.__init__().
Also handles functionality such as AutoGPTQ usage and PEFT wrapping.
......@@ -586,7 +598,7 @@ class HFLM(TemplateLM):
please consider subclassing HFLM and overriding this and other methods as needed.
"""
model_kwargs = kwargs if kwargs else {}
model_kwargs = kwargs or {}
model_kwargs.update(
self._get_accelerate_args(
......@@ -600,15 +612,12 @@ class HFLM(TemplateLM):
)
if not autogptq and not gptqmodel:
if model_kwargs.get("load_in_4bit", None):
assert transformers.__version__ >= "4.30.0", (
if model_kwargs.get("load_in_4bit"):
assert vparse(transformers.__version__) >= vparse("4.30.0"), (
"load_in_4bit requires transformers >= 4.30.0"
)
if transformers.__version__ >= "4.30.0" and (
model_kwargs.get("load_in_4bit")
and (compute_dtype := model_kwargs.get("bnb_4bit_compute_dtype"))
):
model_kwargs["bnb_4bit_compute_dtype"] = get_dtype(compute_dtype)
if compute_dtype := model_kwargs.get("bnb_4bit_compute_dtype"):
model_kwargs["bnb_4bit_compute_dtype"] = get_dtype(compute_dtype)
self._model = self.AUTO_MODEL_CLASS.from_pretrained(
pretrained,
......@@ -666,9 +675,9 @@ class HFLM(TemplateLM):
if peft:
from peft import PeftModel, __version__ as PEFT_VERSION
if model_kwargs.get("load_in_4bit") and version.parse(
PEFT_VERSION
) < version.parse("0.4.0"):
if model_kwargs.get("load_in_4bit") and vparse(PEFT_VERSION) < vparse(
"0.4.0"
):
raise AssertionError("load_in_4bit requires peft >= 0.4.0")
if self._model.config.vocab_size != len(self.tokenizer):
# resize model for LoRAs with added tokens
......@@ -694,10 +703,10 @@ class HFLM(TemplateLM):
for name, param in self._model.state_dict().items():
try:
param.data += _model_delta.state_dict()[name]
except KeyError:
except KeyError as e:
raise KeyError(
f"Delta model is missing weights for layer: {name}"
) from None
) from e
except Exception as e:
raise RuntimeError(
f"Failed to add delta weights to layer {name}. Error: {e}"
......@@ -705,8 +714,6 @@ class HFLM(TemplateLM):
del _model_delta
return None
def _create_tokenizer(
self,
pretrained: str | transformers.PreTrainedModel,
......@@ -721,8 +728,7 @@ class HFLM(TemplateLM):
add_bos_token: bool | None = False,
subfolder: str | None = "",
) -> None:
"""
Helper method during initialization.
"""Helper method during initialization.
Create a tokenizer object corresponding to the correct
tokenizer for value of `pretrained`, or use the pre-initialized tokenizer passed.
......@@ -768,9 +774,8 @@ class HFLM(TemplateLM):
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
model_name, **kwargs
)
return None
def _detect_batch_size(self, requests=None, pos: int = 0):
def _detect_batch_size(self, requests: Sequence | None = None, pos: int = 0):
if requests:
_, context_enc, continuation_enc = requests[pos]
max_length = len(
......@@ -785,7 +790,7 @@ class HFLM(TemplateLM):
# if OOM, then halves batch_size and tries again
@find_executable_batch_size(starting_batch_size=self.max_batch_size)
def forward_batch(batch_size):
def forward_batch(batch_size: int):
if self.backend == "seq2seq":
length = max(max_context_enc, max_cont_enc)
batched_conts = torch.ones(
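For context, `find_executable_batch_size` is the accelerate utility doing the work here: it re-invokes the wrapped function, halving `batch_size` on CUDA OOM until a size fits. A minimal, hedged usage sketch outside the harness:

```python
# Hedged sketch of the auto-batch-size probe; in the harness the inner
# function runs a dummy forward pass, here it simply returns the size.
from accelerate import find_executable_batch_size


def detect_batch_size(max_batch_size: int = 64) -> int:
    @find_executable_batch_size(starting_batch_size=max_batch_size)
    def forward_batch(batch_size: int) -> int:
        return batch_size

    return forward_batch()


print(detect_batch_size())  # 64 on a machine with enough memory
```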
......@@ -832,7 +837,10 @@ class HFLM(TemplateLM):
return batch_size
def tok_encode(
self, string: str, left_truncate_len=None, add_special_tokens=None
self,
string: str,
left_truncate_len: int | None = None,
add_special_tokens: bool | None = None,
) -> list[int]:
""" """
# default for None - empty dict, use predefined tokenizer param
......@@ -861,7 +869,7 @@ class HFLM(TemplateLM):
self,
strings: list[str],
padding_side: str = "left",
left_truncate_len: int = None,
left_truncate_len: int | None = None,
truncation: bool = False,
) -> tuple[torch.Tensor, torch.Tensor]:
# encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
......@@ -882,7 +890,7 @@ class HFLM(TemplateLM):
if left_truncate_len:
original_lengths = encoding["input_ids"].size(1)
if original_lengths > left_truncate_len:
eval_logger.warn(
eval_logger.warning(
f"Left truncation applied. Original sequence length was {original_lengths}, "
f"truncating to last {left_truncate_len} tokens. Some content will be lost.",
)
......@@ -894,11 +902,17 @@ class HFLM(TemplateLM):
return encoding["input_ids"], encoding["attention_mask"]
def tok_decode(self, tokens, skip_special_tokens=True):
def tok_decode(self, tokens: Iterator[list[str]], skip_special_tokens: bool = True):
return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens)
def _model_call(self, inps, attn_mask=None, labels=None):
def _model_call(
self,
inps: torch.Tensor,
attn_mask: torch.Tensor | None = None,
labels: torch.Tensor | None = None,
) -> torch.Tensor:
"""
:param inps: torch.Tensor
A torch tensor of shape [batch, (sequence_ctx + sequence_cont)] or of shape
[batch, sequence_ctx]. the size of sequence may vary from call to call
......@@ -926,14 +940,20 @@ class HFLM(TemplateLM):
return self.model(
input_ids=inps, attention_mask=attn_mask, labels=labels
).logits
else:
assert self.AUTO_MODEL_CLASS in (
transformers.AutoModelForCausalLM,
transformers.AutoModelForVision2Seq,
)
return self.model(inps).logits
def _model_generate(self, context, max_length, stop, **generation_kwargs):
def _model_generate(
self,
context,
max_length: int,
stop: list[str],
**generation_kwargs: dict[str, Any],
) -> torch.Tensor:
# temperature = 0.0 if not set
# if do_sample is false and temp==0.0:
# remove temperature, as do_sample=False takes care of this
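The comments above describe how sampling arguments are normalized before generation; a hedged sketch of that normalization as a standalone function (hypothetical name, not the committed code):

```python
# Hypothetical helper mirroring the comments above; not the committed code.
def normalize_gen_kwargs(generation_kwargs: dict) -> dict:
    generation_kwargs.setdefault("temperature", 0.0)  # temperature = 0.0 if not set
    if (
        generation_kwargs.get("do_sample") is False
        and generation_kwargs["temperature"] == 0.0
    ):
        # do_sample=False already selects greedy decoding, so temperature is redundant.
        generation_kwargs.pop("temperature")
    return generation_kwargs


normalize_gen_kwargs({"do_sample": False})  # -> {"do_sample": False}
```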
......@@ -966,7 +986,10 @@ class HFLM(TemplateLM):
)
def _select_cont_toks(
self, logits: torch.Tensor, contlen: int = None, inplen: int = None
self,
logits: torch.Tensor,
contlen: int | None = None,
inplen: int | None = None,
) -> torch.Tensor:
if self.backend == "causal":
assert contlen and inplen, (
......@@ -1092,13 +1115,13 @@ class HFLM(TemplateLM):
self,
requests: list[tuple[tuple[str, str], list[int], list[int]]],
disable_tqdm: bool = False,
override_bs: int = None,
override_bs: int | None = None,
) -> list[tuple[float, bool]]:
# TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
res = []
def _collate(req: tuple[tuple[str, str], list[int], list[int]]):
"""Defines the key for the sorted method"""
"""Defines the key for the sorted method."""
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
......@@ -1110,7 +1133,7 @@ class HFLM(TemplateLM):
return -len(toks), tuple(toks)
def _lookup_one_token_cont(req: tuple[tuple[str, str], list[int], list[int]]):
"""Defines the key to group and lookup one-token continuations"""
"""Defines the key to group and lookup one-token continuations."""
# Use with group_by="contexts" (optional)"
# allows for the creation of a lookup, so we can reuse logits in case of one-token continuations.
# speeds up some multiple-choice tasks proportionally to the number of choices.
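The comments above justify the descending-length sort key: the first element of each batch then bounds its padded length and memory use. A toy illustration with hypothetical requests:

```python
# Toy illustration of the descending-length sort key described above.
requests = [
    (("ctx a", "cont a"), [1, 2, 3], [7]),  # 4 tokens total
    (("ctx b", "cont b"), [4], [8, 9]),     # 3 tokens total
]
requests.sort(key=lambda req: (-len(req[1] + req[2]), tuple(req[1] + req[2])))
# The longest request now comes first, so it sets the batch's padded length.
```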
......@@ -1388,7 +1411,7 @@ class HFLM(TemplateLM):
# add EOS token to stop sequences
until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
else:
raise ValueError(
raise TypeError(
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
)
if "max_gen_toks" in kwargs:
......@@ -1471,15 +1494,14 @@ class HFLM(TemplateLM):
def apply_chat_template(
self, chat_history: list[dict[str, str]], add_generation_prompt: bool = True
) -> str:
"""
Method to apply a chat template to a list of chat history between user and model.
"""
"""Method to apply a chat template to a list of chat history between user and model."""
try:
chat_templated = self.tokenizer.apply_chat_template(
chat_history,
tokenize=False,
add_generation_prompt=add_generation_prompt,
continue_final_message=not add_generation_prompt,
**self.chat_template_args,
)
except jinja2.exceptions.TemplateError:
eval_logger.warning(
......@@ -1491,14 +1513,13 @@ class HFLM(TemplateLM):
tokenize=False,
add_generation_prompt=add_generation_prompt,
continue_final_message=not add_generation_prompt,
**self.chat_template_args,
)
return chat_templated
def get_model_info(self) -> dict:
"""
Method to get Hugging Face model information for experiment reproducibility.
"""
"""Method to get Hugging Face model information for experiment reproducibility."""
def get_model_num_params(model) -> int:
if hasattr(model, "num_parameters"):
......
......@@ -133,11 +133,11 @@ class VLLM(TemplateLM):
max_model_len: int | None = None,
seed: int = 1234,
gpu_memory_utilization: float = 0.9,
device: str = "cuda",
data_parallel_size: int = 1,
lora_local_path: str | None = None,
# VLLM: enable thinking tags in the prompt.
enable_thinking: bool = True,
chat_template_args: dict | None = None,
# End marker for thinking tags - splits to get response after this token (if provided).
think_end_token: str | None = None,
max_lora_rank: int = 16,
......@@ -154,6 +154,7 @@ class VLLM(TemplateLM):
assert max_length is None or max_model_len is None, (
"Either max_length or max_model_len may be provided, but not both"
)
kwargs.pop("device", None)
self.think_end_token = think_end_token
self.V1 = os.environ.get("VLLM_USE_V1", "1") != "0"
self._max_length = max_model_len if max_model_len is not None else max_length
......@@ -174,7 +175,6 @@ class VLLM(TemplateLM):
"swap_space": int(swap_space),
"quantization": quantization,
"seed": int(seed),
"device": str(device),
"enable_lora": bool(lora_local_path),
"max_lora_rank": int(max_lora_rank),
}
......@@ -211,7 +211,10 @@ class VLLM(TemplateLM):
add_bos_token=add_bos_token,
)
self.tokenizer = configure_pad_token(self.tokenizer, model_config=self._config)
self.enable_thinking = enable_thinking
self.chat_template_args = chat_template_args or {}
self.enable_thinking = self.chat_template_args.pop(
"enable_thinking", enable_thinking
)
self.add_bos_token = add_bos_token
if "gemma" in pretrained.lower():
self.add_bos_token = True
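As merged, a key inside `chat_template_args` overrides the standalone `enable_thinking` flag, because the flag is only the fallback for the `pop`. For example (hypothetical values):

```python
# Hypothetical values showing the precedence of chat_template_args over the flag.
chat_template_args = {"enable_thinking": False, "add_generation_prompt": True}
enable_thinking = True                                   # constructor default
args = chat_template_args or {}
resolved = args.pop("enable_thinking", enable_thinking)  # -> False (dict value wins)
# args now holds the remaining template kwargs: {"add_generation_prompt": True}
```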
......@@ -319,6 +322,7 @@ class VLLM(TemplateLM):
continue_final_message=not add_generation_prompt,
chat_template=self.hf_chat_template,
enable_thinking=self.enable_thinking,
**self.chat_template_args,
)
except jinja2.exceptions.TemplateError:
eval_logger.warning(
......@@ -331,6 +335,7 @@ class VLLM(TemplateLM):
continue_final_message=not add_generation_prompt,
chat_template=self.hf_chat_template,
enable_thinking=self.enable_thinking,
**self.chat_template_args,
)
return chat_templated
......
......@@ -85,6 +85,7 @@
| [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese |
| [leaderboard](leaderboard/README.md) | Task group used by Hugging Face's [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). Those tasks are static and will not change through time | English |
| [lingoly](lingoly/README.md) | Challenging logical reasoning benchmark in low-resource languages with controls for memorization | English, Multilingual |
| [libra](libra/README.md) | Evaluates long-context understanding in Russian across four complexity levels | Russian (MT) |
| [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese |
| [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese |
| [mastermind](mastermind/README.md) | Reasoning benchmark based on the board game of Mastermind. | English |
......
......@@ -4,9 +4,9 @@ include: _boolq_cot_2shot_yaml
fewshot_config:
sampler: first_n
samples:
- context: 'This is a ferry domain, where the task is to transport cars from their start to their goal locations, using a ferry. Each location is accessible by ferry from each other location. The cars can be debarked or boarded, and the ferry can carry only one car at a time. There are 2 locations and 5 cars, numbered consecutively. Currently, the ferry is at l1, with the car c4 on board. The cars are at locations as follows: c0 and c3 are at l1; c1 and c2 are at l0.'
question: 'Is it possible to transition to a state where the action "travel by sea from location l0 to location l1" can be applied?'
answer: "Let's think step by step. Step 1: Verify if there is a sequence of actions which transforms the current state into a state where the precondition of the action \"travel by sea from location l0 to location l1\" hold. Step 2: The following sequence of actions would transition to such a state: sail from location l1 to location l0, unload the car c4 from the ferry to location l0, board car c1 at location l0. **Final Answer**: Yes."
- context: 'There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. There are 2 trucks and 1 airplane, as well as 4 packages. There are 6 locations across 2 cities. The locations are in cities as follows: l0-0, l0-1, and l0-2 are in c0; l1-1, l1-2, and l1-0 are in c1. Currently, a0 is at l1-0, t1 is at l1-1, t0 is at l0-0, p2 and p1 are in t1, p0 and p3 are in a0.'
question: 'Is it possible to transition to a state where the action "offload the object p0 from the truck p0 at location p1" can be applied?'
answer: "Let's think step by step. Step 1: Verify if there is a sequence of actions which transforms the current state into a state where the precondition of the action \"offload the object p0 from the truck p0 at location p1\" hold. Step 2: Action preconditions are \"p0 is in p0 and p0 is at p1\". Step 3: These facts are not reachable together, as they include mutually exclusive facts \"p0 is in p0 and p0 is at p1\". **Final Answer**: No."
- context: "This is a ferry domain, where the task is to transport cars from their start to their goal locations, using a ferry. Each location is accessible by ferry from each other location. The cars can be debarked or boarded, and the ferry can carry only one car at a time. There are 2 locations and 5 cars, numbered consecutively. Currently, the ferry is at l1, with the car c4 on board. The cars are at locations as follows: c0 and c3 are at l1; c1 and c2 are at l0."
question: 'Is it possible to transition to a state where the action "travel by sea from location l0 to location l1" can be applied?'
answer: "Let's think step by step. Step 1: Verify if there is a sequence of actions which transforms the current state into a state where the precondition of the action \"travel by sea from location l0 to location l1\" hold. Step 2: The following sequence of actions would transition to such a state: sail from location l1 to location l0, unload the car c4 from the ferry to location l0, board car c1 at location l0. **Final Answer**: Yes."
- context: "There are several cities, each containing several locations, some of which are airports. There are also trucks, which can drive within a single city, and airplanes, which can fly between airports. The goal is to get some packages from various locations to various new locations. There are 2 trucks and 1 airplane, as well as 4 packages. There are 6 locations across 2 cities. The locations are in cities as follows: l0-0, l0-1, and l0-2 are in c0; l1-1, l1-2, and l1-0 are in c1. Currently, a0 is at l1-0, t1 is at l1-1, t0 is at l0-0, p2 and p1 are in t1, p0 and p3 are in a0."
question: 'Is it possible to transition to a state where the action "offload the object p0 from the truck p0 at location p1" can be applied?'
answer: "Let's think step by step. Step 1: Verify if there is a sequence of actions which transforms the current state into a state where the precondition of the action \"offload the object p0 from the truck p0 at location p1\" hold. Step 2: Action preconditions are \"p0 is in p0 and p0 is at p1\". Step 3: These facts are not reachable together, as they include mutually exclusive facts \"p0 is in p0 and p0 is at p1\". **Final Answer**: No."
......@@ -67,7 +67,7 @@ def span_f1_agg(items):
def remove_blank_spaces(text):
text = re.sub(pattern=get_blank_spaces_pattern(), repl="", string=text)
text = re.sub("\s+", " ", text)
text = re.sub(r"\s+", " ", text)
return text
def remove_punctuation(text):
......
......@@ -67,7 +67,7 @@ def span_f1_agg(items):
def remove_blank_spaces(text):
text = re.sub(pattern=get_blank_spaces_pattern(), repl="", string=text)
text = re.sub("\s+", " ", text)
text = re.sub(r"\s+", " ", text)
return text
def remove_punctuation(text):
......
......@@ -67,7 +67,7 @@ def span_f1_agg(items):
def remove_blank_spaces(text):
text = re.sub(pattern=get_blank_spaces_pattern(), repl="", string=text)
text = re.sub("\s+", " ", text)
text = re.sub(r"\s+", " ", text)
return text
def remove_punctuation(text):
......
......@@ -67,7 +67,7 @@ def span_f1_agg(items):
def remove_blank_spaces(text):
text = re.sub(pattern=get_blank_spaces_pattern(), repl="", string=text)
text = re.sub("\s+", " ", text)
text = re.sub(r"\s+", " ", text)
return text
def remove_punctuation(text):
......
......@@ -67,7 +67,7 @@ def span_f1_agg(items):
def remove_blank_spaces(text):
text = re.sub(pattern=get_blank_spaces_pattern(), repl="", string=text)
text = re.sub("\s+", " ", text)
text = re.sub(r"\s+", " ", text)
return text
def remove_punctuation(text):
......
......@@ -12,9 +12,9 @@ def prompt_func(mode, lang):
"prompt_3": f"You are an assistant able to classify topics in texts. \n\n"
f"Given the categories technology, religion, politics, sports, health, entertainment, or business; what is "
f"the topic of the {lang} statement below? Return only the category. "
"\n\ntext: {{headline}} \category:\n\n",
"\n\ntext: {{headline}} \\category:\n\n",
"prompt_4": "Label the following text as technology, religion, politics, sports, health, entertainment, or geography. Provide only the category as your "
"response. \n\ntext: {{headline}} \category: \n\n",
"response. \n\ntext: {{headline}} \\category: \n\n",
"prompt_5": f"You are tasked with performing topic classification on the following {lang} text. "
f"For each input, classify the topic as technology, business, politics, sports, health, entertainment, or religion. "
f"Use the following guidelines: \n\n "
......@@ -27,7 +27,7 @@ def prompt_func(mode, lang):
f"business: The text covers economy, business, or related topics. \n\n"
f"If the text contains multiple topics, choose the dominant topic. "
f"For ambiguous or unclear topics, select the category that best reflects the overall content. "
"Please provide a single classification for each input.\n\ntext: {{headline}} \category: \n\n",
"Please provide a single classification for each input.\n\ntext: {{headline}} \\category: \n\n",
}
return prompt_map[mode]
......
......@@ -17,9 +17,9 @@ def prompt_func(mode, lang):
"prompt_3": f"You are an assistant able to classify topics in texts. \n\n"
f"Given the categories science/technology, travel, politics, sports, health, entertainment, or geography; what is "
f"the topic of the {lang} statement below? Return only the category. "
"\n\ntext: {{text}} \category:\n\n",
"\n\ntext: {{text}} \\category:\n\n",
"prompt_4": "Label the following text as science/technology, travel, politics, sports, health, entertainment, or geography. Provide only the category as your "
"response. \n\ntext: {{text}} \category: \n\n",
"response. \n\ntext: {{text}} \\category: \n\n",
"prompt_5": f"You are tasked with performing topic classification on the following {lang} text. "
f"For each input, classify the topic as science/technology, travel, politics, sports, health, entertainment, or geography. "
f"Use the following guidelines: \n\n "
......@@ -32,7 +32,7 @@ def prompt_func(mode, lang):
f"geography: The text involves geographical information, locations, or related topics. \n\n"
f"If the text contains multiple topics, choose the dominant topic. "
f"For ambiguous or unclear topics, select the category that best reflects the overall content. "
"Please provide a single classification for each input.\n\ntext: {{text}} \category: \n\n",
"Please provide a single classification for each input.\n\ntext: {{text}} \\category: \n\n",
}
return prompt_map[mode]
......
......@@ -4,8 +4,6 @@ tag:
task: null
dataset_path: csebuetnlp/xlsum
dataset_name: null
dataset_kwargs:
trust_remote_code: true
output_type: generate_until
generation_kwargs:
until:
......
......@@ -4,8 +4,6 @@ tag:
task: null
dataset_path: csebuetnlp/xlsum
dataset_name: null
dataset_kwargs:
trust_remote_code: true
output_type: generate_until
generation_kwargs:
until:
......
......@@ -4,8 +4,6 @@ tag:
task: null
dataset_path: csebuetnlp/xlsum
dataset_name: null
dataset_kwargs:
trust_remote_code: true
output_type: generate_until
generation_kwargs:
until:
......
......@@ -47,7 +47,7 @@ def parse_math_answer(raw_string):
return retval
def get_answer_with_dollar_sign(s):
first_pattern = "\$(.*)\$"
first_pattern = r"\$(.*)\$"
last_match = None
matches = re.findall(first_pattern, s)
if matches:
......@@ -63,7 +63,7 @@ def parse_math_answer(raw_string):
if "\\n" in last_match:
last_match = last_match.split("\\n")[0]
else:
pattern = "(?:\\$)?\d+(?:\.\d+)?(?![\w\d])"
pattern = "(?:\\$)?\\d+(?:\\.\\d+)?(?![\\w\\d])"
matches = re.findall(pattern, s)
if matches:
last_match = matches[-1]
......@@ -186,7 +186,7 @@ def _strip_string(string):
# remove percentage
string = string.replace("\\%", "")
string = string.replace("\%", "")
string = string.replace(r"\%", "")
# " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
string = string.replace(" .", " 0.")
......
......@@ -15,5 +15,3 @@ metric_list:
higher_is_better: true
metadata:
version: 2.0
dataset_kwargs:
trust_remote_code: true