Commit bf11ac93 authored by Baber

Merge branch 'main' into llama

parents 83b1c564 ade01428
import copy
import logging
from typing import Dict, List, Optional, Tuple, Union
import torch
......@@ -7,7 +8,6 @@ import transformers
from tqdm import tqdm
from transformers import BatchEncoding
from lm_eval import utils
from lm_eval.api.instance import Instance
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
......@@ -24,7 +24,7 @@ from lm_eval.models.utils import (
DEFAULT_IMAGE_PLACEHOLDER = "<image>"
-eval_logger = utils.eval_logger
+eval_logger = logging.getLogger(__name__)
@register_model("hf-multimodal")
......
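Across these files the commit swaps the shared `utils.eval_logger` for a module-level logger. A minimal sketch of the pattern using only stdlib `logging` (the helper function below is illustrative, not from the diff):

```python
import logging

# A logger named after the module (e.g. "lm_eval.models.hf_vlms") lets callers
# raise or lower verbosity per component instead of going through one shared logger.
eval_logger = logging.getLogger(__name__)

def load_model(pretrained: str) -> None:
    # Illustrative helper: real call sites in the harness log progress the same way.
    eval_logger.info("Loading model %s", pretrained)

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    load_model("demo-model")
```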
import copy
import logging
import os
from datetime import timedelta
from pathlib import Path
......@@ -39,7 +40,7 @@ from lm_eval.models.utils import (
)
-eval_logger = utils.eval_logger
+eval_logger = logging.getLogger(__name__)
@register_model("hf-auto", "hf", "huggingface")
......
import copy
import json
import logging
import os
from functools import lru_cache
from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Type, cast
......@@ -10,7 +11,10 @@ from lm_eval.api.instance import Instance
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.api_models import JsonChatStr
-from lm_eval.utils import eval_logger, simple_parse_args_string
+from lm_eval.utils import simple_parse_args_string
+eval_logger = logging.getLogger(__name__)
class LogLikelihoodResult(NamedTuple):
......
......@@ -13,6 +13,7 @@
# limitations under the License.
import importlib
import logging
import pathlib
from copy import deepcopy
from typing import List, Literal
......@@ -27,13 +28,15 @@ from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import Collator
from lm_eval.utils import (
-eval_logger,
get_rolling_token_windows,
make_disjoint_window,
simple_parse_args_string,
)
+eval_logger = logging.getLogger(__name__)
def _patch_pretrained_cfg(
pretrained_cfg, trainer, tensor_model_parallel_size, pipeline_model_parallel_size
):
......
import copy
import logging
from typing import List, Optional, Tuple, Union
import numpy
......@@ -13,7 +14,7 @@ from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
-eval_logger = utils.eval_logger
+eval_logger = logging.getLogger(__name__)
@register_model("sparseml")
......
import logging
import os
from functools import cached_property
from operator import itemgetter
......@@ -6,7 +7,9 @@ from typing import Any, Dict, List, Optional, Tuple, Union
from lm_eval.api.registry import register_model
from lm_eval.models.api_models import TemplateAPI
from lm_eval.models.utils import handle_stop_sequences
-from lm_eval.utils import eval_logger
+eval_logger = logging.getLogger(__name__)
@register_model("local-completions")
......@@ -288,4 +291,6 @@ class OpenAIChatCompletion(LocalChatCompletion):
if "o1" in self.model:
output.pop("stop")
output["temperature"] = 1
elif "o3" in self.model:
output.pop("temperature")
return output
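For context, a standalone sketch of the sanitization above (the function name and example dict are illustrative; in the diff this logic lives in `OpenAIChatCompletion`):

```python
def _sanitize(model: str, output: dict) -> dict:
    # o1 models reject custom stop sequences and pin temperature to 1.
    if "o1" in model:
        output.pop("stop", None)
        output["temperature"] = 1
    # o3 models reject an explicit temperature parameter entirely.
    elif "o3" in model:
        output.pop("temperature", None)
    return output

# {"stop": ["\n"], "temperature": 0} -> {"temperature": 1}
print(_sanitize("o1-preview", {"stop": ["\n"], "temperature": 0}))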
import logging
from importlib.util import find_spec
from lm_eval import utils
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
from lm_eval.models.utils import get_dtype
-eval_logger = utils.eval_logger
+eval_logger = logging.getLogger(__name__)
@register_model("ipex")
......
import json
import logging
from importlib.util import find_spec
from pathlib import Path
from lm_eval import utils
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
-eval_logger = utils.eval_logger
+eval_logger = logging.getLogger(__name__)
@register_model("openvino")
......
......@@ -2,6 +2,7 @@ import collections
import fnmatch
import gc
import itertools
import logging
import time
from functools import wraps
from typing import (
......@@ -22,7 +23,8 @@ from typing import (
import torch
import transformers
-from lm_eval.utils import eval_logger
+eval_logger = logging.getLogger(__name__)
if TYPE_CHECKING:
......
import copy
import logging
from importlib.metadata import version
from importlib.util import find_spec
from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
......@@ -17,7 +18,6 @@ from lm_eval.models.utils import (
undistribute,
)
from lm_eval.utils import (
-eval_logger,
get_rolling_token_windows,
make_disjoint_window,
)
......@@ -34,7 +34,7 @@ except ModuleNotFoundError:
if TYPE_CHECKING:
pass
-eval_logger = eval_logger
+eval_logger = logging.getLogger(__name__)
@register_model("vllm")
......@@ -75,7 +75,6 @@ class VLLM(TemplateLM):
"Please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`"
)
assert "cuda" in device or device is None, "vLLM only supports CUDA"
assert max_length is None or max_model_len is None, (
"Either max_length or max_model_len may be provided, but not both"
)
......@@ -110,7 +109,7 @@ class VLLM(TemplateLM):
eval_logger.warning(
"You might experience occasional issues with model weight downloading when data_parallel is in use. To ensure stable performance, run with data_parallel_size=1 until the weights are downloaded and cached."
)
self.model_args["worker_use_ray"] = True
self.model_args["distributed_executor_backend"] = "ray"
self.batch_size = "auto"
eval_logger.info("Manual batching is not compatible with data parallelism.")
......@@ -244,15 +243,13 @@ class VLLM(TemplateLM):
temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False
)
if self.data_parallel_size > 1:
-# vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
+# vLLM hangs if resources are set in ray.remote
# also seems to only work with decorator and not with ray.remote() fn
# see https://github.com/vllm-project/vllm/issues/973
-# note: this has changed on 0.3.3, and it only works now if num_gpus are set.
-# but then tensor_parallel breaks
@ray.remote
def run_inference_one_model(
model_args: dict,
-sampling_params,
+sampling_params: SamplingParams,
requests: List[List[int]],
lora_request: LoRARequest,
):
......
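The comments above (and the identical block in `VLLM_VLM` below) encode a practical constraint: the remote worker must be declared with the `@ray.remote` decorator and without resource pinning, or vLLM hangs (see vllm-project/vllm#973). A toy sketch of that fan-out shape, with the vLLM engine replaced by a stub:

```python
import ray

ray.init(ignore_reinit_error=True)

# Decorator form, no explicit resource request -- mirroring the constraint
# noted in the diff. The stub stands in for building a vLLM engine from
# model_args and generating over one shard of requests.
@ray.remote
def run_inference_one_model(model_args: dict, requests: list) -> list:
    return [len(r) for r in requests]

shards = [[[1, 2, 3], [4]], [[5, 6]]]  # token-id lists split across workers
futures = [run_inference_one_model.remote({"pretrained": "demo"}, s) for s in shards]
print(ray.get(futures))  # [[3, 1], [2]] -- results gathered from all workers
ray.shutdown()
```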
import copy
import logging
from typing import Dict, List, Optional
import transformers
......@@ -14,7 +15,9 @@ from lm_eval.models.utils import (
undistribute,
)
from lm_eval.models.vllm_causallms import VLLM
-from lm_eval.utils import eval_logger
+eval_logger = logging.getLogger(__name__)
try:
......@@ -106,11 +109,9 @@ class VLLM_VLM(VLLM):
temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False
)
if self.data_parallel_size > 1:
-# vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
+# vLLM hangs if resources are set in ray.remote
# also seems to only work with decorator and not with ray.remote() fn
# see https://github.com/vllm-project/vllm/issues/973
-# note: this has changed on 0.3.3, and it only works now if num_gpus are set.
-# but then tensor_parallel breaks
@ray.remote
def run_inference_one_model(
model_args: dict, sampling_params, requests: List[List[dict]]
......
import ast
import logging
import os
from typing import Dict
from lm_eval import utils
-from lm_eval.utils import eval_logger
+eval_logger = logging.getLogger(__name__)
# Prompt library.
# Stores prompts in a dictionary indexed by 2 levels:
# prompt category name, and prompt name.
......
......@@ -13,6 +13,8 @@ from lm_eval.evaluator_utils import get_subtask_list
GROUP_ONLY_KEYS = list(GroupConfig().to_dict().keys())
+eval_logger = logging.getLogger(__name__)
class TaskManager:
"""TaskManager indexes all tasks from the default `lm_eval/tasks/`
......@@ -22,14 +24,13 @@ class TaskManager:
def __init__(
self,
verbosity="INFO",
verbosity: Optional[str] = None,
include_path: Optional[Union[str, List]] = None,
include_defaults: bool = True,
) -> None:
-self.verbosity = verbosity
+if verbosity is not None:
+    utils.setup_logging(verbosity)
self.include_path = include_path
-self.logger = utils.eval_logger
-self.logger.setLevel(getattr(logging, f"{verbosity}"))
self._task_index = self.initialize_tasks(
include_path=include_path, include_defaults=include_defaults
......@@ -456,7 +457,7 @@ class TaskManager:
"yaml_path": -1,
}
elif tasks_and_groups[tag]["type"] != "tag":
-self.logger.info(
+eval_logger.info(
f"The tag '{tag}' is already registered as a group, this tag will not be registered. "
"This may affect tasks you want to call."
)
......@@ -519,7 +520,7 @@ class TaskManager:
config, task, tasks_and_groups, print_info
)
else:
-self.logger.debug(f"File {f} in {root} could not be loaded")
+eval_logger.debug(f"File {f} in {root} could not be loaded")
return tasks_and_groups
......
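The net effect of the `__init__` change: logging configuration becomes opt-in rather than being forced onto a shared logger at construction. A hedged sketch, assuming `utils.setup_logging` simply installs handlers at the requested level (the stand-in below is illustrative, not the harness implementation):

```python
import logging
from typing import Optional

def setup_logging(verbosity: str = "INFO") -> None:
    # Illustrative stand-in for lm_eval.utils.setup_logging.
    logging.basicConfig(level=getattr(logging, verbosity.upper(), logging.INFO))

class TaskManager:
    def __init__(self, verbosity: Optional[str] = None) -> None:
        # Only touch global logging state when the caller asks for it;
        # TaskManager() leaves the host application's config alone.
        if verbosity is not None:
            setup_logging(verbosity)

TaskManager()          # no logging side effects
TaskManager("DEBUG")   # opts in to DEBUG output
```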
......@@ -10,7 +10,7 @@ import yaml
from tqdm import tqdm
eval_logger = logging.getLogger("lm-eval")
eval_logger = logging.getLogger(__name__)
SUBJECTS = {
......
......@@ -58,3 +58,6 @@ If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
+### Changelog
+version 2.0: (2025-Feb-14) set target delimiter to "" as the targets already start with a space.
......@@ -8,11 +8,12 @@ validation_split: validation
test_split: null
doc_to_text: "{{context}}"
doc_to_target: "{{completion}}"
+target_delimiter: ""
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
-  version: 1.0
+  version: 2.0
dataset_kwargs:
trust_remote_code: true
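Why version 2.0 sets `target_delimiter: ""`: the scored string is roughly `doc_to_text + target_delimiter + doc_to_target`, and these completions already begin with a space, so the default single-space delimiter doubled it. A simplified illustration (the concatenation rule here abstracts over the harness internals):

```python
context = "The cat sat on the"
completion = " mat."  # target text already carries its leading space

v1 = context + " " + completion  # old delimiter: doubled space
v2 = context + "" + completion   # new delimiter: clean join
print(repr(v1))  # 'The cat sat on the  mat.'
print(repr(v2))  # 'The cat sat on the mat.'
```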
......@@ -5,14 +5,16 @@
BasqueBench is a benchmark for evaluating language models on Basque tasks. That is, it evaluates the ability of a language model to understand and generate Basque text. BasqueBench offers a combination of pre-existing, open datasets and datasets developed exclusively for this benchmark. All the details of BasqueBench will be published in a paper soon.
The new evaluation datasets included in BasqueBench are:
-| Task | Category | Homepage |
-|:-------------:|:-----:|:-----:|
-| MGSM_eu | Math | https://huggingface.co/datasets/HiTZ/MGSM-eu |
-| PIQA_eu | Question Answering | https://huggingface.co/datasets/HiTZ/PIQA-eu |
-| WNLI_eu | Natural Language Inference | https://huggingface.co/datasets/HiTZ/wnli-eu |
-| XCOPA_eu | Commonsense Reasoning | https://huggingface.co/datasets/HiTZ/XCOPA-eu |
+| Task | Category | Homepage |
+|:--------:|:--------------------------:|:---------------------------------------------:|
+| ARC_eu | Question Answering | https://huggingface.co/datasets/HiTZ/ARC-eu |
+| MGSM_eu | Math | https://huggingface.co/datasets/HiTZ/MGSM-eu |
+| PAWS_eu | Paraphrasing | https://huggingface.co/datasets/HiTZ/PAWS-eu |
+| PIQA_eu | Question Answering | https://huggingface.co/datasets/HiTZ/PIQA-eu |
+| WNLI_eu | Natural Language Inference | https://huggingface.co/datasets/HiTZ/WNLI-eu |
+| XCOPA_eu | Commonsense Reasoning | https://huggingface.co/datasets/HiTZ/XCOPA-eu |
-The datasets included in BasqueBench that have been made public in previous pubications are:
+The datasets included in BasqueBench that have been made public in previous publications are:
| Task | Category | Paper title | Homepage |
|:-------------:|:-----:|:-------------:|:-----:|
......@@ -28,7 +30,40 @@ The datasets included in BasqueBench that have been made public in previous pubi
### Citation
Paper for BasqueBench coming soon.
```
@inproceedings{baucells-etal-2025-iberobench,
title = "{I}bero{B}ench: A Benchmark for {LLM} Evaluation in {I}berian Languages",
author = "Baucells, Irene and
Aula-Blasco, Javier and
de-Dios-Flores, Iria and
Paniagua Su{\'a}rez, Silvia and
Perez, Naiara and
Salles, Anna and
Sotelo Docio, Susana and
Falc{\~a}o, J{\'u}lia and
Saiz, Jose Javier and
Sepulveda Torres, Robiert and
Barnes, Jeremy and
Gamallo, Pablo and
Gonzalez-Agirre, Aitor and
Rigau, German and
Villegas, Marta",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.699/",
pages = "10491--10519",
}
```
### Groups and Tasks
......@@ -40,6 +75,8 @@ Paper for BasqueBench coming soon.
#### Tasks
The following tasks evaluate language models on the BasqueBench datasets using various scoring methods.
+- `arc_eu_challenge`
+- `arc_eu_easy`
- `belebele_eus_Latn`
- `eus_exams_eu`
- `eus_proficiency`
......@@ -64,6 +101,7 @@ The following tasks evaluate tasks on BasqueBench dataset using various scoring
- `flores_pt-eu`
- `mgsm_direct_eu`
- `mgsm_native_cot_eu`
+- `paws_eu`
- `piqa_eu`
- `qnlieu`
- `wnli_eu`
......
include: arc_eu_easy.yaml
task: arc_eu_challenge
dataset_name: ARC-Challenge
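This new config leans on the harness's `include:` mechanism: keys in the including file override those pulled in from the base file. A hedged sketch of that merge semantics (requires PyYAML; the base values shown are assumed for illustration, and the real loader in lm_eval is more involved):

```python
import yaml

base = yaml.safe_load("task: arc_eu_easy\ndataset_name: ARC-Easy")
override = yaml.safe_load("task: arc_eu_challenge\ndataset_name: ARC-Challenge")

# Later keys win, so the child config re-targets the Challenge subset while
# inheriting everything else (prompts, metrics) from arc_eu_easy.yaml.
merged = {**base, **override}
print(merged)  # {'task': 'arc_eu_challenge', 'dataset_name': 'ARC-Challenge'}
```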