"tests/L0/git@developer.sourcefind.cn:OpenDAS/apex.git" did not exist on "dcc7b5132ae85b937ac7d186ba8667b340b747cf"
Commit 4eecbabb authored by Baber's avatar Baber
Browse files

Merge branch 'main' into prefill

parents dac8b534 fb963f0f
@@ -6,6 +6,7 @@
*Latest News 📣*
- [2024/09] We are prototyping allowing users of LM Evaluation Harness to create and evaluate on text+image multimodal input, text output tasks, and have just added the `hf-multimodal` and `vllm-vlm` model types and `mmmu` task as a prototype feature. We welcome users to try out this in-progress feature and stress-test it for themselves, and suggest they check out [`lmms-eval`](https://github.com/EvolvingLMMs-Lab/lmms-eval), a wonderful project originally forking off of the lm-evaluation-harness, for a broader range of multimodal tasks, models, and features.
- [2024/07] [API model](docs/API_guide.md) support has been updated and refactored, introducing support for batched and async requests, and making it significantly easier to customize and use for your own purposes. **To run Llama 405B, we recommend using VLLM's OpenAI-compliant API to host the model, and use the `local-completions` model type to evaluate the model.**
- [2024/07] New Open LLM Leaderboard tasks have been added! You can find them under the [leaderboard](lm_eval/tasks/leaderboard/README.md) task group.
...
@@ -299,13 +299,6 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
"When `fewshot_as_multiturn` is selected, `apply_chat_template` must be set (either to `True` or to the chosen template name)."
)
if (
args.num_fewshot is None or args.num_fewshot == 0
) and args.fewshot_as_multiturn:
raise ValueError(
"If fewshot_as_multiturn is set, num_fewshot must be greater than 0."
)
if args.include_path is not None:
eval_logger.info(f"Including path: {args.include_path}")
task_manager = TaskManager(args.verbosity, include_path=args.include_path)
...
@@ -3,7 +3,7 @@ import hashlib
import json
import logging
import os
from typing import Dict, List, Optional, Tuple, Type, TypeVar
from typing import Dict, List, Optional, Tuple, Type, TypeVar, Union
import transformers
from sqlitedict import SqliteDict
@@ -192,15 +192,13 @@ class LM(abc.ABC):
"To use this model with chat templates, please implement the 'tokenizer_name' property."
)
@property
def chat_template(self) -> str:
"""Must be defined for LM subclasses that implement Chat Templating.
Should return the structure of the chat template applied to user/assistant messages.
This is used only to save in the experiment results for reproducibility.
"""
raise NotImplementedError(
"To use this model with chat templates, please implement the 'chat_template' property."
)
def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
"""Returns the chat template structure for user/assistant messages if a template is provided.
This method is intended to be overridden in a subclass to define a specific chat template format.
For models that do not support chat templates, this method returns None by default.
"""
return ""
def set_cache_hook(self, cache_hook) -> None:
self.cache_hook = cache_hook
@@ -316,6 +314,8 @@ class TemplateLM(LM):
and boilerplate often included in other LM subclasses.
"""
tokenizer = None
@property
@abc.abstractmethod
def eot_token_id(self):
@@ -386,3 +386,99 @@ class TemplateLM(LM):
@abc.abstractmethod
def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
pass
def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
"""
Set and get the appropriate chat template for the model.
This method sets the tokenizer's chat_template and returns the template string for reproducibility.
The template selection logic is adapted from the Transformers library's `apply_chat_template`
method in the Tokenizer class. The original implementation can be found at:
https://github.com/huggingface/transformers/blob/fc35907f95459d7a6c5281dfadd680b6f7b620e3/src/transformers/tokenization_utils_base.py#L1687
This method ensures that the right template is chosen based on the following:
0. If the model has no 'tokenizer' attribute: assumes that there is only a single possible chat template, handled on the model provider side internally. Returns the empty string.
1. If the model's tokenizer has multiple templates:
a. Use the specified template if it exists in the dictionary.
b. Use the default template from the list if no specific template is provided.
c. Raise an error if no default template exists and no specific template is provided.
2. If the model's tokenizer has a single template or no template:
a. Use the tokenizer's chat template if available.
b. Fall back to the default chat template if no tokenizer chat template exists.
Args:
chat_template (Union[bool, str]): Specifies the chat template to use.
- If False or None, no template is applied.
- If True, the default or only available template is used.
- If a string, the template with the matching name is used.
Returns:
Optional[str]: The selected chat template, or None if no template is applied.
"""
if self.tokenizer is None:
return ""
if chat_template is False or chat_template is None:
eval_logger.warning(
"model.chat_template was called with the chat_template set to False or None. "
"Therefore no chat template will be applied. Make sure this is an intended behavior."
)
return None
# Convert boolean chat_template to None to ensure compatibility with the adapted logic
if isinstance(chat_template, bool):
chat_template = None
using_default_template = False
# First, handle the cases when the model has a dict of multiple templates
template = self.tokenizer.chat_template or self.tokenizer.default_chat_template
if isinstance(template, dict):
using_default_dict = self.tokenizer.chat_template is None
if chat_template is not None:
if chat_template in template:
selected_template = template[chat_template]
if using_default_dict:
using_default_template = True
else:
raise ValueError(
f"The specified chat template '{chat_template}' is not available. "
f"Available template names are {sorted(template.keys())}."
)
else:
# If user didn't pass a chat template, use the default template from the dict
if "default" in template:
selected_template = template["default"]
using_default_template = True
else:
raise ValueError(
"This model has multiple chat templates with no default specified! Please either pass a chat "
"template or the name of the template you wish to use to the `chat_template` argument. Available "
f"template names are {sorted(template.keys())}."
)
# Cases when the model has a single template or no template
else:
# priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template`
if isinstance(chat_template, str):
eval_logger.warning(
"Chat template name provided, but the tokenizer's chat template is not a dictionary. "
"Using the tokenizer's chat template or the default template instead."
)
if self.tokenizer.chat_template is not None:
selected_template = self.tokenizer.chat_template
else:
selected_template = self.tokenizer.default_chat_template
using_default_template = True
if using_default_template:
eval_logger.warning(
"No chat template is set for this tokenizer, falling back to a default class-level template. This is "
"very error-prone, because models are often trained with templates different from the class default! "
"Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which "
"point any code depending on them will stop working. We recommend setting a valid chat template before "
"then to ensure that this model continues working without issues."
)
return selected_template
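For orientation, here is a minimal, self-contained sketch of the selection order the method above implements. `FakeTokenizer` and `resolve_template` are hypothetical stand-ins for illustration only and are not part of this commit:

```python
from dataclasses import dataclass
from typing import Dict, Optional, Union


@dataclass
class FakeTokenizer:
    # chat_template may be a single Jinja string or a dict of named templates
    chat_template: Union[str, Dict[str, str], None] = None
    default_chat_template: str = "{{ messages }}"  # stand-in for the legacy class-level fallback


def resolve_template(tok: FakeTokenizer, chat_template: Union[bool, str] = False) -> Optional[str]:
    """Mirrors the priority used above: named argument > tokenizer template > legacy default."""
    if chat_template is False or chat_template is None:
        return None
    if isinstance(chat_template, bool):
        chat_template = None
    template = tok.chat_template or tok.default_chat_template
    if isinstance(template, dict):
        if chat_template is not None:
            return template[chat_template]  # named template requested by the caller
        return template["default"]          # otherwise fall back to the dict's default entry
    return template                         # single template string (or legacy default)


tok = FakeTokenizer(chat_template={"default": "D", "tool_use": "T"})
assert resolve_template(tok, True) == "D"        # default entry of the dict
assert resolve_template(tok, "tool_use") == "T"  # named entry
assert resolve_template(tok, False) is None      # no template applied
```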
@@ -75,6 +75,7 @@ class TaskConfig(dict):
process_docs: Optional[Callable] = None
doc_to_text: Optional[Union[Callable, str]] = None
doc_to_target: Optional[Union[Callable, str]] = None
doc_to_image: Union[Callable, str] = None
doc_to_choice: Optional[Union[Callable, str, dict, list]] = None
process_results: Optional[Union[Callable, str]] = None
use_prompt: Optional[str] = None
@@ -378,6 +379,10 @@ class Task(abc.ABC):
def doc_to_target(self, doc):
pass
# not an abstractmethod because not every language-only task has to implement this
def doc_to_image(self, doc):
raise NotImplementedError
def build_all_requests(
self,
*,
@@ -736,6 +741,10 @@ class ConfigurableTask(Task):
)
self.OUTPUT_TYPE = self.config.output_type
if self.config.doc_to_image is not None:
# mark the task as requiring multimodality.
self.MULTIMODAL = True
if self.config.dataset_path is not None:
self.DATASET_PATH = self.config.dataset_path
@@ -1049,8 +1058,8 @@
Whether to apply the chat template to the fewshot context.
:param fewshot_as_multiturn: bool
Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
:param chat_template: Callable
Chat template to be applied to the fewshot context.
:param chat_template:
callable (from lm.apply_chat_template) that takes in a list[Dict] chat transcript and renders it into a string.
:returns: str
The fewshot context.
"""
@@ -1303,9 +1312,34 @@
else:
raise TypeError
def doc_to_image(self, doc: Any, doc_to_image=None) -> Union[int, str, list]:
if doc_to_image is not None:
doc_to_image = doc_to_image
elif self.config.doc_to_image is not None:
doc_to_image = self.config.doc_to_image
else:
return None
if isinstance(doc_to_image, list):
image_feature = [
self.doc_to_image(doc, feature) for feature in doc_to_image
]
return [feature for feature in image_feature if feature is not None]
elif isinstance(doc_to_image, str):
if doc_to_image in self.features:
return doc[doc_to_image]
else:
return ast.literal_eval(utils.apply_template(doc_to_image, doc))
elif callable(doc_to_image):
return doc_to_image(doc)
else:
return None
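# Illustrative note (not part of this commit): in a task config, `doc_to_image` may name a
# dataset feature (e.g. a hypothetical "image" column, returning doc["image"]), list several
# features (returning only the ones that resolve to something non-None), reference a Jinja
# template, or be a callable applied to the doc.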
def construct_requests(
self, doc: dict, ctx: str, **kwargs
) -> Union[List[Instance], Instance]:
aux_arguments = None
if self.OUTPUT_TYPE == "loglikelihood": if self.OUTPUT_TYPE == "loglikelihood":
arguments = (ctx, self.doc_to_target(doc)) arguments = (ctx, self.doc_to_target(doc))
elif self.OUTPUT_TYPE == "loglikelihood_rolling": elif self.OUTPUT_TYPE == "loglikelihood_rolling":
...@@ -1323,6 +1357,37 @@ class ConfigurableTask(Task): ...@@ -1323,6 +1357,37 @@ class ConfigurableTask(Task):
# Otherwise they are placed in the continuation # Otherwise they are placed in the continuation
arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices] arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices]
# TODO: we should raise a warning telling users this will at most ~2x runtime.
if "acc_mutual_info" in self._metric_fn_list.keys():
# if we are calculating multiple choice accuracy
# using mutual information instead of raw loglikelihood as metric, need unconditional lls.
# here mutual info refers to calculating
# log(P(choice|ctx) / P(choice)) = log(P(choice|ctx)) - log(P(choice))
# in other words normalizing by subtracting the unconditional logprob of each choice.
aux_arguments = [("", f"{choice}") for choice in choices]
arguments.extend(aux_arguments)
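# Illustrative note (not part of this commit): with acc_mutual_info the metric scores
# log P(choice | ctx) - log P(choice), so for choices ["yes", "no"] the conditional
# (ctx, choice) requests gain two extra unconditional requests ("", "yes") and ("", "no"),
# which is what roughly doubles runtime.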
elif self.OUTPUT_TYPE == "generate_until":
arguments = (ctx, deepcopy(self.config.generation_kwargs))
multimodal_arg = {}
if (
self.config.doc_to_image
): # TODO: ensure that non-multimodal tasks aren't getting visual args
multimodal_arg = {
**multimodal_arg,
**{"visual": self.doc_to_image(doc)},
}
if bool(multimodal_arg):
if isinstance(arguments, list):
arguments = [arg + (multimodal_arg,) for arg in arguments]
else:
arguments = arguments + (multimodal_arg,)
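# Illustrative note (not part of this commit): for a multimodal generate_until task the
# arguments become (ctx, gen_kwargs, {"visual": <image(s)>}); for multiple_choice, the same
# dict is appended to every (ctx, continuation) tuple.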
if self.OUTPUT_TYPE == "multiple_choice":
request_list = [
Instance(
request_type="loglikelihood",
@@ -1333,33 +1398,15 @@
)
for i, arg in enumerate(arguments)
]
# TODO: we should raise a warning telling users this will at most ~2x runtime.
if "acc_mutual_info" in self._metric_fn_list.keys():
# if we are calculating multiple choice accuracy
# using mutual information instead of raw loglikelihood as metric, need unconditional lls.
# here mutual info refers to calculating
# log(P(choice|ctx) / P(choice)) = log(P(choice|ctx)) - log(P(choice))
# in other words normalizing by subtracting the unconditional logprob of each choice.
request_list.extend(
[
Instance(
request_type="loglikelihood",
doc=doc,
arguments=("", "{}".format(choice)),
idx=i,
**kwargs,
)
for i, choice in enumerate(choices)
]
)
return request_list
elif self.OUTPUT_TYPE == "generate_until":
arguments = (ctx, deepcopy(self.config.generation_kwargs))
return Instance(
request_type=self.OUTPUT_TYPE, doc=doc, arguments=arguments, idx=0, **kwargs
request_type=self.OUTPUT_TYPE,
doc=doc,
arguments=arguments,
idx=0,
**kwargs,
)
def process_results(self, doc, results):
@@ -1571,7 +1618,7 @@ class ConfigurableTask(Task):
f"ConfigurableTask(task_name={getattr(self.config, 'task', None)},"
f"output_type={self.OUTPUT_TYPE},"
f"num_fewshot={getattr(self.config, 'num_fewshot', None)},"
f"num_samples={len(self.eval_docs)})"
f"num_samples={len(self.eval_docs)})",
)
...
@@ -289,18 +289,12 @@ def simple_evaluate(
if check_integrity:
run_task_tests(task_list=tasks)
# hotfix: delete when chat_template fixed
try:
chat = lm.chat_template(apply_chat_template)
except: # noqa: E722
chat = None
if evaluation_tracker is not None:
evaluation_tracker.general_config_tracker.log_experiment_args(
model_source=model,
model_args=model_args,
system_instruction=system_instruction,
chat_template=chat,
chat_template=lm.chat_template(apply_chat_template),
fewshot_as_multiturn=fewshot_as_multiturn,
)
@@ -420,8 +414,28 @@ def evaluate(
for task_output in eval_tasks
):
raise ValueError("log_samples must be True for 'bypass' metric-only tasks")
# validation check: are we running multimodal task <-> non-multimodal model class, or vice-versa.
incompatible_tasks = []
for task_output in eval_tasks:
task: Task = task_output.task
if getattr(lm, "MULTIMODAL", False) != getattr(task, "MULTIMODAL", False):
incompatible_tasks.append(task_output.task_name)
if len(incompatible_tasks) > 0:
if not getattr(lm, "MULTIMODAL", False):
raise ValueError(
f"Attempted to run tasks: {incompatible_tasks} which require multimodal input, but the selected model type does not currently implement this. Multimodal support is currently restricted to the ['hf-multimodal', 'vllm-vlm'] model type."
)
else:
raise ValueError(
f"Attempted to run tasks: {incompatible_tasks} which are text-only, but used a model type which only currently supports multimodal tasks."
)
# end multimodality validation check
for task_output in eval_tasks:
task: Task = task_output.task
limit = get_sample_size(task, limit)
task.build_all_requests(
limit=limit,
...
@@ -3,6 +3,7 @@ from . import (
api_models,
dummy,
gguf,
hf_vlms,
huggingface,
mamba_lm,
nemo_lm,
@@ -12,6 +13,7 @@ from . import (
optimum_lm,
textsynth,
vllm_causallms,
vllm_vlms,
)
...
@@ -104,7 +104,9 @@ class TemplateAPI(TemplateLM):
self._truncate = truncate
self._max_gen_toks = int(max_gen_toks)
self._seed = int(seed)
self.max_length = max_length
# max_length - 1 as we always have 1 token for generation
eval_logger.info(f"Using max length {max_length} - 1")
self.max_length = max_length - 1
if int(num_concurrent) <= 1:
eval_logger.info(
"Concurrent requests are disabled. To enable concurrent requests, set `num_concurrent` > 1."
@@ -223,14 +225,6 @@ class TemplateAPI(TemplateLM):
"""Override this property to return the headers for the API request."""
return {"Authorization": f"Bearer {self.api_key}"}
@property
def chat_template(self) -> str:
"""Must be defined for LM subclasses that implement Chat Templating.
Should return the structure of the chat template applied to user/assistant messages.
Only used for logging and reproducibility.
"""
return ""
@property
def tokenizer_name(self) -> str:
"""Must be defined for LM subclasses which implement Chat Templating.
@@ -417,6 +411,7 @@ class TemplateAPI(TemplateLM):
cache_keys = []
for chunk in chunks:
for cache_key, context_enc, continuation_enc in chunk:
# max_length - 1 as we always have 1 token for generation
inp = (context_enc + continuation_enc)[-(self.max_length) :]
ctxlen = len(context_enc) - max(
0, len(context_enc) + len(continuation_enc) - (self.max_length)
@@ -619,7 +614,8 @@ class TemplateAPI(TemplateLM):
utils.get_rolling_token_windows(
token_list=self.tok_encode(string),
prefix_token=self.prefix_token_id,
max_seq_len=self.max_length,
# max_seq_len - (1 for context)
max_seq_len=self.max_length - 1,
context_len=1,
),
)
...
This diff is collapsed.
@@ -438,98 +438,6 @@ class HFLM(TemplateLM):
def tokenizer_name(self) -> str:
return self.tokenizer.name_or_path.replace("/", "__")
def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
"""
Get the appropriate chat template for the model based on configuration and input.
This method determines, and returns the correct chat template, ensuring reproducibility.
The template selection logic is adapted from the Transformers library's `apply_chat_template`
method in the Tokenizer class. The original implementation can be found at:
https://github.com/huggingface/transformers/blob/fc35907f95459d7a6c5281dfadd680b6f7b620e3/src/transformers/tokenization_utils_base.py#L1687
This method ensures that the right template is chosen based on the following:
1. If the model's tokenizer has multiple templates:
a. Use the specified template if it exists in the dictionary.
b. Use the default template from the list if no specific template is provided.
c. Raise an error if no default template exists and no specific template is provided.
2. If the model's tokenizer has a single template or no template:
a. Use the tokenizer's chat template if available.
b. Fall back to the default chat template if no tokenizer chat template exists.
Args:
chat_template (Union[bool, str]): Specifies the chat template to use.
- If False or None, no template is applied.
- If True, the default or only available template is used.
- If a string, the template with the matching name is used.
Returns:
Optional[str]: The selected chat template, or None if no template is applied.
"""
if chat_template is False or chat_template is None:
eval_logger.warning(
"model.chat_template was called with the chat_template set to False or None. "
"Therefore no chat template will be applied. Make sure this is an intended behavior."
)
return None
# Convert boolean chat_template to None to ensure compatibility with the adapted logic
if isinstance(chat_template, bool):
chat_template = None
using_default_template = False
# First, handle the cases when the model has a dict of multiple templates
template = self.tokenizer.chat_template or self.tokenizer.default_chat_template
if isinstance(template, dict):
using_default_dict = self.tokenizer.chat_template is None
if chat_template is not None:
if chat_template in template:
selected_template = template[chat_template]
if using_default_dict:
using_default_template = True
else:
raise ValueError(
f"The specified chat template '{chat_template}' is not available. "
f"Available template names are {sorted(template.keys())}."
)
else:
# If user didn't pass a chat template, use the default template from the dict
if "default" in template:
selected_template = template["default"]
using_default_template = True
else:
raise ValueError(
"This model has multiple chat templates with no default specified! Please either pass a chat "
"template or the name of the template you wish to use to the `chat_template` argument. Available "
f"template names are {sorted(template.keys())}."
)
# Cases when the model has a single template or no template
else:
# priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template
if isinstance(chat_template, str):
eval_logger.warning(
"Chat template name provided, but the tokenizer's chat template is not a dictionary. "
"Using the tokenizer's chat template or the default template instead."
)
if self.tokenizer.chat_template is not None:
selected_template = self.tokenizer.chat_template
else:
selected_template = self.tokenizer.default_chat_template
using_default_template = True
if using_default_template:
eval_logger.warning(
"No chat template is set for this tokenizer, falling back to a default class-level template. This is "
"very error-prone, because models are often trained with templates different from the class default! "
"Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which "
"point any code depending on them will stop working. We recommend setting a valid chat template before "
"then to ensure that this model continues working without issues."
)
return selected_template
def _get_backend(
self,
config: Union[transformers.PretrainedConfig, transformers.AutoConfig],
@@ -540,7 +448,16 @@ class HFLM(TemplateLM):
Helper method during initialization.
Determines the backend ("causal" (decoder-only) or "seq2seq" (encoder-decoder))
model type to be used.
sets `self.AUTO_MODEL_CLASS` appropriately if not already set.
""" """
# escape hatch: if we're using a subclass that shouldn't follow
# the default _get_backend logic,
# then skip over the method.
# TODO: this seems very much undesirable in some cases--our code in HFLM
# references AutoModelForCausalLM at times to check for equality
if self.AUTO_MODEL_CLASS is not None:
return
assert backend in ["default", "causal", "seq2seq"]
if backend != "default":
...
@@ -29,7 +29,10 @@ class LocalCompletionsAPI(TemplateAPI):
) -> dict:
if generate:
gen_kwargs.pop("do_sample", False)
max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
if "max_tokens" in gen_kwargs:
max_tokens = gen_kwargs.pop("max_tokens")
else:
max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
temperature = gen_kwargs.pop("temperature", 0)
stop = gen_kwargs.pop("until", ["<|endoftext|>"])
return {
@@ -124,7 +127,10 @@ class LocalChatCompletion(LocalCompletionsAPI):
**kwargs,
) -> dict:
gen_kwargs.pop("do_sample", False)
max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
if "max_tokens" in gen_kwargs:
max_tokens = gen_kwargs.pop("max_tokens")
else:
max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
temperature = gen_kwargs.pop("temperature", 0)
stop = gen_kwargs.pop("until", ["<|endoftext|>"])
if not isinstance(stop, (list, tuple)):
@@ -194,6 +200,9 @@ class OpenAICompletionsAPI(LocalCompletionsAPI):
), "Loglikelihood is not supported for gpt-3.5-turbo"
return super().loglikelihood(requests, **kwargs)
def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
return ""
@register_model("openai-chat-completions") @register_model("openai-chat-completions")
class OpenAIChatCompletion(LocalChatCompletion): class OpenAIChatCompletion(LocalChatCompletion):
......
@@ -664,3 +664,37 @@ def configure_pad_token(
tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
return tokenizer
def replace_placeholders(
string: str, default_placeholder: str, image_token: str, max_images: int
):
"""
A utility function used for local multimodal models. It locates all `default_placeholder`
occurrences in the given input `string` and replaces the first `max_images` instances with
`image_token`; any later occurrences are kept as-is (or dropped if the placeholder already equals the image token).
This is used to replace <image> placeholder tags by model-specific image tokens like <|image_pad|>
and to allow for only the first `max_images` images to be passed to a model if desired.
:param string: The original string containing placeholders.
:param default_placeholder: The placeholder text to be replaced.
:param image_token: The token to replace the placeholder with.
:param max_images: The maximum number of replacements to make.
:return: The string with placeholders replaced.
"""
count = 0
result = []
parts = string.split(default_placeholder)
for part in parts[:-1]: # Iterate through all but the last part
result.append(part)
if count < max_images:
result.append(image_token)
count += 1
elif default_placeholder != image_token:
result.append(default_placeholder)
# Add the last part of the string
result.append(parts[-1])
return "".join(result)
@@ -187,12 +187,6 @@ class VLLM(TemplateLM):
chat_history, tokenize=False, add_generation_prompt=True
)
@property
def chat_template(self) -> str:
if self.tokenizer.chat_template is not None:
return self.tokenizer.chat_template
return self.tokenizer.default_chat_template
@property
def tokenizer_name(self) -> str:
return self.tokenizer.name_or_path.replace("/", "__")
@@ -289,7 +283,8 @@ class VLLM(TemplateLM):
make_disjoint_window,
get_rolling_token_windows(
token_list=self.tok_encode(string),
prefix_token=self.eot_token_id,
prefix_token=self.prefix_token_id,
# max_seq_len - (1 for context)
max_seq_len=self.max_length - 1,
context_len=1,
),
...
import copy
from typing import Dict, List, Optional
import transformers
from more_itertools import distribute
from tqdm import tqdm
from lm_eval.api.instance import Instance
from lm_eval.api.registry import register_model
from lm_eval.models.utils import Collator, undistribute
from lm_eval.models.vllm_causallms import VLLM
from lm_eval.utils import simple_parse_args_string
try:
import ray
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest # noqa: F401
from vllm.transformers_utils.tokenizer import get_tokenizer # noqa: F401
except ModuleNotFoundError:
pass
DEFAULT_IMAGE_PLACEHOLDER = "<image>"
@register_model("vllm-vlm")
class VLLM_VLM(VLLM):
MULTIMODAL = True
def __init__(
self,
pretrained: str,
trust_remote_code: Optional[bool] = False,
revision: Optional[str] = None,
interleave: bool = True,
# TODO<baber>: handle max_images and limit_mm_per_prompt better
max_images: int = 999,
limit_mm_per_prompt: str = "image=1",
**kwargs,
):
kwargs["limit_mm_per_prompt"] = simple_parse_args_string(limit_mm_per_prompt)
super().__init__(
pretrained=pretrained,
trust_remote_code=trust_remote_code,
revision=revision,
**kwargs,
)
self.interleave = interleave
self.max_images = max_images
self.processor = transformers.AutoProcessor.from_pretrained(
pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
)
self.chat_applied: bool = False
def tok_batch_multimodal_encode(
self,
strings: List[str], # note that input signature of this fn is different
images, # TODO: typehint on this
left_truncate_len: int = None,
truncation: bool = False,
):
images = [img[: self.max_images] for img in images]
outputs = []
for x, i in zip(strings, images):
inputs = {
"prompt": x,
"multi_modal_data": {"image": i},
}
outputs.append(inputs)
return outputs
def _model_generate(
self,
requests: List[List[dict]] = None,
generate: bool = False,
max_tokens: int = None,
stop: Optional[List[str]] = None,
**kwargs,
):
if generate:
kwargs = self.modify_gen_kwargs(kwargs)
sampling_params = SamplingParams(max_tokens=max_tokens, stop=stop, **kwargs)
else:
sampling_params = SamplingParams(
temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False
)
if self.data_parallel_size > 1:
# vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
# also seems to only work with decorator and not with ray.remote() fn
# see https://github.com/vllm-project/vllm/issues/973
# note: this has changed on 0.3.3, and it only works now if num_gpus are set.
# but then tensor_parallel breaks
@ray.remote
def run_inference_one_model(
model_args: dict, sampling_params, requests: List[List[dict]]
):
llm = LLM(**model_args)
return llm.generate(requests, sampling_params=sampling_params)
# dispatch requests to all self.data_parallel_size workers, in interleaved fashion
# interleaved important to balance context lengths across workers
requests = [list(x) for x in distribute(self.data_parallel_size, requests)]
inputs = ((self.model_args, sampling_params, req) for req in requests)
object_refs = [run_inference_one_model.remote(*x) for x in inputs]
results = ray.get(object_refs)
# Invoke ray.shutdown() to prevent hang-ups if subsequent calls required.
ray.shutdown()
# flatten results
return undistribute(results)
if self.lora_request is not None:
outputs = self.model.generate(
requests,
sampling_params=sampling_params,
use_tqdm=True if self.batch_size == "auto" else False,
lora_request=self.lora_request,
)
else:
outputs = self.model.generate(
requests,
sampling_params=sampling_params,
use_tqdm=True if self.batch_size == "auto" else False,
)
return outputs
def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
self.chat_applied = True
if not self.interleave:
for content in chat_history:
c = []
text = content["content"]
# Count and remove image placeholders
image_count = min(
self.max_images, text.count(DEFAULT_IMAGE_PLACEHOLDER)
)
text = text.replace(DEFAULT_IMAGE_PLACEHOLDER, "")
# Add image entries
for _ in range(image_count):
c.append({"type": "image", "image": None})
# Add single text entry at the end
c.append({"type": "text", "text": text})
content["content"] = c
else:
for content in chat_history:
c = []
text = content["content"]
expected_image_count = min(
self.max_images, text.count(DEFAULT_IMAGE_PLACEHOLDER)
)
actual_image_count = 0
text_parts = text.split(DEFAULT_IMAGE_PLACEHOLDER)
for i, part in enumerate(text_parts):
# TODO: concatenate text parts (esp. if skipping images)?
if part: # Add non-empty text parts
c.append({"type": "text", "text": part})
if (
(i < len(text_parts) - 1) and i < self.max_images
): # Add image placeholder after each split except the last
c.append({"type": "image"})
actual_image_count += 1
content["content"] = c
if actual_image_count != expected_image_count:
raise ValueError(
f"Mismatch in image placeholder count. Expected: {expected_image_count}, Actual: {actual_image_count}"
)
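# Illustrative note (not part of this commit): with interleave=True a message such as
# "<image> What is shown? <image>" becomes
# [{"type": "image"}, {"type": "text", "text": " What is shown? "}, {"type": "image"}],
# while interleave=False collects all image entries first and appends a single text entry.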
return self.processor.apply_chat_template(
chat_history, add_generation_prompt=True
)
def generate_until(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[str]:
# TODO: support text-only reqs
res = []
def _collate(x):
# the negative sign on len(toks) sorts descending - this has a few advantages:
# - time estimates will always be over not underestimates, which is more useful for planning
# - to know the size of a batch when going through the list, you know the first one is always the batch
# padded context length. this is useful to simplify the batching logic and more importantly to make
# automatic adaptive batches much much easier to implement
# - any OOMs will happen right away rather than near the end
toks = self.tok_encode(x[0])
return -len(toks), x[0]
pbar = tqdm(
total=len(requests),
disable=(disable_tqdm or (self.rank != 0)),
desc="Running generate_until requests with text+image input",
)
# TODO: port auto-batch sizing into this.
# we group requests by their generation_kwargs,
# so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
# in the same batch.
re_ords = Collator(
[reg.args for reg in requests],
_collate,
group_by="gen_kwargs",
group_fn=lambda x: x[1],
)
chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
for chunk in chunks:
contexts, all_gen_kwargs, aux_arguments = zip(*chunk)
visuals = [arg["visual"] for arg in aux_arguments]
if not isinstance(contexts, list):
contexts = list(
contexts
) # for Qwen2-VL, processor is unhappy accepting a tuple of strings instead of a list.
# TODO: could we upstream this workaround to HF?
# we assume all gen kwargs in the batch are the same
# this is safe to assume because the `grouper` object ensures it.
gen_kwargs = all_gen_kwargs[0]
# unpack our keyword arguments.
until = None
if isinstance(gen_kwargs, dict):
kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1
if "until" in kwargs.keys():
until = kwargs.pop("until")
if isinstance(until, str):
until = [until]
elif not isinstance(until, list):
raise ValueError(
f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
)
else:
raise ValueError(
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
)
# add EOS token to stop sequences
eos = self.tokenizer.decode(self.eot_token_id)
if not until:
until = [eos]
else:
until.append(eos)
if "max_gen_toks" in kwargs.keys():
max_gen_toks = kwargs.pop("max_gen_toks")
else:
max_gen_toks = self.max_gen_toks
max_ctx_len = self.max_length - max_gen_toks
inputs = self.tok_batch_multimodal_encode(
contexts,
visuals,
left_truncate_len=max_ctx_len,
)
cont = self._model_generate(inputs, stop=until, generate=True, **kwargs)
for output, context in zip(cont, contexts):
generated_text = output.outputs[0].text
res.append(generated_text)
self.cache_hook.add_partial(
"generate_until", (context, gen_kwargs), generated_text
)
pbar.update(1)
# reorder this group of results back to original unsorted form
res = re_ords.get_original(res)
pbar.close()
return res
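As a rough usage sketch (not part of this diff), the new model type could be exercised through the harness's Python entry point; the pretrained path, task name, and limit below are placeholders and the exact task identifiers may differ:

```python
import lm_eval

# Hypothetical invocation of the new multimodal vLLM backend; model path, task
# name, and limit are illustrative placeholders, not values from this commit.
results = lm_eval.simple_evaluate(
    model="vllm-vlm",
    model_args="pretrained=Qwen/Qwen2-VL-2B-Instruct,max_images=1",
    tasks=["mmmu_val"],
    apply_chat_template=True,
    limit=8,
)
print(results["results"])
```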
@@ -11,6 +11,8 @@
| [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic |
| [agieval](agieval/README.md) | Tasks involving historical data or questions related to history and historical texts. | English, Chinese |
| [anli](anli/README.md) | Adversarial natural language inference tasks designed to test model robustness. | English |
| [arabic_leaderboard_complete](arabic_leaderboard_complete/README.md) | A full version of the tasks in the Open Arabic LLM Leaderboard, focusing on the evaluation of models that reflect the characteristics of Arabic language understanding and comprehension, culture, and heritage. Note that some of these tasks are machine-translated. | Arabic (Some MT) |
| [arabic_leaderboard_light](arabic_leaderboard_light/README.md) | A light version of the tasks in the Open Arabic LLM Leaderboard (i.e., 10% samples of the test set in the original benchmarks), focusing on the evaluation of models that reflect the characteristics of Arabic language understanding and comprehension, culture, and heritage. Note that some of these tasks are machine-translated. | Arabic (Some MT) |
| [arabicmmlu](arabicmmlu/README.md) | Localized Arabic version of MMLU with multiple-choice questions from 40 subjects. | Arabic |
| [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions. | English |
| [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English |
...
@@ -489,10 +489,12 @@ class TaskManager:
if attr in config:
if attr == "group" and print_info:
self.logger.info(
"`group` and `group_alias` keys in tasks' configs will no longer be used in the next release of lm-eval. "
"`tag` will be used to allow to call a collection of tasks just like `group`. "
"`group` will be removed in order to not cause confusion with the new ConfigurableGroup "
"which will be the official way to create groups with addition of group-wide configurations."
"`group` and `group_alias` keys in TaskConfigs are deprecated and will be removed in v0.4.5 of lm_eval. "
"The new `tag` field will be used to allow for a shortcut to a group of tasks one does not wish to aggregate metrics across. "
"`group`s which aggregate across subtasks must be only defined in a separate group config file, "
"which will be the official way to create groups that support cross-task aggregation as in `mmlu`. "
"Please see the v0.4.4 patch notes and our documentation: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/new_task_guide.md#advanced-group-configs "
"for more information."
)
print_info = False
# attr = "tag"
...
# Arabic Leaderboard
Title: Open Arabic LLM Leaderboard
The Open Arabic LLM Leaderboard evaluates language models on a large number of different evaluation tasks that reflect the characteristics of the Arabic language and culture.
The benchmark uses several datasets, most of them translated to Arabic, and validated by native Arabic speakers. They also used benchmarks from other papers or prepared benchmarks from scratch natively for Arabic.
Homepage: https://huggingface.co/spaces/OALL/Open-Arabic-LLM-Leaderboard
### Citation
```
@misc{OALL,
author = {Elfilali, Ali and Alobeidli, Hamza and Fourrier, Clémentine and Boussaha, Basma El Amel and Cojocaru, Ruxandra and Habib, Nathan and Hacid, Hakim},
title = {Open Arabic LLM Leaderboard},
year = {2024},
publisher = {OALL},
howpublished = "\url{https://huggingface.co/spaces/OALL/Open-Arabic-LLM-Leaderboard}"
}
@inproceedings{almazrouei-etal-2023-alghafa,
title = "{A}l{G}hafa Evaluation Benchmark for {A}rabic Language Models",
author = "Almazrouei, Ebtesam and
Cojocaru, Ruxandra and
Baldo, Michele and
Malartic, Quentin and
Alobeidli, Hamza and
Mazzotta, Daniele and
Penedo, Guilherme and
Campesan, Giulia and
Farooq, Mugariya and
Alhammadi, Maitha and
Launay, Julien and
Noune, Badreddine",
editor = "Sawaf, Hassan and
El-Beltagy, Samhaa and
Zaghouani, Wajdi and
Magdy, Walid and
Abdelali, Ahmed and
Tomeh, Nadi and
Abu Farha, Ibrahim and
Habash, Nizar and
Khalifa, Salam and
Keleg, Amr and
Haddad, Hatem and
Zitouni, Imed and
Mrini, Khalil and
Almatham, Rawan",
booktitle = "Proceedings of ArabicNLP 2023",
month = dec,
year = "2023",
address = "Singapore (Hybrid)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.arabicnlp-1.21",
doi = "10.18653/v1/2023.arabicnlp-1.21",
pages = "244--275",
abstract = "Recent advances in the space of Arabic large language models have opened up a wealth of potential practical applications. From optimal training strategies, large scale data acquisition and continuously increasing NLP resources, the Arabic LLM landscape has improved in a very short span of time, despite being plagued by training data scarcity and limited evaluation resources compared to English. In line with contributing towards this ever-growing field, we introduce AlGhafa, a new multiple-choice evaluation benchmark for Arabic LLMs. For showcasing purposes, we train a new suite of models, including a 14 billion parameter model, the largest monolingual Arabic decoder-only model to date. We use a collection of publicly available datasets, as well as a newly introduced HandMade dataset consisting of 8 billion tokens. Finally, we explore the quantitative and qualitative toxicity of several Arabic models, comparing our models to existing public Arabic LLMs.",
}
@misc{huang2023acegpt,
title={AceGPT, Localizing Large Language Models in Arabic},
author={Huang Huang and Fei Yu and Jianqing Zhu and Xuening Sun and Hao Cheng and Dingjie Song and Zhihong Chen and Abdulmohsen Alharthi and Bang An and Ziche Liu and Zhiyi Zhang and Junying Chen and Jianquan Li and Benyou Wang and Lian Zhang and Ruoyu Sun and Xiang Wan and Haizhou Li and Jinchao Xu},
year={2023},
eprint={2309.12053},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{lighteval,
author = {Fourrier, Clémentine and Habib, Nathan and Wolf, Thomas and Tunstall, Lewis},
title = {LightEval: A lightweight framework for LLM evaluation},
year = {2023},
version = {0.3.0},
url = {https://github.com/huggingface/lighteval}
}
```
### Groups and Tasks
* `arabic_leaderboard_alghafa`: A multiple-choice evaluation benchmark for zero- and few-shot evaluation of Arabic LLMs prepared from scratch natively for Arabic.
* Paper: https://aclanthology.org/2023.arabicnlp-1.21.pdf
* You can find the list of the tasks as follows:
* `arabic_leaderboard_alghafa_mcq_exams_test_ar`
* `arabic_leaderboard_alghafa_meta_ar_dialects`
* `arabic_leaderboard_alghafa_meta_ar_msa`
* `arabic_leaderboard_alghafa_multiple_choice_facts_truefalse_balanced_task`
* `arabic_leaderboard_alghafa_multiple_choice_grounded_statement_soqal_task`
* `arabic_leaderboard_alghafa_multiple_choice_grounded_statement_xglue_mlqa_task`
* `arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_no_neutral_task`
* `arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_task`
* `arabic_leaderboard_alghafa_multiple_choice_sentiment_task`
* `arabic_leaderboard_arabic_exams`: A question answering benchmark for high school examinations in different school subjects that requires knowledge and reasoning in different languages in multiple domains.
* Paper: https://aclanthology.org/2020.emnlp-main.438.pdf
* `arabic_leaderboard_arabic_mmlu`: A multi-task language understanding benchmark for the Arabic language, sourced from school exams across diverse educational levels in different countries with native speakers in the region.
The data comprises multiple choice questions in 40 tasks.
* Paper: https://arxiv.org/pdf/2402.12840
* You can find the list of the tasks as follows:
* `arabic_leaderboard_arabic_mmlu_abstract_algebra`
* `arabic_leaderboard_arabic_mmlu_anatomy`
* `arabic_leaderboard_arabic_mmlu_astronomy`
* `arabic_leaderboard_arabic_mmlu_business_ethics`
* `arabic_leaderboard_arabic_mmlu_clinical_knowledge`
* `arabic_leaderboard_arabic_mmlu_college_biology`
* `arabic_leaderboard_arabic_mmlu_college_chemistry`
* `arabic_leaderboard_arabic_mmlu_college_computer_science`
* `arabic_leaderboard_arabic_mmlu_college_mathematics`
* `arabic_leaderboard_arabic_mmlu_college_medicine`
* `arabic_leaderboard_arabic_mmlu_college_physics`
* `arabic_leaderboard_arabic_mmlu_computer_security`
* `arabic_leaderboard_arabic_mmlu_conceptual_physics`
* `arabic_leaderboard_arabic_mmlu_econometrics`
* `arabic_leaderboard_arabic_mmlu_electrical_engineering`
* `arabic_leaderboard_arabic_mmlu_elementary_mathematics`
* `arabic_leaderboard_arabic_mmlu_formal_logic`
* `arabic_leaderboard_arabic_mmlu_global_facts`
* `arabic_leaderboard_arabic_mmlu_high_school_biology`
* `arabic_leaderboard_arabic_mmlu_high_school_chemistry`
* `arabic_leaderboard_arabic_mmlu_high_school_computer_science`
* `arabic_leaderboard_arabic_mmlu_high_school_european_history`
* `arabic_leaderboard_arabic_mmlu_high_school_geography`
* `arabic_leaderboard_arabic_mmlu_high_school_government_and_politics`
* `arabic_leaderboard_arabic_mmlu_high_school_macroeconomics`
* `arabic_leaderboard_arabic_mmlu_high_school_mathematics`
* `arabic_leaderboard_arabic_mmlu_high_school_microeconomics`
* `arabic_leaderboard_arabic_mmlu_high_school_physics`
* `arabic_leaderboard_arabic_mmlu_high_school_psychology`
* `arabic_leaderboard_arabic_mmlu_high_school_statistics`
* `arabic_leaderboard_arabic_mmlu_high_school_us_history`
* `arabic_leaderboard_arabic_mmlu_human_aging`
* `arabic_leaderboard_arabic_mmlu_human_sexuality`
* `arabic_leaderboard_arabic_mmlu_international_law`
* `arabic_leaderboard_arabic_mmlu_jurisprudence`
* `arabic_leaderboard_arabic_mmlu_logical_fallacies`
* `arabic_leaderboard_arabic_mmlu_machine_learning`
* `arabic_leaderboard_arabic_mmlu_management`
* `arabic_leaderboard_arabic_mmlu_marketing`
* `arabic_leaderboard_arabic_mmlu_medical_genetics`
* `arabic_leaderboard_arabic_mmlu_miscellaneous`
* `arabic_leaderboard_arabic_mmlu_moral_disputes`
* `arabic_leaderboard_arabic_mmlu_moral_scenarios`
* `arabic_leaderboard_arabic_mmlu_nutrition`
* `arabic_leaderboard_arabic_mmlu_philosophy`
* `arabic_leaderboard_arabic_mmlu_prehistory`
* `arabic_leaderboard_arabic_mmlu_professional_accounting`
* `arabic_leaderboard_arabic_mmlu_professional_law`
* `arabic_leaderboard_arabic_mmlu_professional_medicine`
* `arabic_leaderboard_arabic_mmlu_professional_psychology`
* `arabic_leaderboard_arabic_mmlu_public_relations`
* `arabic_leaderboard_arabic_mmlu_security_studies`
* `arabic_leaderboard_arabic_mmlu_sociology`
* `arabic_leaderboard_arabic_mmlu_us_foreign_policy`
* `arabic_leaderboard_arabic_mmlu_virology`
* `arabic_leaderboard_arabic_mmlu_world_religions`
* `arabic_leaderboard_arabic_mt_arc_challenge`: AI2 Reasoning Challenge (ARC) is a multiple-choice question task. The dataset contains only natural, grade-school science questions,
written for human tests. The challenge set contains only questions answered incorrectly by both a retrieval-based algorithm and a word co-occurrence algorithm. (machine translated benchmark - part of the Alghafa Arabic translated LLM benchmark)
* Paper: https://aclanthology.org/2023.arabicnlp-1.21.pdf
* `arabic_leaderboard_arabic_mt_arc_easy`: This dataset is the same as `arabic_arc_challenge`, except it is not from the challenge set.
* Paper: https://aclanthology.org/2023.arabicnlp-1.21.pdf
* `arabic_leaderboard_arabic_mt_boolq`: A true/false questions dataset that contains the columns passage, question, and the answer (i.e., true/false). (machine translated benchmark - part of the Alghafa Arabic translated LLM benchmark)
* Paper: https://aclanthology.org/2023.arabicnlp-1.21.pdf
* `arabic_leaderboard_arabic_mt_copa`: Choice Of Plausible Alternatives (COPA) is a multiple-choice question dataset, which involves open-domain commonsense causal reasoning. (machine translated benchmark - part of the Alghafa Arabic translated LLM benchmark)
* Paper: https://aclanthology.org/2023.arabicnlp-1.21.pdf
* `arabic_leaderboard_arabic_mt_hellaswag`: The task is to choose the next set of sentences based on the given candidates. The tasks involve reading comprehension and information retrieval challenges
by testing the abilities of the models on basic knowledge (i.e., from 3rd grade to 9th) and commonsense inference. (machine translated benchmark - part of the Alghafa Arabic translated LLM benchmark)
* Paper: https://aclanthology.org/2023.arabicnlp-1.21.pdf
* `arabic_leaderboard_arabic_mt_mmlu`: A multiple-choice question answering dataset from various branches of knowledge including humanities, social sciences, hard sciences, and other areas. The examples in the English dataset are translated into Arabic using ChatGPT with a translation prompt.
* Paper: https://aclanthology.org/2023.arabicnlp-1.21.pdf
* `arabic_leaderboard_arabic_mt_openbook_qa`: A multiple-choice openbook question answering dataset that requires external knowledge and reasoning. The open book that comes with these questions is
based on elementary level science facts. (machine translated benchmark - part of the Alghafa Arabic translated LLM benchmark)
* Paper: https://aclanthology.org/2023.arabicnlp-1.21.pdf
* `arabic_leaderboard_arabic_mt_piqa`: Physical Interaction Question Answering (PIQA) is a multiple-choice question answering based on physical commonsense reasoning. (machine translated benchmark - part of the Alghafa Arabic translated LLM benchmark)
* Paper: https://aclanthology.org/2023.arabicnlp-1.21.pdf
* `arabic_leaderboard_arabic_mt_race`: A multiple-choice questions dataset to assess reading comprehension tasks based on English exams in China - designed for middle school and high school students
(machine translated benchmark - part of the Alghafa Arabic translated LLM benchmark)
* Paper: https://aclanthology.org/2023.arabicnlp-1.21.pdf
* `arabic_leaderboard_arabic_mt_sciq`: A multiple-choice Science Question Answering task to assess understanding of scientific concepts about physics, chemistry, and biology. (machine translated benchmark - part of the Alghafa Arabic translated LLM benchmark)
* Paper: https://aclanthology.org/2023.arabicnlp-1.21.pdf
* `arabic_leaderboard_arabic_mt_toxigen`: This benchmark consists of tasks designed to evaluate language models and classify input text as hateful or not hateful. (machine translated benchmark - part of the Alghafa Arabic translated LLM benchmark)
* Paper: https://aclanthology.org/2023.arabicnlp-1.21.pdf
* `arabic_leaderboard_acva`: Arabic-Culture-Value-Alignment (ACVA) is a yes/no question dataset, generated by GPT3.5 Turbo from Arabic topics to assess model alignment with Arabic values and cultures.
* Paper: https://arxiv.org/pdf/2309.12053
* You can find the list of the tasks as follows:
- `arabic_leaderboard_acva_Algeria`
- `arabic_leaderboard_acva_Ancient_Egypt`
- `arabic_leaderboard_acva_Arab_Empire`
- `arabic_leaderboard_acva_Arabic_Architecture`
- `arabic_leaderboard_acva_Arabic_Art`
- `arabic_leaderboard_acva_Arabic_Astronomy`
- `arabic_leaderboard_acva_Arabic_Calligraphy`
- `arabic_leaderboard_acva_Arabic_Ceremony`
- `arabic_leaderboard_acva_Arabic_Clothing`
- `arabic_leaderboard_acva_Arabic_Culture`
- `arabic_leaderboard_acva_Arabic_Food`
- `arabic_leaderboard_acva_Arabic_Funeral`
- `arabic_leaderboard_acva_Arabic_Geography`
- `arabic_leaderboard_acva_Arabic_History`
- `arabic_leaderboard_acva_Arabic_Language_Origin`
- `arabic_leaderboard_acva_Arabic_Literature`
- `arabic_leaderboard_acva_Arabic_Math`
- `arabic_leaderboard_acva_Arabic_Medicine`
- `arabic_leaderboard_acva_Arabic_Music`
- `arabic_leaderboard_acva_Arabic_Ornament`
- `arabic_leaderboard_acva_Arabic_Philosophy`
- `arabic_leaderboard_acva_Arabic_Physics_and_Chemistry`
- `arabic_leaderboard_acva_Arabic_Wedding`
- `arabic_leaderboard_acva_Bahrain`
- `arabic_leaderboard_acva_Comoros`
- `arabic_leaderboard_acva_Egypt_modern`
- `arabic_leaderboard_acva_InfluenceFromAncientEgypt`
- `arabic_leaderboard_acva_InfluenceFromByzantium`
- `arabic_leaderboard_acva_InfluenceFromChina`
- `arabic_leaderboard_acva_InfluenceFromGreece`
- `arabic_leaderboard_acva_InfluenceFromIslam`
- `arabic_leaderboard_acva_InfluenceFromPersia`
- `arabic_leaderboard_acva_InfluenceFromRome`
- `arabic_leaderboard_acva_Iraq`
- `arabic_leaderboard_acva_Islam_Education`
- `arabic_leaderboard_acva_Islam_branches_and_schools`
- `arabic_leaderboard_acva_Islamic_law_system`
- `arabic_leaderboard_acva_Jordan`
- `arabic_leaderboard_acva_Kuwait`
- `arabic_leaderboard_acva_Lebanon`
- `arabic_leaderboard_acva_Libya`
- `arabic_leaderboard_acva_Mauritania`
- `arabic_leaderboard_acva_Mesopotamia_civilization`
- `arabic_leaderboard_acva_Morocco`
- `arabic_leaderboard_acva_Oman`
- `arabic_leaderboard_acva_Palestine`
- `arabic_leaderboard_acva_Qatar`
- `arabic_leaderboard_acva_Saudi_Arabia`
- `arabic_leaderboard_acva_Somalia`
- `arabic_leaderboard_acva_Sudan`
- `arabic_leaderboard_acva_Syria`
- `arabic_leaderboard_acva_Tunisia`
- `arabic_leaderboard_acva_United_Arab_Emirates`
- `arabic_leaderboard_acva_Yemen`
- `arabic_leaderboard_acva_communication`
- `arabic_leaderboard_acva_computer_and_phone`
- `arabic_leaderboard_acva_daily_life`
- `arabic_leaderboard_acva_entertainment`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group: arabic_leaderboard_alghafa
task:
- arabic_leaderboard_alghafa_mcq_exams_test_ar
- arabic_leaderboard_alghafa_meta_ar_dialects
- arabic_leaderboard_alghafa_meta_ar_msa
- arabic_leaderboard_alghafa_multiple_choice_facts_truefalse_balanced_task
- arabic_leaderboard_alghafa_multiple_choice_grounded_statement_soqal_task
- arabic_leaderboard_alghafa_multiple_choice_grounded_statement_xglue_mlqa_task
- arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_no_neutral_task
- arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_task
- arabic_leaderboard_alghafa_multiple_choice_sentiment_task
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: true
- metric: acc_norm
aggregation: mean
weight_by_size: true
metadata:
version: 1.0
task: arabic_leaderboard_alghafa_mcq_exams_test_ar
dataset_path: OALL/AlGhafa-Arabic-LLM-Benchmark-Native
dataset_name: mcq_exams_test_ar
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
task: arabic_leaderboard_alghafa_meta_ar_dialects
dataset_path: OALL/AlGhafa-Arabic-LLM-Benchmark-Native
dataset_name: meta_ar_dialects
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
task: arabic_leaderboard_alghafa_meta_ar_msa
dataset_path: OALL/AlGhafa-Arabic-LLM-Benchmark-Native
dataset_name: meta_ar_msa
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "choices"
fewshot_split: validation
fewshot_config:
sampler: first_n
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0