This project provides a unified framework to test generative language models on a large number of different evaluation tasks.
**Features:**
- Over 60 standard academic benchmarks for LLMs, with hundreds of subtasks and variants implemented.
- Support for models loaded via [transformers](https://github.com/huggingface/transformers/) (including quantization via [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)), [GPT-NeoX](https://github.com/EleutherAI/gpt-neox), and [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/), with a flexible tokenization-agnostic interface.
- Support for fast and memory-efficient inference with [vLLM](https://github.com/vllm-project/vllm).
- Support for commercial APIs including [OpenAI](https://openai.com), [goose.ai](https://goose.ai), and [TextSynth](https://textsynth.com/).
- Support for evaluation on adapters (e.g. LoRA) via [HuggingFace's PEFT library](https://github.com/huggingface/peft).
If a dataset feature is already a list, you can set the name of that feature as `doc_to_choice` (see [Hellaswag](https://github.com/EleutherAI/lm-evaluation-harness/blob/e0eda4d3ffa10e5f65e0976161cd134bec61983a/lm_eval/tasks/hellaswag/hellaswag.yaml#L13)):
```
doc_to_choice: choices
```
### Writing a prompt with Jinja 2
We support the [Jinja 2](https://jinja.palletsprojects.com/en/3.1.x/) templating language for writing prompts. In practice, this means you can take your dataset's columns and do many basic string manipulations to place each document into prompted format.
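For example, given a dataset whose documents have `question` and `answer` fields (hypothetical column names, used here only for illustration), the prompt fields might be written as:

```yaml
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{answer}}"
```

Jinja resolves each `{{...}}` against the document's columns when the prompt is rendered.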
Additional task configuration options:

- **generation_kwargs** (`dict`, *optional*) — Auxiliary arguments for the `generate` function from the HF transformers library. Advanced keyword arguments may not be supported for non-HF LM classes.
- **repeats** (`int`, *optional*, defaults to 1) — Number of repeated runs through the model for each sample. Can be used for cases such as self-consistency.
- **filter_list** (`Union[str, list]`, *optional*) — List of filters to postprocess model outputs. See below for further detail on the filter API.
- **should_decontaminate** (`bool`, *optional*, defaults to False) — Whether to decontaminate or not.
- **doc_to_decontamination_query** (`str`, *optional*) — Query for decontamination if `should_decontaminate` is True. If `should_decontaminate` is True but `doc_to_decontamination_query` is `None`, `doc_to_decontamination_query` will follow `doc_to_text`.

Other:

- **metadata** (`Union[str, list]`, *optional*) — An optional field where arbitrary metadata can be passed. A good example would be `version`, used to denote the version of the yaml config.
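Putting several of these fields together, a sketch of how they might appear in a task's YAML (the generation settings and the filter pipeline here are illustrative, not prescriptive):

```yaml
generation_kwargs:
  max_new_tokens: 256
  do_sample: true
  temperature: 0.7
repeats: 4                     # e.g. sample 4 generations for self-consistency
filter_list:
  - name: "maj@4"              # illustrative name for this filter pipeline
    filter:
      - function: "majority_vote"
      - function: "take_first"
should_decontaminate: true
doc_to_decontamination_query: "{{question}}"   # assumes a `question` column
metadata:
  version: 1.0
```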
The model-initialization snippet below, from the harness's HuggingFace model backend (`lm_eval/models/huggingface.py`), shows how the `pretrained` argument and device placement are handled:

```python
# optionally: take in an already-initialized transformers.PreTrainedModel
if not isinstance(pretrained, str):
    eval_logger.warning(
        "`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way."
    )
    assert (
        not parallelize
    ), "`parallelize=True` is not compatible with passing pre-initialized model to `pretrained`"

# `device_list` is built earlier in __init__ from the available cuda/mps/cpu devices
if device:
    if device not in device_list:
        device = int(device)
    self._device = torch.device(device)
    eval_logger.info(f"Using device '{device}'")
    if device in ("mps", "mps:0") and version.parse(
        torch.__version__
    ) < version.parse("2.1"):
        raise RuntimeError(
            f"mps requires torch >= 2.1. You have {torch.__version__}"
        )
# if we passed `pretrained` as a string, initialize our model now
if isinstance(pretrained, str):
    self._create_model(
        pretrained=pretrained,
        revision=revision,
        dtype=dtype,
        trust_remote_code=trust_remote_code,
        parallelize=parallelize,
        device_map_option=device_map_option,
        max_memory_per_gpu=max_memory_per_gpu,
        max_cpu_memory=max_cpu_memory,
        offload_folder=offload_folder,
        peft=peft,
        autogptq=autogptq,
        **kwargs,
    )
# forever after, access self._model through the self.model property
self.model.eval()
self.model.tie_weights()
# `gpus` (torch.cuda.device_count()) and `accelerator` (an Accelerate
# `Accelerator`) are set up earlier in __init__
if gpus <= 1 and not parallelize:
    # place model onto device, if not using HF Accelerate in any form
    # or any other option that preloads the model onto a device
    try:
        self.model.to(self.device)
    except ValueError:
        eval_logger.info(
            "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore."
        )
# multigpu data-parallel support when launched with accelerate
if gpus > 1:
    if parallelize:
        if accelerator.num_processes > 1:
            raise RuntimeError(
                "Attempted to use both a HF Accelerate `device_map` and to launch via `accelerate launch`. If this is the case, please either remove `parallelize=True` from --model_args or launch outside of the Accelerate launcher."
            )
        else:
            pass
    elif accelerator.num_processes == 1:
        # if we aren't launching via accelerate, ditch the distributed setup
        self._rank = 0
        self._world_size = 1
    else:
        if gpus > accelerator.num_processes:
            # TODO: make sure there's still never an edge case where we unintentionally default to CPU
            eval_logger.warning(
                "WARNING: The number of total system GPUs does not match the number of spawned processes. "
                "If you would like to use data parallelism, please launch the script "
                "with 'accelerate launch *script*'. "
                f"Current run will proceed with {accelerator.num_processes} devices."
            )
        assert accelerator.distributed_type in [
            DistributedType.FSDP,
            DistributedType.MULTI_GPU,
        ], "Unsupported distributed type provided. Only DDP and FSDP are supported."
        self._model = accelerator.prepare_model(
            self.model, evaluation_mode=True
        )
        self._rank = accelerator.local_process_index
        self._world_size = accelerator.num_processes
```