Commit d26aeda7 authored by Nathan Habib's avatar Nathan Habib
Browse files

checkout from main

parent b9bda0a3
...@@ -275,9 +275,9 @@ def consolidate_results( ...@@ -275,9 +275,9 @@ def consolidate_results(
metric_key metric_key
] ]
results[task_output.task_name]["samples"] = task_output.sample_len results[task_output.task_name]["samples"] = task_output.sample_len
results[task_output.task_name][ results[task_output.task_name][f"{metric}_stderr,{filter_key}"] = (
f"{metric}_stderr,{filter_key}" task_output.agg_metrics[f"{metric}_stderr,{filter_key}"]
] = task_output.agg_metrics[f"{metric}_stderr,{filter_key}"] )
return results, samples, configs, versions, num_fewshot, higher_is_better return results, samples, configs, versions, num_fewshot, higher_is_better
......
...@@ -4,7 +4,6 @@ from lm_eval.api.registry import register_filter ...@@ -4,7 +4,6 @@ from lm_eval.api.registry import register_filter
@register_filter("decontaminate") @register_filter("decontaminate")
class DecontaminationFilter(Filter): class DecontaminationFilter(Filter):
""" """
A filter which evaluates A filter which evaluates
""" """
......
...@@ -110,3 +110,20 @@ def add_env_info(storage: Dict[str, Any]): ...@@ -110,3 +110,20 @@ def add_env_info(storage: Dict[str, Any]):
"upper_git_hash": upper_dir_commit, # in case this repo is submodule "upper_git_hash": upper_dir_commit, # in case this repo is submodule
} }
storage.update(added_info) storage.update(added_info)
def add_tokenizer_info(storage: Dict[str, Any], lm):
if getattr(lm, "tokenizer", False):
tokenizer_info = {
"tokenizer_pad_token": [lm.tokenizer.pad_token, lm.tokenizer.pad_token_id],
"tokenizer_eos_token": [lm.tokenizer.eos_token, lm.tokenizer.eos_token_id],
"tokenizer_bos_token": [lm.tokenizer.bos_token, lm.tokenizer.bos_token_id],
"eot_token_id": getattr(lm, "eot_token_id", None),
"max_length": getattr(lm, "max_length", None),
}
storage.update(tokenizer_info)
# seems gguf and textsynth do not have tokenizer
else:
logger.debug(
"LM does not have a 'tokenizer' attribute, not logging tokenizer metadata to results."
)
...@@ -307,7 +307,7 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install ...@@ -307,7 +307,7 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install
# defaults to os.environ.get("ANTHROPIC_API_KEY") # defaults to os.environ.get("ANTHROPIC_API_KEY")
self.client = anthropic.Anthropic() self.client = anthropic.Anthropic()
self.temperature = temperature self.temperature = temperature
self.max_token = max_tokens self.max_tokens = max_tokens
self.tokenizer = self.client.get_tokenizer() self.tokenizer = self.client.get_tokenizer()
self.kwargs = kwargs self.kwargs = kwargs
......
...@@ -288,7 +288,7 @@ class NEURON_HF(TemplateLM): ...@@ -288,7 +288,7 @@ class NEURON_HF(TemplateLM):
self.vocab_size = self.tokenizer.vocab_size self.vocab_size = self.tokenizer.vocab_size
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
self.add_bos_token = self.add_bos_token self.add_bos_token = add_bos_token
self._max_length = max_length self._max_length = max_length
......
""" TextSynth API """TextSynth API
Implementation provided by Fabrice Bellard: Implementation provided by Fabrice Bellard:
https://github.com/EleutherAI/lm-evaluation-harness/issues/295 https://github.com/EleutherAI/lm-evaluation-harness/issues/295
...@@ -11,6 +11,7 @@ Example usage: ...@@ -11,6 +11,7 @@ Example usage:
Homepage: https://textsynth.com/index.html Homepage: https://textsynth.com/index.html
""" """
import logging import logging
import os import os
......
...@@ -119,6 +119,12 @@ class VLLM(TemplateLM): ...@@ -119,6 +119,12 @@ class VLLM(TemplateLM):
tokenizer_revision=tokenizer_revision, tokenizer_revision=tokenizer_revision,
) )
self.add_bos_token = add_bos_token self.add_bos_token = add_bos_token
if "gemma" in pretrained.lower():
self.add_bos_token = True
eval_logger.info(
"Found 'gemma' in model name, a BOS token will be used as Gemma underperforms without it."
)
self.custom_prefix_token_id = prefix_token_id self.custom_prefix_token_id = prefix_token_id
if prefix_token_id is not None: if prefix_token_id is not None:
eval_logger.info( eval_logger.info(
...@@ -493,7 +499,10 @@ class VLLM(TemplateLM): ...@@ -493,7 +499,10 @@ class VLLM(TemplateLM):
def modify_gen_kwargs(kwargs: dict) -> dict: def modify_gen_kwargs(kwargs: dict) -> dict:
# sampling_params # sampling_params
do_sample = kwargs.pop("do_sample", None) do_sample = kwargs.pop("do_sample", None)
if do_sample is False or "temperature" not in kwargs: if do_sample is False and "temperature" not in kwargs:
eval_logger.debug(
"Got `do_sample=False` and no temperature value, setting VLLM temperature to 0.0 ..."
)
kwargs["temperature"] = 0.0 kwargs["temperature"] = 0.0
# hf defaults # hf defaults
kwargs["skip_special_tokens"] = kwargs.get("skip_special_tokens", False) kwargs["skip_special_tokens"] = kwargs.get("skip_special_tokens", False)
......
...@@ -10,8 +10,8 @@ ...@@ -10,8 +10,8 @@
| [aclue](aclue/README.md) | Tasks focusing on ancient Chinese language understanding and cultural aspects. | Ancient Chinese | | [aclue](aclue/README.md) | Tasks focusing on ancient Chinese language understanding and cultural aspects. | Ancient Chinese |
| [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic | | [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic |
| [agieval](agieval/README.md) | Tasks involving historical data or questions related to history and historical texts. | English, Chinese | | [agieval](agieval/README.md) | Tasks involving historical data or questions related to history and historical texts. | English, Chinese |
| [ammlu](ammlu/README.md) | Arabic version of MMLU. | Arabic |
| [anli](anli/README.md) | Adversarial natural language inference tasks designed to test model robustness. | English | | [anli](anli/README.md) | Adversarial natural language inference tasks designed to test model robustness. | English |
| [arabicmmlu](arabicmmlu/README.md) | Localized Arabic version of MMLU with multiple-choice questions from 40 subjects. | Arabic |
| [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions. | English | | [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions. | English |
| [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English | | [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English |
| [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English | | [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English |
...@@ -20,11 +20,13 @@ ...@@ -20,11 +20,13 @@
| [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. | English, German | | [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. | English, German |
| [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) | | [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) |
| benchmarks | General benchmarking tasks that test a wide range of language understanding capabilities. | | | benchmarks | General benchmarking tasks that test a wide range of language understanding capabilities. | |
| [bertaqa](bertaqa/README.md) | Local Basque cultural trivia QA tests in English and Basque languages. | English, Basque, Basque (MT) |
| [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple | | [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple |
| [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language model's linguistic capabilities. | English | | [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language model's linguistic capabilities. | English |
| [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese | | [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese |
| [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese | | [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese |
| code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby | | code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby |
| [commonsense_qa](commmonsense_qa/README.md) | CommonsenseQA, a multiple-choice QA dataset for measuring commonsense knowledge. | English |
| [copal_id](copal_id/README.md) | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian | | [copal_id](copal_id/README.md) | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian |
| [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English | | [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English |
| [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French | | [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French |
...@@ -71,6 +73,7 @@ ...@@ -71,6 +73,7 @@
| okapi/mmlu_multilingual | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (34 languages) | | okapi/mmlu_multilingual | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (34 languages) |
| [okapi/truthfulqa_multilingual](okapi/truthfulqa_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) | | [okapi/truthfulqa_multilingual](okapi/truthfulqa_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) |
| [openbookqa](openbookqa/README.md) | Open-book question answering tasks that require external knowledge and reasoning. | English | | [openbookqa](openbookqa/README.md) | Open-book question answering tasks that require external knowledge and reasoning. | English |
| [paloma](paloma/README.md) | Paloma is a comprehensive benchmark designed to evaluate open language models across a wide range of domains, ranging from niche artist communities to mental health forums on Reddit. | English |
| [paws-x](paws-x/README.md) | Paraphrase Adversaries from Word Scrambling, focusing on cross-lingual capabilities. | English, French, Spanish, German, Chinese, Japanese, Korean | | [paws-x](paws-x/README.md) | Paraphrase Adversaries from Word Scrambling, focusing on cross-lingual capabilities. | English, French, Spanish, German, Chinese, Japanese, Korean |
| [pile](pile/README.md) | Open source language modelling data set that consists of 22 smaller, high-quality datasets. | English | | [pile](pile/README.md) | Open source language modelling data set that consists of 22 smaller, high-quality datasets. | English |
| [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English | | [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English |
......
""" """
Take in a YAML, and output all other splits with this YAML Take in a YAML, and output all other splits with this YAML
""" """
import argparse import argparse
import os import os
......
""" """
Take in a YAML, and output all other splits with this YAML Take in a YAML, and output all other splits with this YAML
""" """
import argparse import argparse
import os import os
import re import re
......
""" """
Take in a YAML, and output all other splits with this YAML Take in a YAML, and output all other splits with this YAML
""" """
import argparse import argparse
import os import os
......
...@@ -8,6 +8,7 @@ Requires the installation of ...@@ -8,6 +8,7 @@ Requires the installation of
`pip install "bigbench @ https://storage.googleapis.com/public_research_data/bigbench/bigbench-0.0.1.tar.gz"` `pip install "bigbench @ https://storage.googleapis.com/public_research_data/bigbench/bigbench-0.0.1.tar.gz"`
and is included so that the bigbench dependency can be avoided. and is included so that the bigbench dependency can be avoided.
""" """
import bigbench.api.util as bb_utils import bigbench.api.util as bb_utils
import datasets import datasets
from tqdm import tqdm from tqdm import tqdm
......
""" """
Take in a YAML, and output all other splits with this YAML Take in a YAML, and output all other splits with this YAML
""" """
import argparse import argparse
import os import os
......
""" """
Take in a YAML, and output all other splits with this YAML Take in a YAML, and output all other splits with this YAML
""" """
import argparse import argparse
import os import os
......
""" """
Take in a YAML, and output all other splits with this YAML Take in a YAML, and output all other splits with this YAML
""" """
import argparse import argparse
import os import os
......
"""
"""
import re import re
from typing import List from typing import List
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
"""Library of instructions.""" """Library of instructions."""
import collections import collections
import json import json
import logging import logging
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
"""Registry of all instructions.""" """Registry of all instructions."""
from lm_eval.tasks.ifeval import instructions from lm_eval.tasks.ifeval import instructions
......
""" """
Take in a YAML, and output all "other" splits with this YAML Take in a YAML, and output all "other" splits with this YAML
""" """
import argparse import argparse
import logging import logging
import os import os
......
...@@ -19,3 +19,5 @@ metric_list: ...@@ -19,3 +19,5 @@ metric_list:
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 1.0
dataset_kwargs:
trust_remote_code: true
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment