Commit d26aeda7 authored by Nathan Habib's avatar Nathan Habib
Browse files

checkout from main

parent b9bda0a3
......@@ -275,9 +275,9 @@ def consolidate_results(
metric_key
]
results[task_output.task_name]["samples"] = task_output.sample_len
results[task_output.task_name][
f"{metric}_stderr,{filter_key}"
] = task_output.agg_metrics[f"{metric}_stderr,{filter_key}"]
results[task_output.task_name][f"{metric}_stderr,{filter_key}"] = (
task_output.agg_metrics[f"{metric}_stderr,{filter_key}"]
)
return results, samples, configs, versions, num_fewshot, higher_is_better
......
......@@ -4,7 +4,6 @@ from lm_eval.api.registry import register_filter
@register_filter("decontaminate")
class DecontaminationFilter(Filter):
"""
A filter which evaluates
"""
......
......@@ -110,3 +110,20 @@ def add_env_info(storage: Dict[str, Any]):
"upper_git_hash": upper_dir_commit, # in case this repo is submodule
}
storage.update(added_info)
def add_tokenizer_info(storage: Dict[str, Any], lm):
    """Record tokenizer metadata of ``lm`` into ``storage`` (mutated in place).

    When ``lm`` exposes a truthy ``tokenizer`` attribute, the following keys
    are written to ``storage``:

    * ``tokenizer_pad_token`` / ``tokenizer_eos_token`` / ``tokenizer_bos_token``:
      two-element lists ``[token, token_id]`` read from the tokenizer.
    * ``eot_token_id`` and ``max_length``: read from ``lm`` itself.

    Any attribute that is missing is recorded as ``None`` instead of raising,
    so a tokenizer that does not define every special-token attribute cannot
    abort the logging of results.

    Args:
        storage: Dictionary that receives the tokenizer metadata.
        lm: Model wrapper; may or may not have a ``tokenizer`` attribute.

    Returns:
        None. ``storage`` is updated in place.
    """
    tokenizer = getattr(lm, "tokenizer", False)
    if tokenizer:

        def _token_pair(kind: str) -> list:
            # Not every tokenizer object defines pad/eos/bos attributes;
            # fall back to None rather than failing with AttributeError.
            return [
                getattr(tokenizer, f"{kind}_token", None),
                getattr(tokenizer, f"{kind}_token_id", None),
            ]

        tokenizer_info = {
            "tokenizer_pad_token": _token_pair("pad"),
            "tokenizer_eos_token": _token_pair("eos"),
            "tokenizer_bos_token": _token_pair("bos"),
            "eot_token_id": getattr(lm, "eot_token_id", None),
            "max_length": getattr(lm, "max_length", None),
        }
        storage.update(tokenizer_info)
    else:
        # seems gguf and textsynth do not have tokenizer
        logger.debug(
            "LM does not have a 'tokenizer' attribute, not logging tokenizer metadata to results."
        )
......@@ -307,7 +307,7 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install
# defaults to os.environ.get("ANTHROPIC_API_KEY")
self.client = anthropic.Anthropic()
self.temperature = temperature
self.max_token = max_tokens
self.max_tokens = max_tokens
self.tokenizer = self.client.get_tokenizer()
self.kwargs = kwargs
......
......@@ -288,7 +288,7 @@ class NEURON_HF(TemplateLM):
self.vocab_size = self.tokenizer.vocab_size
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
self.add_bos_token = self.add_bos_token
self.add_bos_token = add_bos_token
self._max_length = max_length
......
""" TextSynth API
"""TextSynth API
Implementation provided by Fabrice Bellard:
https://github.com/EleutherAI/lm-evaluation-harness/issues/295
......@@ -11,6 +11,7 @@ Example usage:
Homepage: https://textsynth.com/index.html
"""
import logging
import os
......
......@@ -119,6 +119,12 @@ class VLLM(TemplateLM):
tokenizer_revision=tokenizer_revision,
)
self.add_bos_token = add_bos_token
if "gemma" in pretrained.lower():
self.add_bos_token = True
eval_logger.info(
"Found 'gemma' in model name, a BOS token will be used as Gemma underperforms without it."
)
self.custom_prefix_token_id = prefix_token_id
if prefix_token_id is not None:
eval_logger.info(
......@@ -493,7 +499,10 @@ class VLLM(TemplateLM):
def modify_gen_kwargs(kwargs: dict) -> dict:
# sampling_params
do_sample = kwargs.pop("do_sample", None)
if do_sample is False or "temperature" not in kwargs:
if do_sample is False and "temperature" not in kwargs:
eval_logger.debug(
"Got `do_sample=False` and no temperature value, setting VLLM temperature to 0.0 ..."
)
kwargs["temperature"] = 0.0
# hf defaults
kwargs["skip_special_tokens"] = kwargs.get("skip_special_tokens", False)
......
......@@ -10,8 +10,8 @@
| [aclue](aclue/README.md) | Tasks focusing on ancient Chinese language understanding and cultural aspects. | Ancient Chinese |
| [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic |
| [agieval](agieval/README.md) | Tasks involving historical data or questions related to history and historical texts. | English, Chinese |
| [ammlu](ammlu/README.md) | Arabic version of MMLU. | Arabic |
| [anli](anli/README.md) | Adversarial natural language inference tasks designed to test model robustness. | English |
| [arabicmmlu](arabicmmlu/README.md) | Localized Arabic version of MMLU with multiple-choice questions from 40 subjects. | Arabic |
| [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions. | English |
| [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English |
| [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English |
......@@ -20,11 +20,13 @@
| [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. | English, German |
| [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) |
| benchmarks | General benchmarking tasks that test a wide range of language understanding capabilities. | |
| [bertaqa](bertaqa/README.md) | Local Basque cultural trivia QA tests in English and Basque languages. | English, Basque, Basque (MT) |
| [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple |
| [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language model's linguistic capabilities. | English |
| [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese |
| [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese |
| code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby |
| [commonsense_qa](commonsense_qa/README.md) | CommonsenseQA, a multiple-choice QA dataset for measuring commonsense knowledge. | English |
| [copal_id](copal_id/README.md) | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian |
| [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English |
| [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French |
......@@ -71,6 +73,7 @@
| okapi/mmlu_multilingual | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (34 languages) |
| [okapi/truthfulqa_multilingual](okapi/truthfulqa_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) |
| [openbookqa](openbookqa/README.md) | Open-book question answering tasks that require external knowledge and reasoning. | English |
| [paloma](paloma/README.md) | Paloma is a comprehensive benchmark designed to evaluate open language models across a wide range of domains, ranging from niche artist communities to mental health forums on Reddit. | English |
| [paws-x](paws-x/README.md) | Paraphrase Adversaries from Word Scrambling, focusing on cross-lingual capabilities. | English, French, Spanish, German, Chinese, Japanese, Korean |
| [pile](pile/README.md) | Open source language modelling data set that consists of 22 smaller, high-quality datasets. | English |
| [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English |
......
"""
Take in a YAML, and output all other splits with this YAML
"""
import argparse
import os
......
"""
Take in a YAML, and output all other splits with this YAML
"""
import argparse
import os
import re
......
"""
Take in a YAML, and output all other splits with this YAML
"""
import argparse
import os
......
......@@ -8,6 +8,7 @@ Requires the installation of
`pip install "bigbench @ https://storage.googleapis.com/public_research_data/bigbench/bigbench-0.0.1.tar.gz"`
and is included so that the bigbench dependency can be avoided.
"""
import bigbench.api.util as bb_utils
import datasets
from tqdm import tqdm
......
"""
Take in a YAML, and output all other splits with this YAML
"""
import argparse
import os
......
"""
Take in a YAML, and output all other splits with this YAML
"""
import argparse
import os
......
"""
Take in a YAML, and output all other splits with this YAML
"""
import argparse
import os
......
"""
"""
import re
from typing import List
......
......@@ -13,6 +13,7 @@
# limitations under the License.
"""Library of instructions."""
import collections
import json
import logging
......
......@@ -13,6 +13,7 @@
# limitations under the License.
"""Registry of all instructions."""
from lm_eval.tasks.ifeval import instructions
......
"""
Take in a YAML, and output all "other" splits with this YAML
"""
import argparse
import logging
import os
......
......@@ -19,3 +19,5 @@ metric_list:
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment