checkout from main

d26aeda7 · Nathan Habib · b9bda0a3 · d26aeda7 · d26aeda7 · d26aeda7
Commit d26aeda7 authored Jun 27, 2024 by Nathan Habib
20 changed files
--- a/lm_eval/evaluator_utils.py
+++ b/lm_eval/evaluator_utils.py
@@ -275,9 +275,9 @@ def consolidate_results(
                metric_key
            ]
            results[task_output.task_name]["samples"] = task_output.sample_len
-            results[task_output.task_name][
+            results[task_output.task_name][f"{metric}_stderr,{filter_key}"] = (
-                f"{metric}_stderr,{filter_key}"
+                task_output.agg_metrics[f"{metric}_stderr,{filter_key}"]
-            ] = task_output.agg_metrics[f"{metric}_stderr,{filter_key}"]
+            )
    return results, samples, configs, versions, num_fewshot, higher_is_better

--- a/lm_eval/filters/decontamination.py
+++ b/lm_eval/filters/decontamination.py
@@ -4,7 +4,6 @@ from lm_eval.api.registry import register_filter
 @register_filter("decontaminate")
 class DecontaminationFilter(Filter):
    """
    A filter which evaluates
    """

--- a/lm_eval/loggers/utils.py
+++ b/lm_eval/loggers/utils.py
@@ -110,3 +110,20 @@ def add_env_info(storage: Dict[str, Any]):
        "upper_git_hash": upper_dir_commit,  # in case this repo is submodule
    }
    storage.update(added_info)
+def add_tokenizer_info(storage: Dict[str, Any], lm):
+    if getattr(lm, "tokenizer", False):
+        tokenizer_info = {
+            "tokenizer_pad_token": [lm.tokenizer.pad_token, lm.tokenizer.pad_token_id],
+            "tokenizer_eos_token": [lm.tokenizer.eos_token, lm.tokenizer.eos_token_id],
+            "tokenizer_bos_token": [lm.tokenizer.bos_token, lm.tokenizer.bos_token_id],
+            "eot_token_id": getattr(lm, "eot_token_id", None),
+            "max_length": getattr(lm, "max_length", None),
+        }
+        storage.update(tokenizer_info)
+    # some backends (e.g. gguf and textsynth) do not seem to expose a tokenizer
+    else:
+        logger.debug(
+            "LM does not have a 'tokenizer' attribute, not logging tokenizer metadata to results."
+        )
--- a/lm_eval/models/anthropic_llms.py
+++ b/lm_eval/models/anthropic_llms.py
@@ -307,7 +307,7 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install
        # defaults to os.environ.get("ANTHROPIC_API_KEY")
        self.client = anthropic.Anthropic()
        self.temperature = temperature
-        self.max_token = max_tokens
+        self.max_tokens = max_tokens
        self.tokenizer = self.client.get_tokenizer()
        self.kwargs = kwargs

--- a/lm_eval/models/neuron_optimum.py
+++ b/lm_eval/models/neuron_optimum.py
@@ -288,7 +288,7 @@ class NEURON_HF(TemplateLM):
        self.vocab_size = self.tokenizer.vocab_size
        self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
-        self.add_bos_token = self.add_bos_token
+        self.add_bos_token = add_bos_token
        self._max_length = max_length

--- a/lm_eval/models/textsynth.py
+++ b/lm_eval/models/textsynth.py
-""" TextSynth API
+"""TextSynth API
 Implementation provided by Fabrice Bellard:
    https://github.com/EleutherAI/lm-evaluation-harness/issues/295
@@ -11,6 +11,7 @@ Example usage:
 Homepage: https://textsynth.com/index.html
 """
 import logging
 import os

--- a/lm_eval/models/vllm_causallms.py
+++ b/lm_eval/models/vllm_causallms.py
@@ -119,6 +119,12 @@ class VLLM(TemplateLM):
            tokenizer_revision=tokenizer_revision,
        )
        self.add_bos_token = add_bos_token
+        if "gemma" in pretrained.lower():
+            self.add_bos_token = True
+            eval_logger.info(
+                "Found 'gemma' in model name, a BOS token will be used as Gemma underperforms without it."
+            )
        self.custom_prefix_token_id = prefix_token_id
        if prefix_token_id is not None:
            eval_logger.info(
@@ -493,7 +499,10 @@ class VLLM(TemplateLM):
    def modify_gen_kwargs(kwargs: dict) -> dict:
        # sampling_params
        do_sample = kwargs.pop("do_sample", None)
-        if do_sample is False or "temperature" not in kwargs:
+        if do_sample is False and "temperature" not in kwargs:
+            eval_logger.debug(
+                "Got `do_sample=False` and no temperature value, setting VLLM temperature to 0.0 ..."
+            )
            kwargs["temperature"] = 0.0
        # hf defaults
        kwargs["skip_special_tokens"] = kwargs.get("skip_special_tokens", False)

--- a/lm_eval/tasks/README.md
+++ b/lm_eval/tasks/README.md
@@ -10,8 +10,8 @@
 | [aclue](aclue/README.md) | Tasks focusing on ancient Chinese language understanding and cultural aspects. | Ancient Chinese |
 | [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic |
 | [agieval](agieval/README.md) | Tasks involving historical data or questions related to history and historical texts. | English, Chinese |
-| [ammlu](ammlu/README.md) | Arabic version of MMLU. | Arabic |
 | [anli](anli/README.md) | Adversarial natural language inference tasks designed to test model robustness. | English |
+| [arabicmmlu](arabicmmlu/README.md) | Localized Arabic version of MMLU with multiple-choice questions from 40 subjects. | Arabic |
 | [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions.  | English |
 | [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English |
 | [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English |
@@ -20,11 +20,13 @@
 | [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. | English, German |
 | [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) |
 | benchmarks | General benchmarking tasks that test a wide range of language understanding capabilities. | |
+| [bertaqa](bertaqa/README.md) | Local Basque cultural trivia QA tests in English and Basque languages. | English, Basque, Basque (MT) |
 | [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple |
 | [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language model's linguistic capabilities. | English |
 | [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese |
 | [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese |
 | code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby |
+| [commonsense_qa](commonsense_qa/README.md) | CommonsenseQA, a multiple-choice QA dataset for measuring commonsense knowledge. | English |
 | [copal_id](copal_id/README.md) | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian |
 | [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English |
 | [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French |
@@ -71,6 +73,7 @@
 | okapi/mmlu_multilingual | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (34 languages) |
 | [okapi/truthfulqa_multilingual](okapi/truthfulqa_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) |
 | [openbookqa](openbookqa/README.md) | Open-book question answering tasks that require external knowledge and reasoning. | English |
+| [paloma](paloma/README.md) | Paloma is a comprehensive benchmark designed to evaluate open language models across a wide range of domains, ranging from niche artist communities to mental health forums on Reddit. | English |
 | [paws-x](paws-x/README.md) | Paraphrase Adversaries from Word Scrambling, focusing on cross-lingual capabilities. | English, French, Spanish, German, Chinese, Japanese, Korean |
 | [pile](pile/README.md) | Open source language modelling data set that consists of 22 smaller, high-quality datasets. | English |
 | [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English |

--- a/lm_eval/tasks/aclue/_generate_configs.py
+++ b/lm_eval/tasks/aclue/_generate_configs.py
 """
 Take in a YAML, and output all other splits with this YAML
 """
 import argparse
 import os

--- a/lm_eval/tasks/bbh/_generate_configs.py
+++ b/lm_eval/tasks/bbh/_generate_configs.py
 """
 Take in a YAML, and output all other splits with this YAML
 """
 import argparse
 import os
 import re

--- a/lm_eval/tasks/belebele/_generate_configs.py
+++ b/lm_eval/tasks/belebele/_generate_configs.py
 """
 Take in a YAML, and output all other splits with this YAML
 """
 import argparse
 import os

--- a/lm_eval/tasks/bigbench/push_bigbench_dataset.py
+++ b/lm_eval/tasks/bigbench/push_bigbench_dataset.py
@@ -8,6 +8,7 @@ Requires the installation of
 `pip install "bigbench @ https://storage.googleapis.com/public_research_data/bigbench/bigbench-0.0.1.tar.gz"`
 and is included so that the bigbench dependency can be avoided.
 """
 import bigbench.api.util as bb_utils
 import datasets
 from tqdm import tqdm

--- a/lm_eval/tasks/ceval/_generate_configs.py
+++ b/lm_eval/tasks/ceval/_generate_configs.py
 """
 Take in a YAML, and output all other splits with this YAML
 """
 import argparse
 import os

--- a/lm_eval/tasks/cmmlu/_generate_configs.py
+++ b/lm_eval/tasks/cmmlu/_generate_configs.py
 """
 Take in a YAML, and output all other splits with this YAML
 """
 import argparse
 import os

--- a/lm_eval/tasks/csatqa/_generate_configs.py
+++ b/lm_eval/tasks/csatqa/_generate_configs.py
 """
 Take in a YAML, and output all other splits with this YAML
 """
 import argparse
 import os

--- a/lm_eval/tasks/fda/task.py
+++ b/lm_eval/tasks/fda/task.py
-"""
-"""
 import re
 from typing import List

--- a/lm_eval/tasks/ifeval/instructions.py
+++ b/lm_eval/tasks/ifeval/instructions.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 """Library of instructions."""
 import collections
 import json
 import logging

--- a/lm_eval/tasks/ifeval/instructions_registry.py
+++ b/lm_eval/tasks/ifeval/instructions_registry.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 """Registry of all instructions."""
 from lm_eval.tasks.ifeval import instructions

--- a/lm_eval/tasks/mmlu/_generate_configs.py
+++ b/lm_eval/tasks/mmlu/_generate_configs.py
 """
 Take in a YAML, and output all "other" splits with this YAML
 """
 import argparse
 import logging
 import os

--- a/lm_eval/tasks/piqa/piqa.yaml
+++ b/lm_eval/tasks/piqa/piqa.yaml
@@ -19,3 +19,5 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 1.0
+dataset_kwargs:
+  trust_remote_code: true