Commit 90ad5db7 authored by lintangsutawika
Browse files

merged main

parents f692caa9 b177c82c
......@@ -3,5 +3,5 @@ task: truthfulqa_te_mc2
dataset_path: alexandrainst/m_truthfulqa
dataset_name: te
training_split: null
validation_split: validation
validation_split: val
test_split: null
......@@ -3,5 +3,5 @@ task: truthfulqa_uk_mc1
dataset_path: alexandrainst/m_truthfulqa
dataset_name: uk
training_split: null
validation_split: validation
validation_split: val
test_split: null
......@@ -3,5 +3,5 @@ task: truthfulqa_uk_mc2
dataset_path: alexandrainst/m_truthfulqa
dataset_name: uk
training_split: null
validation_split: validation
validation_split: val
test_split: null
......@@ -3,5 +3,5 @@ task: truthfulqa_vi_mc1
dataset_path: alexandrainst/m_truthfulqa
dataset_name: vi
training_split: null
validation_split: validation
validation_split: val
test_split: null
......@@ -3,5 +3,5 @@ task: truthfulqa_vi_mc2
dataset_path: alexandrainst/m_truthfulqa
dataset_name: vi
training_split: null
validation_split: validation
validation_split: val
test_split: null
......@@ -3,5 +3,5 @@ task: truthfulqa_zh_mc1
dataset_path: alexandrainst/m_truthfulqa
dataset_name: zh
training_split: null
validation_split: validation
validation_split: val
test_split: null
......@@ -3,5 +3,5 @@ task: truthfulqa_zh_mc2
dataset_path: alexandrainst/m_truthfulqa
dataset_name: zh
training_split: null
validation_split: validation
validation_split: val
test_split: null
import datasets
import re
import datasets
import numpy as np
QA_PROMPT = (
"Q: What is human life expectancy in the United States?\n"
"A: Human life expectancy in the United States is 78 years.\n\n"
......@@ -17,6 +19,7 @@ QA_PROMPT = (
"A: The 1992 Olympics were held in Barcelona, Spain."
)
def preprocess(text):
if text is None:
return " "
......
import re
import string
from collections import Counter
......
from datasets import Dataset
from functools import partial
from datasets import Dataset
def process_docs(dataset, set_answer_type="bool"):
FEATURES = ["title", "abstract", "question", "answer", "answer_type"]
......
import os
import json
import requests
import os
import numpy as np
import requests
from lm_eval.utils import eval_logger
......
import re
from abc import abstractmethod
from functools import reduce
import numpy as np
import transformers.data.metrics.squad_metrics as squad_metrics
from abc import abstractmethod
from datasets import load_metric
from transformers import AutoTokenizer
from functools import reduce
from lm_eval.api.task import Task
from lm_eval.api.metrics import mean
from lm_eval.api.instance import Instance
from lm_eval.api.registry import register_task
from lm_eval.api.metrics import mean
from lm_eval.api.task import Task
_CITATION = """
@inproceedings{shaham-etal-2022-scrolls,
......@@ -44,6 +44,7 @@ _CITATION = """
def _download_metric():
import os
import shutil
from huggingface_hub import hf_hub_download
scrolls_metric_path = hf_hub_download(
......@@ -148,7 +149,7 @@ class _SCROLLSTask(Task):
del self.dataset["test"]
for split in self.dataset:
self.dataset[split] = _drop_duplicates_in_input(self.dataset[split])
if self.PRUNE_TOKENIZERS is not None and self.PRUNE_TOKENIZERS is not None:
if self.PRUNE_TOKENIZERS is not None:
self.prune()
def _get_prune_text(self, sample):
......
......@@ -13,14 +13,15 @@ also determine when no answer is supported by the paragraph and abstain from ans
Homepage: https://rajpurkar.github.io/SQuAD-explorer/
"""
import datasets
from math import exp
from functools import partial
from math import exp
import datasets
from packaging import version
from lm_eval.api.task import ConfigurableTask
from lm_eval.api.instance import Instance
from lm_eval.api.task import ConfigurableTask
_CITATION = """
@misc{rajpurkar2018know,
......@@ -35,7 +36,6 @@ _CITATION = """
def _squad_metric(predictions, references):
    """Score ``predictions`` against ``references`` with the ``squad_v2`` metric.

    The metric object is loaded through ``datasets.load_metric`` on each call,
    and whatever its ``compute`` method returns is handed back unchanged.
    """
    metric = datasets.load_metric("squad_v2")
    scores = metric.compute(predictions=predictions, references=references)
    return scores
......@@ -52,7 +52,7 @@ class SQuAD2(ConfigurableTask):
DATASET_NAME = None
def __init__(self):
super().__init__(config={'metadata': {'version': self.VERSION}})
super().__init__(config={"metadata": {"version": self.VERSION}})
# HF changed squad on us so we have to make sure we aren't running the old one
assert version.parse(datasets.__version__) >= version.parse(
......
import sklearn
import numpy as np
import sklearn
def cb_multi_fi(items):
......
import collections
import re
import string
import collections
import numpy as np
import numpy as np
from datasets import Dataset
from lm_eval.api.metrics import metric_max_over_ground_truths
......
import re
from typing import List
def doc_to_text(x):
    """Build the ``"wsc: ..."`` prompt string for one document.

    Every ``" X "`` placeholder in the string returned by ``_wsc_inputs(x)``
    is replaced by ``x["span2_text"]`` wrapped in asterisks, and the result
    is prefixed with ``"wsc: "``.
    """
    highlighted = " *" + x["span2_text"] + "* "
    # Use a callable replacement so ``re.sub`` inserts the span text
    # literally. A plain string replacement has its backslash escapes and
    # group references processed, which would corrupt (or raise on) span
    # text that happens to contain a backslash.
    text = re.sub(r" X ", lambda _match: highlighted, _wsc_inputs(x))
    return "wsc: " + text
......@@ -23,14 +24,14 @@ def _wsc_inputs(x):
[
" ".join(words[:pronoun_index]),
"X",
" ".join(words[pronoun_index + 1:]),
" ".join(words[pronoun_index + 1 :]),
]
)
# Handle some special cases.
if (
x["text"]
== 'The boy continued to whip the pony , and eventually the pony threw him over. John laughed out quite loud. "Good for him," he said. '
x["text"]
== 'The boy continued to whip the pony , and eventually the pony threw him over. John laughed out quite loud. "Good for him," he said. '
):
return (
"The boy continued to whip the pony , and eventually the pony threw "
......@@ -39,8 +40,8 @@ def _wsc_inputs(x):
# Using the span2_index, we get 'use' instead of 'it'.
if (
x["text"]
== "When they had eventually calmed down a bit , and had gotten home, Mr. Farley put the magic pebble in an iron safe . Some day they might want to use it , but really for now, what more could they wish for?"
x["text"]
== "When they had eventually calmed down a bit , and had gotten home, Mr. Farley put the magic pebble in an iron safe . Some day they might want to use it , but really for now, what more could they wish for?"
):
return (
"When they had eventually calmed down a bit , and had gotten home, "
......
import datasets
import sacrebleu
import numpy as np
import sacrebleu
from rouge_score import rouge_scorer, scoring
......
......@@ -51,7 +51,9 @@ def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
for lang in LANGUAGES:
file_name = f"xwinograd_{lang}.yaml"
try:
with open(f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf-8") as f:
with open(
f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf-8"
) as f:
f.write("# Generated by utils.py\n")
yaml.dump(
{
......
......@@ -5,16 +5,9 @@ import importlib.util
import inspect
import logging
import os
import pathlib
import re
import subprocess
import sys
from itertools import islice
from typing import (
Any,
Callable,
List,
)
from typing import Any, Callable, List
import numpy as np
import yaml
......@@ -249,7 +242,7 @@ def make_table(result_dict, column: str = "results"):
values = []
for k, dic in result_dict[column].items():
version = result_dict["versions"][k]
version = result_dict["versions"].get(k, "N/A")
n = str(result_dict["n-shot"][k])
if "alias" in dic:
......@@ -297,61 +290,6 @@ def positional_deprecated(fn):
return _wrapper
@positional_deprecated
def find_test_root(start_path: pathlib.Path) -> pathlib.Path:
    """
    Search upward in the directory tree to a maximum of three layers
    to find and return the package root (containing the 'tests' folder).

    A directory counts as the package root when it contains
    ``tests/test_version_stable.py``. ``start_path`` itself is the first
    layer that is checked.

    :param start_path: path from which the upward search begins
    :return: the resolved path of the first matching directory
    :raises FileNotFoundError: if none of the inspected layers matches
    """
    max_layers = 3
    cur_path = start_path.resolve()
    for _ in range(max_layers):
        if (cur_path / "tests" / "test_version_stable.py").exists():
            return cur_path
        cur_path = cur_path.parent.resolve()
    # A single f-string keeps the words separated; the previous two-part
    # concatenation rendered as "upwardsof".
    raise FileNotFoundError(
        f"Unable to find package root within {max_layers} layers upwards of {start_path}"
    )
@positional_deprecated
def run_task_tests(task_list: List[str]):
    """
    Find the package root and run the tests for the given tasks.

    The task names are joined with " or " and handed to pytest as the ``-k``
    expression for ``tests/test_version_stable.py``. The package root is
    appended to ``sys.path`` before pytest is invoked.

    :param task_list: names of the tasks whose tests should run
    :raises ValueError: if ``pytest.main`` returns a truthy value
    """
    import pytest

    root = find_test_root(start_path=pathlib.Path(__file__))
    keyword_expr = " or ".join(task_list)
    sys.path.append(str(root))
    exit_code = pytest.main(
        [
            f"{root}/tests/test_version_stable.py",
            f"--rootdir={root}",
            "-k",
            keyword_expr,
        ]
    )
    if not exit_code:
        return
    raise ValueError(
        f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {exit_code}"
    )
def get_git_commit_hash():
    """
    Gets the git commit hash of your current repo (if it exists).
    Source: https://github.com/EleutherAI/gpt-neox/blob/b608043be541602170bfcfb8ec9bf85e8a0799e0/megatron/neox_arguments/neox_args.py#L42

    :return: the stripped, decoded output of ``git describe --always``, or
        ``None`` when the command fails or git is not installed
    """
    try:
        git_hash = subprocess.check_output(["git", "describe", "--always"]).strip()
        git_hash = git_hash.decode()
    except (subprocess.CalledProcessError, FileNotFoundError):
        # The exception types must be given as a tuple: ``A or B`` evaluates
        # to just ``A``, which let FileNotFoundError escape.
        # FileNotFoundError occurs when git not installed on system
        git_hash = None
    return git_hash
def ignore_constructor(loader, node):
    """Return ``node`` untouched; ``loader`` is accepted but never used."""
    return node
......@@ -433,16 +371,10 @@ def apply_template(template: str, doc: dict) -> str:
return rtemplate.render(**doc)
def create_iterator(raw_iterator, *, rank=0, world_size=1, limit=None):
    """
    Method for creating a (potentially) sliced and limited
    iterator from a raw document iterator. Used for splitting data
    among ranks in multigpu setting or only pulling a sample of documents

    :param raw_iterator: iterable to draw documents from
    :param rank: index of the first document to yield
    :param world_size: step between yielded documents
    :param limit: stop index into ``raw_iterator`` (``None`` means no bound);
        it bounds the position in the raw iterator, not the number of
        documents yielded
    :return: an ``itertools.islice`` over ``raw_iterator``
    """
    return islice(raw_iterator, rank, limit, world_size)
# Multi-token stopping criteria
# from more_itertools
......@@ -36,6 +36,7 @@ dependencies = [
"tqdm-multiprocess",
"transformers>=4.1",
"zstandard",
"dill",
"word2number",
]
......@@ -71,6 +72,7 @@ sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
testing = ["pytest", "pytest-cov", "pytest-xdist"]
vllm = ["vllm<=0.2.5"]
zeno = ["pandas", "zeno-client"]
wandb = ["wandb>=0.16.3", "pandas", "numpy"]
all = [
"lm_eval[anthropic]",
"lm_eval[dev]",
......@@ -86,11 +88,9 @@ all = [
"lm_eval[testing]",
"lm_eval[vllm]",
"lm_eval[zeno]",
"lm_eval[wandb]",
]
[tool.ruff]
extend-exclude = ["lm_eval/tasks/*.py"]
[tool.ruff.lint]
extend-select = ["I"]
......@@ -99,5 +99,4 @@ lines-after-imports = 2
known-first-party = ["lm_eval"]
[tool.ruff.extend-per-file-ignores]
"__init__.py" = ["F401","F402","F403","I"]
"lm_eval/tasks/*"= ["E721"]
"__init__.py" = ["F401","F402","F403"]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment