Commit 90ad5db7 authored by lintangsutawika

merged main

parents f692caa9 b177c82c
@@ -3,5 +3,5 @@ task: truthfulqa_te_mc2
 dataset_path: alexandrainst/m_truthfulqa
 dataset_name: te
 training_split: null
-validation_split: validation
+validation_split: val
 test_split: null
@@ -3,5 +3,5 @@ task: truthfulqa_uk_mc1
 dataset_path: alexandrainst/m_truthfulqa
 dataset_name: uk
 training_split: null
-validation_split: validation
+validation_split: val
 test_split: null
@@ -3,5 +3,5 @@ task: truthfulqa_uk_mc2
 dataset_path: alexandrainst/m_truthfulqa
 dataset_name: uk
 training_split: null
-validation_split: validation
+validation_split: val
 test_split: null
@@ -3,5 +3,5 @@ task: truthfulqa_vi_mc1
 dataset_path: alexandrainst/m_truthfulqa
 dataset_name: vi
 training_split: null
-validation_split: validation
+validation_split: val
 test_split: null
@@ -3,5 +3,5 @@ task: truthfulqa_vi_mc2
 dataset_path: alexandrainst/m_truthfulqa
 dataset_name: vi
 training_split: null
-validation_split: validation
+validation_split: val
 test_split: null
@@ -3,5 +3,5 @@ task: truthfulqa_zh_mc1
 dataset_path: alexandrainst/m_truthfulqa
 dataset_name: zh
 training_split: null
-validation_split: validation
+validation_split: val
 test_split: null
@@ -3,5 +3,5 @@ task: truthfulqa_zh_mc2
 dataset_path: alexandrainst/m_truthfulqa
 dataset_name: zh
 training_split: null
-validation_split: validation
+validation_split: val
 test_split: null
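Note on the split rename: these configs now point at a split named `val`, which suggests the `alexandrainst/m_truthfulqa` dataset publishes its validation data under that name rather than `validation`. A minimal sketch for double-checking a dataset's actual split names (the `zh` config is used as an example):

```python
# Sketch: list the splits a Hugging Face dataset actually exposes.
from datasets import get_dataset_split_names

# Dataset path and config name taken from the YAML configs above.
print(get_dataset_split_names("alexandrainst/m_truthfulqa", "zh"))
# Expected to include "val" rather than "validation".
```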
-import datasets
 import re
+import datasets
 import numpy as np
 QA_PROMPT = (
     "Q: What is human life expectancy in the United States?\n"
     "A: Human life expectancy in the United States is 78 years.\n\n"
@@ -17,6 +19,7 @@ QA_PROMPT = (
     "A: The 1992 Olympics were held in Barcelona, Spain."
 )
 def preprocess(text):
     if text is None:
         return " "
...
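The import reshuffles in this file and several below match ruff's import-sorting ("I") rules, visible as context in the pyproject.toml hunk at the end of this diff (`extend-select = ["I"]`, `known-first-party = ["lm_eval"]`). Under those rules imports are grouped standard library first, third-party second, first-party last, alphabetized within each group; roughly:

```python
# Sketch of the target import layout under ruff's "I" (isort) rules.
import re  # standard library

import datasets  # third-party, alphabetized
import numpy as np

from lm_eval.api.metrics import mean  # first-party (known-first-party = ["lm_eval"])
```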
 import re
 import string
 from collections import Counter
...
-from datasets import Dataset
 from functools import partial
+from datasets import Dataset
 def process_docs(dataset, set_answer_type="bool"):
     FEATURES = ["title", "abstract", "question", "answer", "answer_type"]
...
-import os
 import json
-import requests
+import os
 import numpy as np
+import requests
 from lm_eval.utils import eval_logger
...
 import re
+from abc import abstractmethod
+from functools import reduce
 import numpy as np
 import transformers.data.metrics.squad_metrics as squad_metrics
-from abc import abstractmethod
 from datasets import load_metric
 from transformers import AutoTokenizer
-from functools import reduce
-from lm_eval.api.task import Task
-from lm_eval.api.metrics import mean
 from lm_eval.api.instance import Instance
-from lm_eval.api.registry import register_task
+from lm_eval.api.metrics import mean
+from lm_eval.api.task import Task
 _CITATION = """
 @inproceedings{shaham-etal-2022-scrolls,
@@ -44,6 +44,7 @@ _CITATION = """
 def _download_metric():
     import os
     import shutil
     from huggingface_hub import hf_hub_download
     scrolls_metric_path = hf_hub_download(
@@ -148,7 +149,7 @@ class _SCROLLSTask(Task):
             del self.dataset["test"]
         for split in self.dataset:
             self.dataset[split] = _drop_duplicates_in_input(self.dataset[split])
-        if self.PRUNE_TOKENIZERS is not None and self.PRUNE_TOKENIZERS is not None:
+        if self.PRUNE_TOKENIZERS is not None:
             self.prune()
     def _get_prune_text(self, sample):
...
@@ -13,14 +13,15 @@ also determine when no answer is supported by the paragraph and abstain from ans
 Homepage: https://rajpurkar.github.io/SQuAD-explorer/
 """
-import datasets
-from math import exp
 from functools import partial
+from math import exp
+import datasets
 from packaging import version
-from lm_eval.api.task import ConfigurableTask
 from lm_eval.api.instance import Instance
+from lm_eval.api.task import ConfigurableTask
 _CITATION = """
 @misc{rajpurkar2018know,
@@ -35,7 +36,6 @@ _CITATION = """
 def _squad_metric(predictions, references):
-    # squad_metric = load("squad_v2")
     squad_metric = datasets.load_metric("squad_v2")
     return squad_metric.compute(predictions=predictions, references=references)
@@ -52,7 +52,7 @@ class SQuAD2(ConfigurableTask):
     DATASET_NAME = None
     def __init__(self):
-        super().__init__(config={'metadata': {'version': self.VERSION}})
+        super().__init__(config={"metadata": {"version": self.VERSION}})
         # HF changed squad on us so we have to make sure we aren't running the old one
         assert version.parse(datasets.__version__) >= version.parse(
...
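For context on `_squad_metric`: the `squad_v2` metric consumes SQuAD-v2-style records and reports, among other fields, aggregate `exact` and `f1` scores. A small sketch with made-up ids and texts:

```python
# Hypothetical records in the squad_v2 metric's expected shape.
predictions = [
    {"id": "q1", "prediction_text": "78 years", "no_answer_probability": 0.0}
]
references = [
    {"id": "q1", "answers": {"text": ["78 years"], "answer_start": [0]}}
]
scores = _squad_metric(predictions=predictions, references=references)
print(scores["exact"], scores["f1"])  # aggregate exact-match and F1
```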
-import sklearn
 import numpy as np
+import sklearn
 def cb_multi_fi(items):
...
+import collections
 import re
 import string
-import collections
-import numpy as np
+import numpy as np
 from datasets import Dataset
 from lm_eval.api.metrics import metric_max_over_ground_truths
...
 import re
 from typing import List
 def doc_to_text(x):
     text = re.sub(r" X ", " *" + x["span2_text"] + "* ", _wsc_inputs(x))
     return "wsc: " + text
@@ -23,14 +24,14 @@ def _wsc_inputs(x):
         [
             " ".join(words[:pronoun_index]),
             "X",
-            " ".join(words[pronoun_index + 1:]),
+            " ".join(words[pronoun_index + 1 :]),
         ]
     )
     # Handle some special cases.
     if (
         x["text"]
         == 'The boy continued to whip the pony , and eventually the pony threw him over. John laughed out quite loud. "Good for him," he said. '
     ):
         return (
             "The boy continued to whip the pony , and eventually the pony threw "
@@ -39,8 +40,8 @@ def _wsc_inputs(x):
     # Using the span2_index, we get 'use' instead of 'it'.
     if (
         x["text"]
         == "When they had eventually calmed down a bit , and had gotten home, Mr. Farley put the magic pebble in an iron safe . Some day they might want to use it , but really for now, what more could they wish for?"
     ):
         return (
             "When they had eventually calmed down a bit , and had gotten home, "
...
 import datasets
-import sacrebleu
 import numpy as np
+import sacrebleu
 from rouge_score import rouge_scorer, scoring
...
@@ -51,7 +51,9 @@ def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
     for lang in LANGUAGES:
         file_name = f"xwinograd_{lang}.yaml"
         try:
-            with open(f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf-8") as f:
+            with open(
+                f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf-8"
+            ) as f:
                 f.write("# Generated by utils.py\n")
                 yaml.dump(
                     {
...
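The reflowed `open(...)` call is behavior-preserving: mode `"w"` truncates an existing file, while `"x"` creates exclusively and raises `FileExistsError` if the target exists, which is how the `overwrite` flag refuses to clobber generated YAMLs. A standalone sketch (file name made up):

```python
# "w" overwrites; "x" fails when the file already exists.
def write_file(path: str, text: str, overwrite: bool) -> None:
    with open(path, "w" if overwrite else "x", encoding="utf-8") as f:
        f.write(text)

write_file("demo.yaml", "# Generated by utils.py\n", overwrite=True)
try:
    write_file("demo.yaml", "# again\n", overwrite=False)
except FileExistsError:
    print("refusing to overwrite demo.yaml")
```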
@@ -5,16 +5,9 @@ import importlib.util
 import inspect
 import logging
 import os
-import pathlib
 import re
-import subprocess
-import sys
 from itertools import islice
-from typing import (
-    Any,
-    Callable,
-    List,
-)
+from typing import Any, Callable, List
 import numpy as np
 import yaml
@@ -249,7 +242,7 @@ def make_table(result_dict, column: str = "results"):
     values = []
     for k, dic in result_dict[column].items():
-        version = result_dict["versions"][k]
+        version = result_dict["versions"].get(k, "N/A")
         n = str(result_dict["n-shot"][k])
         if "alias" in dic:
@@ -297,61 +290,6 @@ def positional_deprecated(fn):
     return _wrapper
-@positional_deprecated
-def find_test_root(start_path: pathlib.Path) -> pathlib.Path:
-    """
-    Search upward in the directory tree to a maximum of three layers
-    to find and return the package root (containing the 'tests' folder)
-    """
-    cur_path = start_path.resolve()
-    max_layers = 3
-    for _ in range(max_layers):
-        if (cur_path / "tests" / "test_version_stable.py").exists():
-            return cur_path
-        else:
-            cur_path = cur_path.parent.resolve()
-    raise FileNotFoundError(
-        f"Unable to find package root within {max_layers} upwards" + f"of {start_path}"
-    )
-@positional_deprecated
-def run_task_tests(task_list: List[str]):
-    """
-    Find the package root and run the tests for the given tasks
-    """
-    import pytest
-    package_root = find_test_root(start_path=pathlib.Path(__file__))
-    task_string = " or ".join(task_list)
-    args = [
-        f"{package_root}/tests/test_version_stable.py",
-        f"--rootdir={package_root}",
-        "-k",
-        f"{task_string}",
-    ]
-    sys.path.append(str(package_root))
-    pytest_return_val = pytest.main(args)
-    if pytest_return_val:
-        raise ValueError(
-            f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}"
-        )
-def get_git_commit_hash():
-    """
-    Gets the git commit hash of your current repo (if it exists).
-    Source: https://github.com/EleutherAI/gpt-neox/blob/b608043be541602170bfcfb8ec9bf85e8a0799e0/megatron/neox_arguments/neox_args.py#L42
-    """
-    try:
-        git_hash = subprocess.check_output(["git", "describe", "--always"]).strip()
-        git_hash = git_hash.decode()
-    except subprocess.CalledProcessError or FileNotFoundError:
-        # FileNotFoundError occurs when git not installed on system
-        git_hash = None
-    return git_hash
 def ignore_constructor(loader, node):
     return node
@@ -433,16 +371,10 @@ def apply_template(template: str, doc: dict) -> str:
     return rtemplate.render(**doc)
-def create_iterator(raw_iterator, rank, world_size, limit=None):
+def create_iterator(raw_iterator, *, rank=0, world_size=1, limit=None):
     """
     Method for creating a (potentially) sliced and limited
     iterator from a raw document iterator. Used for splitting data
     among ranks in multigpu setting or only pulling a sample of documents
     """
     return islice(raw_iterator, rank, limit, world_size)
-# Multi-token stopping criteria
-# from more_itertools
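The new signature makes `rank` and `world_size` keyword-only with single-process defaults. The body is unchanged: `islice(iterable, start, stop, step)` starts at index `rank` and steps by `world_size`, dealing documents round-robin so each rank sees a disjoint subset; note that `limit` acts as a stop index over the underlying stream, not a per-rank count. A small worked example:

```python
from itertools import islice

docs = ["d0", "d1", "d2", "d3", "d4", "d5"]

# Two ranks splitting one stream round-robin:
print(list(islice(iter(docs), 0, None, 2)))  # rank 0 -> ['d0', 'd2', 'd4']
print(list(islice(iter(docs), 1, None, 2)))  # rank 1 -> ['d1', 'd3', 'd5']

# With limit=4, iteration stops at the 4th document of the stream:
print(list(islice(iter(docs), 0, 4, 2)))     # rank 0 -> ['d0', 'd2']
```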
@@ -36,6 +36,7 @@ dependencies = [
     "tqdm-multiprocess",
     "transformers>=4.1",
     "zstandard",
+    "dill",
     "word2number",
 ]
@@ -71,6 +72,7 @@ sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
 testing = ["pytest", "pytest-cov", "pytest-xdist"]
 vllm = ["vllm<=0.2.5"]
 zeno = ["pandas", "zeno-client"]
+wandb = ["wandb>=0.16.3", "pandas", "numpy"]
 all = [
     "lm_eval[anthropic]",
     "lm_eval[dev]",
@@ -86,11 +88,9 @@ all = [
     "lm_eval[testing]",
     "lm_eval[vllm]",
     "lm_eval[zeno]",
+    "lm_eval[wandb]",
 ]
-[tool.ruff]
-extend-exclude = ["lm_eval/tasks/*.py"]
 [tool.ruff.lint]
 extend-select = ["I"]
@@ -99,5 +99,4 @@ lines-after-imports = 2
 known-first-party = ["lm_eval"]
 [tool.ruff.extend-per-file-ignores]
-"__init__.py" = ["F401","F402","F403","I"]
+"__init__.py" = ["F401","F402","F403"]
-"lm_eval/tasks/*"= ["E721"]