Commit 8d59330b authored by lintangsutawika

resolved merge conflict

parents 110e5a28 d4a913c4
import json
from importlib.util import find_spec
from pathlib import Path
from lm_eval import utils
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
eval_logger = utils.eval_logger
@register_model("openvino")
class OptimumLM(HFLM):
"""
Optimum Intel provides a simple interface to optimize Transformer models and convert them to \
OpenVINO™ Intermediate Representation (IR) format to accelerate end-to-end pipelines on \
Intel® architectures using OpenVINO™ runtime.
To use an OpenVINO config, use `--model_args ov_config` to point to a json file with an OpenVINO config:
`lm_eval --model openvino --model_args pretrained=gpt2,ov_config=config.json --task lambada_openai`
Example json file contents: {"INFERENCE_PRECISION_HINT": "f32", "CACHE_DIR": "model_cache"}
"""
def __init__(
......@@ -48,16 +57,25 @@ class OptimumLM(HFLM):
from optimum.intel.openvino import OVModelForCausalLM
model_kwargs = kwargs if kwargs else {}
if "ov_config" in model_kwargs:
if not Path(model_kwargs["ov_config"]).exists():
raise ValueError(
"ov_config should point to a .json file containing an OpenVINO config"
)
with open(model_kwargs["ov_config"]) as f:
model_kwargs["ov_config"] = json.load(f)
eval_logger.info(
f"Using custom OpenVINO config: {model_kwargs['ov_config']}"
)
else:
model_kwargs["ov_config"] = {}
model_kwargs["ov_config"].setdefault("CACHE_DIR", "")
model_file = Path(pretrained) / "openvino_model.xml"
if model_file.exists():
export = False
else:
export = True
kwargs["ov_config"] = {
"PERFORMANCE_HINT": "LATENCY",
"NUM_STREAMS": "1",
"CACHE_DIR": "",
}
self._model = OVModelForCausalLM.from_pretrained(
pretrained,
......
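# Illustrative sketch (not part of this diff): what a custom OpenVINO config file can
# contain and how the loader above treats it. The file name and the non-CACHE_DIR keys
# are example values taken from the class docstring, not requirements.
import json
from pathlib import Path

ov_config_path = Path("config.json")  # e.g. {"INFERENCE_PRECISION_HINT": "f32", "CACHE_DIR": "model_cache"}
if not ov_config_path.exists():
    raise ValueError("ov_config should point to a .json file containing an OpenVINO config")
with open(ov_config_path) as f:
    ov_config = json.load(f)
ov_config.setdefault("CACHE_DIR", "")  # same default OptimumLM.__init__ applies when the key is absent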
......@@ -21,10 +21,14 @@ from lm_eval.utils import (
try:
import ray
from vllm import LLM, SamplingParams
if parse_version(version("vllm")) > parse_version("0.3.0"):
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer import get_tokenizer
except ModuleNotFoundError:
pass
eval_logger = eval_logger
......@@ -34,7 +38,7 @@ class VLLM(TemplateLM):
def __init__(
self,
pretrained="gpt2",
pretrained: str,
dtype: Literal["float16", "bfloat16", "float32", "auto"] = "auto",
revision: Optional[str] = None,
trust_remote_code: Optional[bool] = False,
......@@ -55,6 +59,7 @@ class VLLM(TemplateLM):
gpu_memory_utilization: float = 0.9,
device: str = "cuda",
data_parallel_size: int = 1,
lora_local_path: str = None,
**kwargs,
):
super().__init__()
......@@ -127,6 +132,14 @@ class VLLM(TemplateLM):
self._max_gen_toks = max_gen_toks
if lora_local_path is not None:
assert parse_version(version("vllm")) > parse_version(
"0.3.0"
), "lora adapters only compatible with vllm > v0.3.0."
self.lora_request = LoRARequest("finetuned", 1, lora_local_path)
else:
self.lora_request = None
@property
def eot_token_id(self):
# we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
......@@ -223,6 +236,14 @@ class VLLM(TemplateLM):
# flatten results
return undistribute(results)
if self.lora_request is not None:
outputs = self.model.generate(
prompt_token_ids=requests,
sampling_params=sampling_params,
use_tqdm=True if self.batch_size == "auto" else False,
lora_request=self.lora_request,
)
else:
outputs = self.model.generate(
prompt_token_ids=requests,
sampling_params=sampling_params,
......
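# Illustrative sketch (not part of this diff): how the new `lora_local_path` argument is
# intended to be used. The model name and adapter path are hypothetical, and the module
# path is assumed from this repository's layout; vllm > 0.3.0 is required, as asserted above.
from lm_eval.models.vllm_causallms import VLLM

lm = VLLM(
    pretrained="meta-llama/Llama-2-7b-hf",    # example base model
    lora_local_path="/path/to/lora_adapter",  # directory holding the LoRA adapter weights
)
# Subsequent generate_until calls pass LoRARequest("finetuned", 1, lora_local_path)
# to vllm's LLM.generate, as in the branch shown above.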
# FDA
### Paper
Title: Language Models Enable Simple Systems for Generating Structured Views of Heterogeneous Data Lakes
Abstract: A long-standing goal of the data management community is to develop general, automated systems that ingest semi-structured documents and output queryable tables without human effort or domain-specific customization. Given the sheer variety of potential documents, state-of-the-art systems make simplifying assumptions and use domain-specific training. In this work, we ask whether we can maintain generality by using large language models (LLMs). LLMs, which are pretrained on broad data, can perform diverse downstream tasks simply conditioned on natural language task descriptions. We propose and evaluate EVAPORATE, a simple, prototype system powered by LLMs. We identify two fundamentally different strategies for implementing this system: prompt the LLM to directly extract values from documents or prompt the LLM to synthesize code that performs the extraction. Our evaluations show a cost-quality tradeoff between these two approaches. Code synthesis is cheap, but far less accurate than directly processing each document with the LLM. To improve quality while maintaining low cost, we propose an extended code synthesis implementation, EVAPORATE-CODE+, which achieves better quality than direct extraction. Our key insight is to generate many candidate functions and ensemble their extractions using weak supervision. EVAPORATE-CODE+ not only outperforms the state-of-the-art systems, but does so using a sublinear pass over the documents with the LLM. This equates to a 110× reduction in the number of tokens the LLM needs to process, averaged across 16 real-world evaluation settings of 10k documents each.
A task for evaluating LMs on information extraction, as implemented in the Based evaluation harness.
Homepage: https://github.com/HazyResearch/based-evaluation-harness
Description:
> FDA (Information Extraction). The task is to extract key-value pairs from a set of PDFs scraped from the FDA website. We use the dataset and labels collected in Arora et al. 2023. We break apart the documents into chunks of 1,920 tokens. For every key-value pair that appears in the chunk, we create a zero-shot prompt using the simple prompt template: {chunk} \n {key}: We allow the model to generate a fixed number of tokens after the prompt and check (with case insensitivity) if the value is contained within the generation. We report accuracy, the fraction of prompts for which the generation contains the value.
### Citation
```
@misc{arora2024simple,
title={Simple linear attention language models balance the recall-throughput tradeoff},
author={Simran Arora and Sabri Eyuboglu and Michael Zhang and Aman Timalsina and Silas Alberti and Dylan Zinsley and James Zou and Atri Rudra and Christopher Ré},
year={2024},
eprint={2402.18668},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{arora2023language,
title={Language Models Enable Simple Systems for Generating Structured Views of Heterogeneous Data Lakes},
author={Simran Arora and Brandon Yang and Sabri Eyuboglu and Avanika Narayan and Andrew Hojel and Immanuel Trummer and Christopher Ré},
year={2023},
eprint={2304.09433},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
### Groups and Tasks
#### Tasks
* `fda`: the FDA task as implemented in the paper "Simple linear attention language models balance the recall-throughput tradeoff". Designed for zero-shot evaluation of small LMs.
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
task: fda
class: !function task.FDA
"""
"""
import re
from typing import List
import numpy as np
from lm_eval.api.instance import Instance
from lm_eval.api.task import ConfigurableTask
class FDA(ConfigurableTask):
VERSION = 0
DATASET_PATH = "hazyresearch/based-fda"
DATASET_NAME = "default"
def __init__(self):
super().__init__(config={"metadata": {"version": self.VERSION}})
def has_training_docs(self):
return False
def has_validation_docs(self):
return True
def has_test_docs(self):
return False
def validation_docs(self):
return self.dataset["validation"]
def doc_to_text(self, doc):
return doc["text"]
def doc_to_target(self, doc):
return doc["value"]
def construct_requests(self, doc, ctx, **kwargs):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
return [
Instance(
request_type="generate_until",
doc=doc,
arguments=(ctx, {"until": ["\n"], "max_gen_toks": 48}),
idx=0,
**kwargs,
)
]
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# continuation, (logprob_unanswerable, _) = results
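# `results` is the list of generated continuations for this doc's single
# generate_until request; score the first (and only) generation below.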
continuation = results
return {"contains": contains_score(continuation[0], [doc["value"]])}
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {
"contains": np.mean, # Exact match (the normalized answer exactly match the gold answer)
}
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {
"contains": True, # Exact match (the normalized answer exactly match the gold answer
}
def contains_score(prediction: str, labels: List[str]):
return max(
int(bool(re.search(re.compile(re.escape(label), re.IGNORECASE), prediction)))
for label in labels
)
# MATH
## Paper
Measuring Mathematical Problem Solving With the MATH Dataset
https://arxiv.org/abs/2103.03874
Many intellectual endeavors require mathematical problem solving, but this skill remains beyond the capabilities of computers. To measure this ability in machine learning models, we introduce MATH, a new dataset of 12,500 challenging competition mathematics problems. Each problem in MATH has a full step-by-step solution which can be used to teach models to generate answer derivations and explanations.
NOTE: This task corresponds to the MATH (`hendrycks_math`) implementation at https://github.com/EleutherAI/lm-evaluation-harness/tree/master. For the variant which uses the custom 4-shot prompt from the Minerva paper (https://arxiv.org/abs/2206.14858) and SymPy answer checking as done by Minerva, see `lm_eval/tasks/minerva_math`.
Homepage: https://github.com/hendrycks/math
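For reference, a short sketch of the answer extraction and comparison this variant performs, using the helpers defined in this task's `utils.py` (the import path and example strings are illustrative):

```
# illustrative import path; the helpers below are defined in this task's utils.py
from utils import is_equiv, last_boxed_only_string, remove_boxed

solution = r"Combining the terms gives $\boxed{\frac{1}{2}}$."  # hypothetical gold solution
gold = remove_boxed(last_boxed_only_string(solution))           # -> "\frac{1}{2}"

# strip_string-based normalization lets superficially different answers compare equal
assert is_equiv(gold, "1/2")  # a/b is rewritten to \frac{a}{b}
assert is_equiv(gold, "0.5")  # 0.5 is mapped to \frac{1}{2}
```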
## Citation
```
@article{hendrycksmath2021,
title={Measuring Mathematical Problem Solving With the MATH Dataset},
author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
journal={NeurIPS},
year={2021}
}
```
### Groups and Tasks
#### Groups
- `hendrycks_math`: the MATH benchmark from Hendrycks et al., run zero-shot or few-shot.
#### Tasks
- `hendrycks_math_algebra`
- `hendrycks_math_counting_and_prob`
- `hendrycks_math_geometry`
- `hendrycks_math_intermediate_algebra`
- `hendrycks_math_num_theory`
- `hendrycks_math_prealgebra`
- `hendrycks_math_precalc`
### Checklist
The checklist is the following:
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
* Answer extraction code is taken from the original MATH benchmark paper's repository.
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
group: hendrycks_math
task:
- hendrycks_math_algebra
- hendrycks_math_counting_and_prob
- hendrycks_math_geometry
- hendrycks_math_intermediate_algebra
- hendrycks_math_num_theory
- hendrycks_math_prealgebra
- hendrycks_math_precalc
group:
- math_word_problems
task: hendrycks_math_algebra
dataset_path: EleutherAI/hendrycks_math
process_docs: !function utils.process_docs
dataset_name: algebra
output_type: generate_until
training_split: train
test_split: test
doc_to_text: "Problem: {{problem}}\nAnswer:"
process_results: !function utils.process_results
doc_to_target: "{{answer}}"
generation_kwargs:
until:
- "Problem:"
do_sample: false
temperature: 0
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
include: hendrycks_math_algebra.yaml
dataset_name: counting_and_probability
task: hendrycks_math_counting_and_prob
include: hendrycks_math_algebra.yaml
dataset_name: geometry
task: hendrycks_math_geometry
include: hendrycks_math_algebra.yaml
dataset_name: intermediate_algebra
task: hendrycks_math_intermediate_algebra
include: hendrycks_math_algebra.yaml
dataset_name: number_theory
task: hendrycks_math_num_theory
include: hendrycks_math_algebra.yaml
dataset_name: prealgebra
task: hendrycks_math_prealgebra
include: hendrycks_math_algebra.yaml
dataset_name: precalculus
task: hendrycks_math_precalc
from typing import Dict, List
import datasets
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
def _process_doc(doc: dict) -> dict:
out_doc = {
"problem": doc["problem"],
"solution": doc["solution"],
"answer": remove_boxed(last_boxed_only_string(doc["solution"])),
}
return out_doc
return dataset.map(_process_doc)
def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
retval = 0
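# if the generation contains $-delimited math, keep only the span between the first
# and last "$"; otherwise compare the raw generation against the boxed gold answer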
indices = [pos for pos, char in enumerate(results[0]) if char == "$"]
if len(indices) <= 1:
answer = results[0]
else:
answer = results[0][indices[0] + 1 : indices[-1]]
if is_equiv(answer, remove_boxed(last_boxed_only_string(doc["solution"]))):
retval = 1
results = {
"exact_match": retval,
}
return results
# string normalization from https://github.com/EleutherAI/lm-evaluation-harness/blob/master/lm_eval/tasks/hendrycks_math.py
def is_equiv(str1, str2, verbose=False):
if str1 is None and str2 is None:
print("WARNING: Both None")
return True
if str1 is None or str2 is None:
return False
try:
ss1 = strip_string(str1)
ss2 = strip_string(str2)
if verbose:
print(ss1, ss2)
return ss1 == ss2
except Exception:
return str1 == str2
def remove_boxed(s):
if "\\boxed " in s:
left = "\\boxed "
assert s[: len(left)] == left
return s[len(left) :]
left = "\\boxed{"
assert s[: len(left)] == left
assert s[-1] == "}"
return s[len(left) : -1]
def last_boxed_only_string(string):
idx = string.rfind("\\boxed")
if "\\boxed " in string:
return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
if idx < 0:
idx = string.rfind("\\fbox")
if idx < 0:
return None
i = idx
right_brace_idx = None
num_left_braces_open = 0
while i < len(string):
if string[i] == "{":
num_left_braces_open += 1
if string[i] == "}":
num_left_braces_open -= 1
if num_left_braces_open == 0:
right_brace_idx = i
break
i += 1
if right_brace_idx is None:
retval = None
else:
retval = string[idx : right_brace_idx + 1]
return retval
def fix_fracs(string):
substrs = string.split("\\frac")
new_str = substrs[0]
if len(substrs) > 1:
substrs = substrs[1:]
for substr in substrs:
new_str += "\\frac"
if substr[0] == "{":
new_str += substr
else:
try:
assert len(substr) >= 2
except AssertionError:
return string
a = substr[0]
b = substr[1]
if b != "{":
if len(substr) > 2:
post_substr = substr[2:]
new_str += "{" + a + "}{" + b + "}" + post_substr
else:
new_str += "{" + a + "}{" + b + "}"
else:
if len(substr) > 2:
post_substr = substr[2:]
new_str += "{" + a + "}" + b + post_substr
else:
new_str += "{" + a + "}" + b
string = new_str
return string
def fix_a_slash_b(string):
if len(string.split("/")) != 2:
return string
a = string.split("/")[0]
b = string.split("/")[1]
try:
a = int(a)
b = int(b)
assert string == "{}/{}".format(a, b)
new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
return new_string
except (ValueError, AssertionError):
return string
def remove_right_units(string):
# "\\text{ " only ever occurs (at least in the val set) when describing units
if "\\text{ " in string:
splits = string.split("\\text{ ")
assert len(splits) == 2
return splits[0]
else:
return string
def fix_sqrt(string):
if "\\sqrt" not in string:
return string
splits = string.split("\\sqrt")
new_string = splits[0]
for split in splits[1:]:
if split[0] != "{":
a = split[0]
new_substr = "\\sqrt{" + a + "}" + split[1:]
else:
new_substr = "\\sqrt" + split
new_string += new_substr
return new_string
def strip_string(string):
# linebreaks
string = string.replace("\n", "")
# remove inverse spaces
string = string.replace("\\!", "")
# replace \\ with \
string = string.replace("\\\\", "\\")
# replace tfrac and dfrac with frac
string = string.replace("tfrac", "frac")
string = string.replace("dfrac", "frac")
# remove \left and \right
string = string.replace("\\left", "")
string = string.replace("\\right", "")
# Remove circ (degrees)
string = string.replace("^{\\circ}", "")
string = string.replace("^\\circ", "")
# remove dollar signs
string = string.replace("\\$", "")
# remove units (on the right)
string = remove_right_units(string)
# remove percentage
string = string.replace("\\%", "")
string = string.replace("\%", "") # noqa: W605
# " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
string = string.replace(" .", " 0.")
string = string.replace("{.", "{0.")
# if empty, return empty string
if len(string) == 0:
return string
if string[0] == ".":
string = "0" + string
# to consider: get rid of e.g. "k = " or "q = " at beginning
if len(string.split("=")) == 2:
if len(string.split("=")[0]) <= 2:
string = string.split("=")[1]
# fix sqrt3 --> sqrt{3}
string = fix_sqrt(string)
# remove spaces
string = string.replace(" ", "")
# \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
string = fix_fracs(string)
# manually change 0.5 --> \frac{1}{2}
if string == "0.5":
string = "\\frac{1}{2}"
# NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
string = fix_a_slash_b(string)
return string
......@@ -12,7 +12,6 @@ generation_kwargs:
temperature: 0.0
max_gen_toks: 1280
process_results: !function utils.process_results
num_fewshot: 0
metric_list:
- metric: prompt_level_strict_acc
aggregation: mean
......
......@@ -28,16 +28,11 @@ Eprint = {arXiv:2206.14858},
}
```
### Groups, Benchmarks and Tasks
#### Benchmarks
- `minerva_math`
### Groups and Tasks
#### Groups
- `math_word_problems`
- `generate_until`
- `minerva_math`
#### Tasks
......
......@@ -20,4 +20,4 @@ metric_list:
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
version: 2.0
......@@ -4,8 +4,6 @@ import datasets
def preprocess(text):
if text is None:
return " "
text = text.strip()
text = text.replace(" [title]", ". ")
text = re.sub("\\[.*?\\]", "", text)
......@@ -20,11 +18,15 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
"id": doc["id"],
"query": "Question: " + preprocess(doc["instruction"]) + "\nAnswer:",
"choices": [
preprocess(doc["option_a"]),
preprocess(doc["option_b"]),
preprocess(doc["option_c"]),
preprocess(doc["option_d"]),
preprocess(doc["option_e"]),
preprocess(option)
for option in [
doc["option_a"],
doc["option_b"],
doc["option_c"],
doc["option_d"],
doc["option_e"],
]
if option
],
"gold": ["A", "B", "C", "D", "E"].index(doc["answer"]),
}
......
# Pile-10k
### Paper
Title: `NeelNanda/pile-10k`
Abstract: The first 10K elements of [The Pile](https://pile.eleuther.ai/), useful for debugging models trained on it. See the [HuggingFace page for the full Pile](https://huggingface.co/datasets/the_pile) for more info. Inspired by [stas' great resource](https://huggingface.co/datasets/stas/openwebtext-10k), which does the same for OpenWebText.
Homepage: [https://huggingface.co/datasets/NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k)
### Citation
```
@misc{Nanda2022Pile10K,
author = {Nanda, Neel},
title = {{NeelNanda/pile-10k} \textendash\ Datasets at Hugging Face},
year = {2022},
howpublished = {\url{https://huggingface.co/datasets/NeelNanda/pile-10k}},
}
```
### Groups and Tasks
#### Groups
* Not part of a group yet.
#### Tasks
* `pile_10k`: `The first 10K elements of The Pile, useful for debugging models trained on it.`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?