"src/vscode:/vscode.git/clone" did not exist on "db817fcf29cdeede00244c8eb8bbbc726fdf4b39"
Commit 5add46aa authored by hepj's avatar hepj
Browse files

添加Megatron项目

parent deb8370c
Pipeline #2199 failed with stages
in 0 seconds
transformers>=4.25.1
accelerate>=0.13.2
datasets>=2.6.1
evaluate>=0.3.0
pyext==0.7
mosestokenizer==1.0.0
huggingface_hub>=0.11.1
fsspec<2023.10.0
from setuptools import setup, find_packages
with open("README.md") as readme_file:
readme = readme_file.read()
with open("requirements.txt") as reqs_file:
requirements = reqs_file.read().split("\n")
ds1000_requirements = [
"DateTime==4.7",
"gensim==4.2.0",
"matplotlib==3.5.2",
"numpy==1.21.6",
"openai==0.23.0",
"pandas==1.3.5",
"pandas-datareader==0.10.0",
"pathlib==1.0.1",
"scikit-learn==1.0.2",
"scipy==1.7.3",
"seaborn==0.11.2",
"statsmodels==0.13.2",
"tensorflow==2.10.0",
"tokenizers==0.12.1",
"torchvision==0.13.1",
"tqdm==4.64.1",
"xgboost==1.6.2",
"Pillow==9.2.0",
]
setup(
description="A framework for the evaluation of autoregressive code generation language models.",
long_description=readme,
license="Apache 2.0",
packages=find_packages() ,
install_requires=requirements,
extras_require={"ds1000": ds1000_requirements},
)
# This template file is adapted from: https://github.com/EleutherAI/lm-evaluation-harness/blob/master/templates/new_task.py
# TODO: Remove all TODO comments once the implementation is complete.
"""
TODO: Add the Paper Title on this line.
TODO: Add the paper's PDF URL (preferably from arXiv) on this line.
TODO: Write a Short Description of the task.
Homepage: TODO: Add the URL to the task's Homepage here.
"""
from bigcode_eval.base import Task
# TODO: Add the BibTeX citation for the task.
_CITATION = """
"""
# TODO: Replace `NewTask` with the name of your Task.
class NewTask(Task):
# TODO: Add the `DATASET_PATH` string. This will be the name of the `Task`
# dataset as denoted in HuggingFace `datasets`.
DATASET_PATH = ""
# TODO: Add the `DATASET_NAME` string. This is the name of a subset within
# `DATASET_PATH`. If there aren't specific subsets you need, leave this as `None`.
DATASET_NAME = None
def __init__(self):
super().__init__(
# TODO: Specify the list of stop words in `stop_words` for the code generation task \
# and if the evaluation requires executing the generated code in `requires_execution`.
stop_words=[],
requires_execution=False,
)
def get_dataset(self):
# TODO: retrieve the evaluation subset from the loaded dataset (e.g. `self.dataset["test"]`)
"""Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
return []
def fewshot_examples(self):
# TODO: load few-shot examples (from bigcode_eval/tasks/fewshot_examples) if they exist
"""Loads and returns the few-shot examples for the task if they exist."""
pass
def get_prompt(self, doc):
# TODO: build the prompt for the language model from a sample `doc` from the dataset
"""
Builds the prompt for the LM to generate from.
:param doc: dict[str: str]
sample from the test dataset
:return: str
"""
return ""
def get_reference(self, doc):
# TODO: get the reference solution from a sample `doc` from the dataset
"""
Builds the reference solution for the doc (sample from the test dataset).
:param doc: dict[str: str]
sample from the test dataset
:return: str
"""
return ""
def postprocess_generation(self, generation, idx):
# TODO: define the postprocessing for the LM generation
"""
Defines the postprocessing for a LM generation.
:param generation: str
code generation from LM
:param idx: int (if needed)
index of doc in the dataset to which the generation belongs
:return: str
"""
return ""
def process_results(self, generations, references):
# TODO: define how the evaluation score is computed from list of \
# generations and reference solutions
"""
Takes the list of LM generations and evaluates them against ground truth references,
returning the metric for the generations as in {"metric_name": result}.
We encourage to directly load the metric from `evaluate` library to keep the code concise.
:param generations: list(list(str))
list of lists containing generations
:param references: list(str)
list of str containing refrences
:return: dict[str: float]
"""
return {}
[["from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "This is some random text"], ["This is some random text", "This is some random text"]]
\ No newline at end of file
[["from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\"jectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectjectject through through through through through through through through through through through through through through through through through through through through through through through through through through"], ["from typing import List\n\n\ndef separate_paren_groups(paren_string: str) -> List[str]:\n \"\"\" Input to this function is a string containing multiple groups of nested parentheses. Your goal is to\n separate those group into separate strings and return the list of those.\n Separate groups are balanced (each open brace is properly closed) and not nested within each other\n Ignore any spaces in the input string.\n >>> separate_paren_groups('( ) (( )) (( )( ))')\n ['()', '(())', '(()())']\n \"\"\" at at at at at at at at at at at at at at at at at at at at at at at fe fe fe fe fe fe fe fe fe fe fe fe"]]
\ No newline at end of file
["\n\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n\ncheck(has_close_elements)", "\n\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate('(()()) ((())) () ((())()())') == [\n '(()())', '((()))', '()', '((())()())'\n ]\n assert candidate('() (()) ((())) (((())))') == [\n '()', '(())', '((()))', '(((())))'\n ]\n assert candidate('(()(())((())))') == [\n '(()(())((())))'\n ]\n assert candidate('( ) (( )) (( )( ))') == ['()', '(())', '(()())']\n\ncheck(separate_paren_groups)"]
\ No newline at end of file
[["def remove_Occ(s,ch): \r\n for i in range(len(s)): \r\n if (s[i] == ch): \r\n s = s[0 : i] + s[i + 1:] \r\n break\r\n for i in range(len(s) - 1,-1,-1): \r\n if (s[i] == ch): \r\n s = s[0 : i] + s[i + 1:] \r\n break\r\n return s ", "This is some random text"], ["This is some random text", "This is some random text"]]
\ No newline at end of file
[["\"\"\"\nWrite a python function to remove first and last occurrence of a given character from the string.\nassert remove_Occ(\"hello\",\"l\") == \"heo\"\n\"\"\"\normormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormormorm only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only only"], ["\"\"\"\nWrite a function to sort a given matrix in ascending order according to the sum of its rows.\nassert sort_matrix([[1, 2, 3], [2, 4, 5], [1, 1, 1]])==[[1, 1, 1], [1, 2, 3], [2, 4, 5]]\n\"\"\"\n at at at at at at feiririririririririririririrherherherherher who who who who who who who who who who who who who who who who fe fe fe fe fe fe fe fejjjjjjjjjjjjjjjjjjjjjififififififififififififififiriririririririrGGGGGGGedededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededededed"]]
\ No newline at end of file
["assert remove_Occ(\"hello\",\"l\") == \"heo\"\nassert remove_Occ(\"abcda\",\"a\") == \"bcd\"\nassert remove_Occ(\"PHP\",\"P\") == \"H\"", "assert sort_matrix([[1, 2, 3], [2, 4, 5], [1, 1, 1]])==[[1, 1, 1], [1, 2, 3], [2, 4, 5]]\nassert sort_matrix([[1, 2, 3], [-2, 4, -5], [1, -1, 1]])==[[-2, 4, -5], [1, -1, 1], [1, 2, 3]]\nassert sort_matrix([[5,8,9],[6,4,3],[2,1,4]])==[[2, 1, 4], [6, 4, 3], [5, 8, 9]]"]
\ No newline at end of file
[
[
"def solution():\n \"\"\"Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\"\"\"\n eggs_per_day = 16\n eggs_eaten = 3\n eggs_baked = 4\n eggs_sold = eggs_per_day - eggs_eaten - eggs_baked\n price_per_egg = 2\n money_made = eggs_sold * price_per_egg\n result = money_made\n return result\nprint(solution())"
],
[
"def solution():\n \"\"\"A robe takes 2 bolts of blue fiber and half that much white fiber. How many bolts in total does it take?\"\"\"\n blue_fiber = 2\n white_fiber = blue_fiber / 2\n total_fiber = blue_fiber + white_fiber\n result = total_fiber\n return result\nprint(solution())"
]
]
\ No newline at end of file
{
"prompt":"Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?\n\n# solution in Python:\n\n\ndef solution():\n \"\"\"Olivia has $23. She bought five bagels for $3 each. How much money does she have left?\"\"\"\n money_initial = 23\n bagels = 5\n bagel_cost = 3\n money_spent = bagels * bagel_cost\n money_left = money_initial - money_spent\n result = money_left\n return result\n\n\n\n\n\nQ: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?\n\n# solution in Python:\n\n\ndef solution():\n \"\"\"Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?\"\"\"\n golf_balls_initial = 58\n golf_balls_lost_tuesday = 23\n golf_balls_lost_wednesday = 2\n golf_balls_left = golf_balls_initial - golf_balls_lost_tuesday - golf_balls_lost_wednesday\n result = golf_balls_left\n return result\n\n\n\n\n\nQ: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?\n\n# solution in Python:\n\n\ndef solution():\n \"\"\"There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?\"\"\"\n computers_initial = 9\n computers_per_day = 5\n num_days = 4 # 4 days between monday and thursday\n computers_added = computers_per_day * num_days\n computers_total = computers_initial + computers_added\n result = computers_total\n return result\n\n\n\n\n\nQ: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?\n\n# solution in Python:\n\n\ndef solution():\n \"\"\"Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?\"\"\"\n toys_initial = 5\n mom_toys = 2\n dad_toys = 2\n total_received = mom_toys + dad_toys\n total_toys = toys_initial + total_received\n result = total_toys\n return result\n\n\n\n\n\nQ: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?\n\n# solution in Python:\n\n\ndef solution():\n \"\"\"Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?\"\"\"\n jason_lollipops_initial = 20\n jason_lollipops_after = 12\n denny_lollipops = jason_lollipops_initial - jason_lollipops_after\n result = denny_lollipops\n return result\n\n\n\n\n\nQ: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?\n\n# solution in Python:\n\n\ndef solution():\n \"\"\"Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?\"\"\"\n leah_chocolates = 32\n sister_chocolates = 42\n total_chocolates = leah_chocolates + sister_chocolates\n chocolates_eaten = 35\n chocolates_left = total_chocolates - chocolates_eaten\n result = chocolates_left\n return result\n\n\n\n\n\nQ: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\n\n# solution in Python:\n\n\ndef solution():\n \"\"\"If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\"\"\"\n cars_initial = 3\n cars_arrived = 2\n total_cars = cars_initial + cars_arrived\n result = total_cars\n return result\n\n\n\n\n\nQ: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\n\n# solution in Python:\n\n\ndef solution():\n \"\"\"There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\"\"\"\n trees_initial = 15\n trees_after = 21\n trees_added = trees_after - trees_initial\n result = trees_added\n return result\n\n\n\n\n\nQ: test\n\n# solution in Python:\n\n\n"
}
\ No newline at end of file
import json
import os
import tempfile
from accelerate import Accelerator
from accelerate.utils import write_basic_config
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
from bigcode_eval.arguments import EvalArguments
from bigcode_eval.evaluator import Evaluator
# TODO add more tasks
# Tasks for generation test
GEN_TASKS = ["humaneval", "mbpp"]
# Tasks for evaluator tests
EVAL_TASKS = ["humaneval", "mbpp", "pal-gsm8k-greedy"]
TMPDIR = tempfile.mkdtemp()
TEST_MODEL = "hf-internal-testing/tiny-random-gpt2"
REF_EVAL_SCORES = {
"humaneval": {"pass@1": 0.25},
"mbpp": {"pass@1": 0.25},
"pal-gsm8k-greedy": {"accuracy": 1.0, "num_failed_execution": 0},
}
def update_args(args):
args.model = "hf-internal-testing/tiny-random-gpt2"
# the executed code for the tests is safe (see tests/data/*_eval_gens.json)
args.allow_code_execution = True
args.save_generations = False
args.save_generations_path = ""
args.save_references = False
args.save_references_path = ""
args.metric_output_path = TMPDIR
args.load_generations_path = None
args.generation_only = False
args.check_references = False
# postprocessing for HumanEval and MBPP makes generations
# with dummy model not distinctive
args.postprocess = False
args.instruction_tokens = None
args.limit = 2
args.limit_start = 0
args.batch_size = 1
args.max_length_generation = 300
args.left_padding = False
args.do_sample = False
args.top_p = 0
args.n_samples = 1
args.seed = 0
args.prompt = None
args.precision = None
args.modeltype = None
args.max_memory_per_gpu = None
return args
def setup():
model = AutoModelForCausalLM.from_pretrained(TEST_MODEL)
tokenizer = AutoTokenizer.from_pretrained(TEST_MODEL)
tokenizer.pad_token = tokenizer.eos_token
configPath = os.path.join(TMPDIR, "default_config.yml")
write_basic_config(save_location=configPath)
accelerator = Accelerator()
return model, tokenizer, accelerator
def load_generation_examples(task):
# generations for testing the generation feature of dummy test model
with open(f"tests/data/{task}_gen_gens.json") as fp:
gens = json.load(fp)
with open(f"tests/data/{task}_gen_refs.json") as fp:
refs = json.load(fp)
return gens, refs
args = update_args(EvalArguments())
set_seed(args.seed)
model, tokenizer, accelerator = setup()
def test_generation():
args.generation_only = True
args.save_every_k_tasks = -1
evaluator = Evaluator(accelerator, model, tokenizer, args)
for task in GEN_TASKS:
print(f"testing task {task}")
generations, references = evaluator.generate_text(task)
true_gens, true_refs = load_generation_examples(task)
assert generations == true_gens
assert references == true_refs
print("passed gen")
def test_evaluation():
# TODO add scores for each task
args.n_samples = 2
args.save_every_k_tasks = -1
for task in EVAL_TASKS:
print(f"testing task {task}")
# path to generation examples to evaluate
args.load_generations_path = f"tests/data/{task}_eval_gens.json"
evaluator = Evaluator(accelerator, None, None, args)
results = evaluator.evaluate(task)
assert results == REF_EVAL_SCORES[task]
print("passed eval")
import json
from bigcode_eval import tasks
TASKS = ["pal-gsm8k-greedy"]
sample_doc = {"pal-gsm8k-greedy": {"question": "test"}}
def load_reference_prompt(task_name):
with open(f"tests/data/{task_name}_prompt.json") as fp:
prompts = json.load(fp)
return prompts["prompt"]
def test_gsm_prompt():
for task_name in TASKS:
task = tasks.get_task(task_name)
task_prompt = task.get_prompt(sample_doc[task_name])
ref_prompt = load_reference_prompt(task_name)
assert task_prompt == ref_prompt
[run]
# tasks that aren't wired up.
omit =
lm_eval/tasks/quac.py
lm_eval/tasks/storycloze.py
lm_eval/tasks/cbt.py
lm_eval/tasks/sat.py
lm_eval/tasks/triviaqa.py
lm_eval/tasks/naturalqs.py
lm_eval/models/dummy.py
[report]
exclude_lines =
# Skip any pass lines such as may be used for @abstractmethod
pass
# Have to re-enable the standard pragma
pragma: no cover
# Don't complain about missing debug-only code:
def __repr__
if self\.debug
# Don't complain if tests don't hit defensive assertion code:
raise AssertionError
raise NotImplementedError
return NotImplemented
[flake8]
ignore = E203, E266, E501, W503, F403, F401, C901
max-line-length = 127
max-complexity = 10
select = B,C,E,F,W,T4,B9
@misc{eval-harness,
author = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy},
title = {A framework for few-shot language model evaluation},
month = 12,
year = 2023,
publisher = {Zenodo},
version = {v0.4.0},
doi = {10.5281/zenodo.10256836},
url = {https://zenodo.org/records/10256836}
}
* @haileyschoelkopf @lintangsutawika
MIT License
Copyright (c) 2020 EleutherAI
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# Language Model Evaluation Harness
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10256836.svg)](https://doi.org/10.5281/zenodo.10256836)
## Announcement
**A new v0.4.0 release of lm-evaluation-harness is available** !
New updates and features include:
- Internal refactoring
- Config-based task creation and configuration
- Easier import and sharing of externally-defined task config YAMLs
- Support for Jinja2 prompt design, easy modification of prompts + prompt imports from Promptsource
- More advanced configuration options, including output post-processing, answer extraction, and multiple LM generations per document, configurable fewshot settings, and more
- Speedups and new modeling libraries supported, including: faster data-parallel HF model usage, vLLM support, MPS support with HuggingFace, and more
- Logging and usability changes
- New tasks including CoT BIG-Bench-Hard, Belebele, user-defined task groupings, and more
Please see our updated documentation pages in `docs/` for more details.
Development will be continuing on the `main` branch, and we encourage you to give us feedback on what features are desired and how to improve the library further, or ask questions, either in issues or PRs on GitHub, or in the [EleutherAI discord](https://discord.gg/eleutherai)!
## Overview
This project provides a unified framework to test generative language models on a large number of different evaluation tasks.
**Features:**
- Over 60 standard academic benchmarks for LLMs, with hundreds of subtasks and variants implemented.
- Support for models loaded via [transformers](https://github.com/huggingface/transformers/) (including quantization via [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)), [GPT-NeoX](https://github.com/EleutherAI/gpt-neox), and [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/), with a flexible tokenization-agnostic interface.
- Support for fast and memory-efficient inference with [vLLM](https://github.com/vllm-project/vllm).
- Support for commercial APIs including [OpenAI](https://openai.com), and [TextSynth](https://textsynth.com/).
- Support for evaluation on adapters (e.g. LoRA) supported in [HuggingFace's PEFT library](https://github.com/huggingface/peft).
- Support for local models and benchmarks.
- Evaluation with publicly available prompts ensures reproducibility and comparability between papers.
- Easy support for custom prompts and evaluation metrics.
The Language Model Evaluation Harness is the backend for 🤗 Hugging Face's popular [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard), has been used in [hundreds of papers](https://scholar.google.com/scholar?oi=bibs&hl=en&authuser=2&cites=15052937328817631261,4097184744846514103,1520777361382155671,17476825572045927382,18443729326628441434,14801318227356878622,7890865700763267262,12854182577605049984,15641002901115500560,5104500764547628290), and is used internally by dozens of organizations including NVIDIA, Cohere, BigScience, BigCode, Nous Research, and Mosaic ML.
## Install
To install the `lm-eval` package from the github repository, run:
```bash
git clone https://github.com/EleutherAI/lm-evaluation-harness
cd lm-evaluation-harness
pip install -e .
```
We also provide a number of optional dependencies for extended functionality. A detailed table is available at the end of this document.
## Basic Usage
### Hugging Face `transformers`
To evaluate a model hosted on the [HuggingFace Hub](https://huggingface.co/models) (e.g. GPT-J-6B) on `hellaswag` you can use the following command (this assumes you are using a CUDA-compatible GPU):
```bash
lm_eval --model hf \
--model_args pretrained=EleutherAI/gpt-j-6B \
--tasks hellaswag \
--device cuda:0 \
--batch_size 8
```
Additional arguments can be provided to the model constructor using the `--model_args` flag. Most notably, this supports the common practice of using the `revisions` feature on the Hub to store partially trained checkpoints, or to specify the datatype for running a model:
```bash
lm_eval --model hf \
--model_args pretrained=EleutherAI/pythia-160m,revision=step100000,dtype="float" \
--tasks lambada_openai,hellaswag \
--device cuda:0 \
--batch_size 8
```
Models that are loaded via both `transformers.AutoModelForCausalLM` (autoregressive, decoder-only GPT style models) and `transformers.AutoModelForSeq2SeqLM` (such as encoder-decoder models like T5) in Huggingface are supported.
Batch size selection can be automated by setting the ```--batch_size``` flag to ```auto```. This will perform automatic detection of the largest batch size that will fit on your device. On tasks where there is a large difference between the longest and shortest example, it can be helpful to periodically recompute the largest batch size, to gain a further speedup. To do this, append ```:N``` to above flag to automatically recompute the largest batch size ```N``` times. For example, to recompute the batch size 4 times, the command would be:
```bash
lm_eval --model hf \
--model_args pretrained=EleutherAI/pythia-160m,revision=step100000,dtype="float" \
--tasks lambada_openai,hellaswag \
--device cuda:0 \
--batch_size auto:4
```
The full list of supported arguments are provided [here](./docs/interface.md), and on the terminal by calling `lm_eval -h`. Alternatively, you can use `lm-eval` instead of `lm_eval`.
> [!Note]
> Just like you can provide a local path to `transformers.AutoModel`, you can also provide a local path to `lm_eval` via `--model_args pretrained=/path/to/model`
#### Multi-GPU Evaluation with Hugging Face `accelerate`
We support two main ways of using Hugging Face's [accelerate 🚀](https://github.com/huggingface/accelerate) library for multi-GPU evaluation.
To perform *data-parallel evaluation* (where each GPU loads a **separate full copy** of the model), we leverage the `accelerate` launcher as follows:
```
accelerate launch -m lm_eval --model hf \
--tasks lambada_openai,arc_easy \
--batch_size 16
```
(or via `accelerate launch --no-python lm_eval`).
For cases where your model can fit on a single GPU, this allows you to evaluate on K GPUs K times faster than on one.
**WARNING**: This setup does not work with FSDP model sharding, so in `accelerate config` FSDP must be disabled, or the NO_SHARD FSDP option must be used.
The second way of using `accelerate` for multi-GPU evaluation is when your model is *too large to fit on a single GPU.*
In this setting, run the library *outside of the `accelerate` launcher*, but passing `parallelize=True` to `--model_args` as follows:
```
lm_eval --model hf \
--tasks lambada_openai,arc_easy \
--model_args parallelize=True \
--batch_size 16
```
This means that your model's weights will be split across all available GPUs.
For more advanced users or even larger models, we allow for the following arguments when `parallelize=True` as well:
- `device_map_option`: How to split model weights across available GPUs. defaults to "auto".
- `max_memory_per_gpu`: the max GPU memory to use per GPU in loading the model.
- `max_cpu_memory`: the max amount of CPU memory to use when offloading the model weights to RAM.
- `offload_folder`: a folder where model weights will be offloaded to disk if needed.
These two options (`accelerate launch` and `parallelize=True`) are mutually exclusive.
**Note: we do not currently support multi-node evaluations natively, and advise using either an externally hosted server to run inference requests against, or creating a custom integration with your distributed framework [as is done for the GPT-NeoX library](https://github.com/EleutherAI/gpt-neox/blob/main/eval_tasks/eval_adapter.py).**
### Tensor + Data Parallel and Optimized Inference with `vLLM`
We also support vLLM for faster inference on [supported model types](https://docs.vllm.ai/en/latest/models/supported_models.html), especially faster when splitting a model across multiple GPUs. For single-GPU or multi-GPU — tensor parallel, data parallel, or a combination of both — inference, for example:
```bash
lm_eval --model vllm \
--model_args pretrained={model_name},tensor_parallel_size={GPUs_per_model},dtype=auto,gpu_memory_utilization=0.8,data_parallel_size={model_replicas} \
--tasks lambada_openai \
--batch_size auto
```
For a full list of supported vLLM configurations, please reference our vLLM integration and the vLLM documentation.
vLLM occasionally differs in output from Huggingface. We treat Huggingface as the reference implementation, and provide a [script](./scripts/model_comparator.py) for checking the validity of vllm results against HF.
### Model APIs and Inference Servers
Our library also supports the evaluation of models served via several commercial APIs, and we hope to implement support for the most commonly used performant local/self-hosted inference servers.
To call a hosted model, use:
```bash
export OPENAI_API_KEY=YOUR_KEY_HERE
lm_eval --model openai-completions \
--model_args model=davinci \
--tasks lambada_openai,hellaswag
```
We also support using your own local inference server with servers that mirror the OpenAI Completions and ChatCompletions APIs.
```bash
lm_eval --model local-chat-completions --tasks gsm8k --model_args model=facebook/opt-125m,base_url=http://{yourip}:8000/v1
```
Note that for externally hosted models, configs such as `--device` and `--batch_size` should not be used and do not function. Just like you can use `--model_args` to pass arbitrary arguments to the model constructor for local models, you can use it to pass arbitrary arguments to the model API for hosted models. See the documentation of the hosting service for information on what arguments they support.
| API or Inference Server | Implemented? | `--model <xxx>` name | Models supported: | Request Types: |
|---------------------------------------------------------------------------------------------------------------------------|---------------------------------|---------------------------------------------------------------------|-----------------------------------------------------------------------------------------------|------------------------------------------------------------|
| OpenAI Completions | :heavy_check_mark: | `openai-completions`, `local-completions` | All OpenAI Completions API models | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| OpenAI ChatCompletions | :heavy_check_mark: | `openai-chat-completions`, `local-chat-completions` | [All ChatCompletions API models](https://platform.openai.com/docs/guides/gpt) | `generate_until` (no logprobs) |
| Anthropic | :heavy_check_mark: | `anthropic` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model) | `generate_until` (no logprobs) |
| Textsynth | :heavy_check_mark: | `textsynth` | [All supported engines](https://textsynth.com/documentation.html#engines) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Cohere | [:hourglass: - blocked on Cohere API bug](https://github.com/EleutherAI/lm-evaluation-harness/pull/395) | N/A | [All `cohere.generate()` engines](https://docs.cohere.com/docs/models) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| [Llama.cpp](https://github.com/ggerganov/llama.cpp) (via [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)) | :heavy_check_mark: | `gguf`, `ggml` | [All models supported by llama.cpp](https://github.com/ggerganov/llama.cpp) | `generate_until`, `loglikelihood`, (perplexity evaluation not yet implemented) |
| vLLM | :heavy_check_mark: | `vllm` | [Most HF Causal Language Models](https://docs.vllm.ai/en/latest/models/supported_models.html) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Mamba | :heavy_check_mark: | `mamba_ssm` | [Mamba architecture Language Models via the `mamba_ssm` package](https://huggingface.co/state-spaces) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Huggingface Optimum (Causal LMs) | ✔️ | `openvino` | Any decoder-only AutoModelForCausalLM converted with Huggingface Optimum into OpenVINO™ Intermediate Representation (IR) format | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... |
| Neuron via AWS Inf2 (Causal LMs) | ✔️ | `neuronx` | Any decoder-only AutoModelForCausalLM supported to run on [huggingface-ami image for inferentia2](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | ... |
| Your local inference server! | :heavy_check_mark: | `local-completions` or `local-chat-completions` (using `openai-chat-completions` model type) | Any server address that accepts GET requests using HF models and mirror's OpenAI's Completions or ChatCompletions interface | `generate_until` | | ... |
Models which do not supply logits or logprobs can be used with tasks of type `generate_until` only, while local models, or APIs that supply logprobs/logits of their prompts, can be run on all task types: `generate_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`.
For more information on the different task `output_types` and model request types, see [our documentation](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/model_guide.md#interface).
### Other Frameworks
A number of other libraries contain scripts for calling the eval harness through their library. These include [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/blob/main/eval_tasks/eval_adapter.py), [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/blob/main/examples/MoE/readme_evalharness.md), and [mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/blob/master/eval_harness.py).
To create your own custom integration you can follow instructions from [this tutorial](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage).
### Additional Features
> [!Note]
> For tasks unsuitable for direct evaluation — either due risks associated with executing untrusted code or complexities in the evaluation process — the `--predict_only` flag is available to obtain decoded generations for post-hoc evaluation.
If you have a Metal compatible Mac, you can run the eval harness using the MPS back-end by replacing `--device cuda:0` with `--device mps` (requires PyTorch version 2.1 or higher).
> [!Note]
> You can inspect what the LM inputs look like by running the following command:
> ```bash
> python write_out.py \
> --tasks <task1,task2,...> \
> --num_fewshot 5 \
> --num_examples 10 \
> --output_base_path /path/to/output/folder
> ```
> This will write out one text file for each task.
To verify the data integrity of the tasks you're performing in addition to running the tasks themselves, you can use the `--check_integrity` flag:
```bash
lm_eval --model openai \
--model_args engine=davinci \
--tasks lambada_openai,hellaswag \
--check_integrity
```
## Advanced Usage Tips
For models loaded with the HuggingFace `transformers` library, any arguments provided via `--model_args` get passed to the relevant constructor directly. This means that anything you can do with `AutoModel` can be done with our library. For example, you can pass a local path via `pretrained=` or use models finetuned with [PEFT](https://github.com/huggingface/peft) by taking the call you would run to evaluate the base model and add `,peft=PATH` to the `model_args` argument:
```bash
lm_eval --model hf \
--model_args pretrained=EleutherAI/gpt-j-6b,parallelize=True,load_in_4bit=True,peft=nomic-ai/gpt4all-j-lora \
--tasks openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq \
--device cuda:0
```
[GPTQ](https://github.com/PanQiWei/AutoGPTQ) quantized models can be loaded by specifying their file names in `,autogptq=NAME` (or `,autogptq=True` for default names) in the `model_args` argument:
```bash
lm_eval --model hf \
--model_args pretrained=model-name-or-path,autogptq=model.safetensors,gptq_use_triton=True \
--tasks hellaswag
```
We support wildcards in task names, for example you can run all of the machine-translated lambada tasks via `--task lambada_openai_mt_*`.
To save evaluation results provide an `--output_path`. We also support logging model responses with the `--log_samples` flag for post-hoc analysis.
Additionally, one can provide a directory with `--use_cache` to cache the results of prior runs. This allows you to avoid repeated execution of the same (model, task) pairs for re-scoring.
For a full list of supported arguments, check out the [interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md) guide in our documentation!
> [!Tip]
> Running lm-evaluation-harness as an external library and can't find (almost) any tasks available? Run `lm_eval.tasks.initialize_tasks()` to load the library's stock tasks before calling `lm_eval.evaluate()` or `lm_eval.simple_evaluate()` !
## Visualizing Results
You can seamlessly visualize and analyze the results of your evaluation harness runs using both Weights & Biases (W&B) and Zeno.
### Zeno
You can use [Zeno](https://zenoml.com) to visualize the results of your eval harness runs.
First, head to [hub.zenoml.com](https://hub.zenoml.com) to create an account and get an API key [on your account page](https://hub.zenoml.com/account).
Add this key as an environment variable:
```bash
export ZENO_API_KEY=[your api key]
```
You'll also need to install the `lm_eval[zeno]` package extra.
To visualize the results, run the eval harness with the `log_samples` and `output_path` flags.
We expect `output_path` to contain multiple folders that represent individual model names.
You can thus run your evaluation on any number of tasks and models and upload all of the results as projects on Zeno.
```bash
lm_eval \
--model hf \
--model_args pretrained=EleutherAI/gpt-j-6B \
--tasks hellaswag \
--device cuda:0 \
--batch_size 8 \
--log_samples \
--output_path output/gpt-j-6B
```
Then, you can upload the resulting data using the `zeno_visualize` script:
```bash
python scripts/zeno_visualize.py \
--data_path output \
--project_name "Eleuther Project"
```
This will use all subfolders in `data_path` as different models and upload all tasks within these model folders to Zeno.
If you run the eval harness on multiple tasks, the `project_name` will be used as a prefix and one project will be created per task.
You can find an example of this workflow in [examples/visualize-zeno.ipynb](examples/visualize-zeno.ipynb).
### Weights and Biases
With the [Weights and Biases](https://wandb.ai/site) integration, you can now spend more time extracting deeper insights into your evaluation results. The integration is designed to streamline the process of logging and visualizing experiment results using the Weights & Biases (W&B) platform.
The integration provide functionalities
- to automatically log the evaluation results,
- log the samples as W&B Tables for easy visualization,
- log the `results.json` file as an artifact for version control,
- log the `<task_name>_eval_samples.json` file if the samples are logged,
- generate a comprehensive report for analysis and visualization with all the important metric,
- log task and cli specific configs,
- and more out of the box like the command used to run the evaluation, GPU/CPU counts, timestamp, etc.
First you'll need to install the lm_eval[wandb] package extra. Do `pip install lm_eval[wandb]`.
Authenticate your machine with an your unique W&B token. Visit https://wandb.ai/authorize to get one. Do `wandb login` in your command line terminal.
Run eval harness as usual with a `wandb_args` flag. Use this flag to provide arguments for initializing a wandb run ([wandb.init](https://docs.wandb.ai/ref/python/init)) as comma separated string arguments.
```bash
lm_eval \
--model hf \
--model_args pretrained=microsoft/phi-2,trust_remote_code=True \
--tasks hellaswag,mmlu_abstract_algebra \
--device cuda:0 \
--batch_size 8 \
--output_path output/phi-2 \
--limit 10 \
--wandb_args project=lm-eval-harness-integration \
--log_samples
```
In the stdout, you will find the link to the W&B run page as well as link to the generated report. You can find an example of this workflow in [examples/visualize-wandb.ipynb](examples/visualize-wandb.ipynb), and an example of how to integrate it beyond the CLI.
## How to Contribute or Learn More?
For more information on the library and how everything fits together, check out all of our [documentation pages](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs)! We plan to post a larger roadmap of desired + planned library improvements soon, with more information on how contributors can help.
### Implementing new tasks
To implement a new task in the eval harness, see [this guide](./docs/new_task_guide.md).
In general, we follow this priority list for addressing concerns about prompting and other eval details:
1. If there is widespread agreement among people who train LLMs, use the agreed upon procedure.
2. If there is a clear and unambiguous official implementation, use that procedure.
3. If there is widespread agreement among people who evaluate LLMs, use the agreed upon procedure.
4. If there are multiple common implementations but not universal or widespread agreement, use our preferred option among the common implementations. As before, prioritize choosing from among the implementations found in LLM training papers.
These are guidelines and not rules, and can be overruled in special circumstances.
We try to prioritize agreement with the procedures used by other groups to decrease the harm when people inevitably compare runs across different papers despite our discouragement of the practice. Historically, we also prioritized the implementation from [Language Models are Few Shot Learners](https://arxiv.org/abs/2005.14165) as our original goal was specifically to compare results with that paper.
### Support
The best way to get support is to open an issue on this repo or join the [EleutherAI Discord server](https://discord.gg/eleutherai). The `#lm-thunderdome` channel is dedicated to developing this project and the `#release-discussion` channel is for receiving support for our releases. If you've used the library and have had a positive (or negative) experience, we'd love to hear from you!
## Optional Extras
Extras dependencies can be installed via `pip install -e ".[NAME]"`
| Name | Use |
|---------------|---------------------------------------|
| anthropic | For using Anthropic's models |
| dev | For linting PRs and contributions |
| gptq | For loading models with GPTQ |
| hf_transfer | For speeding up HF Hub file downloads |
| ifeval | For running the IFEval task |
| neuronx | For running on AWS inf2 instances |
| mamba | For loading Mamba SSM models |
| math | For running math task answer checking |
| multilingual | For multilingual tokenizers |
| openai | For using OpenAI's models |
| optimum | For running Intel OpenVINO models |
| promptsource | For using PromptSource prompts |
| sentencepiece | For using the sentencepiece tokenizer |
| testing | For running library test suite |
| vllm | For loading models with vLLM |
| zeno | For visualizing results with Zeno |
|---------------|---------------------------------------|
| all | Loads all extras (not recommended) |
## Cite as
```
@misc{eval-harness,
author = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy},
title = {A framework for few-shot language model evaluation},
month = 12,
year = 2023,
publisher = {Zenodo},
version = {v0.4.0},
doi = {10.5281/zenodo.10256836},
url = {https://zenodo.org/records/10256836}
}
```
# Contributing to LM Evaluation Harness
Welcome and thank you for your interest in the LM Evaluation Harness! We welcome contributions and feedback and appreciate your time spent with our library, and hope you find it useful!
We intend LM Evaluation Harness to be a broadly useful and
## Important Resources
There are several places information about LM Evaluation Harness is located:
- Our [documentation pages](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs)
- We occasionally use [GitHub Milestones](https://github.com/EleutherAI/lm-evaluation-harness/milestones) to track progress toward specific near-term version releases.
- We maintain a [Project Board](https://github.com/orgs/EleutherAI/projects/25) for tracking current work items and PRs, and for future roadmap items or feature requests.
- Further discussion and support conversations are located in the #lm-thunderdome channel of the [EleutherAI discord](discord.gg/eleutherai).
## Code Style
LM Evaluation Harness uses [ruff](https://github.com/astral-sh/ruff) for linting via [pre-commit](https://pre-commit.com/).
You can install linters and dev tools via
```pip install lm_eval[dev]``` or ```pip install -e ".[dev]"```
Then, run
```pre-commit install```
in order to ensure linters and other checks will be run upon committing.
## Testing
We use [pytest](https://docs.pytest.org/en/latest/) for running unit tests. All library unit tests can be run via:
```
python -m pytest --ignore=tests/tests_master --ignore=tests/extra
```
## Contributor License Agreement
We ask that new contributors agree to a Contributor License Agreement affirming that EleutherAI has the rights to use your contribution to our library.
First-time pull requests will have a reply added by @CLAassistant containing instructions for how to confirm this, and we require it before merging your PR.
## Contribution Best Practices
We recommend a few best practices to make your contributions or reported errors easier to assist with.
**For Pull Requests:**
- PRs should be titled descriptively, and be opened with a brief description of the scope and intent of the new contribution.
- New features should have appropriate documentation added alongside them.
- Aim for code maintainability, and minimize code copying.
- If opening a task, try to share test results on the task using a publicly-available model, and if any public results are available on the task, compare to them.
**For Feature Requests:**
- Provide a short paragraph's worth of description. What is the feature you are requesting? What is its motivation, and an example use case of it? How does this differ from what is currently supported?
**For Bug Reports**:
- Provide a short description of the bug.
- Provide a *reproducible example*--what is the command you run with our library that results in this error? Have you tried any other steps to resolve it?
- Provide a *full error traceback* of the error that occurs, if applicable. A one-line error message or small screenshot snippet is unhelpful without the surrounding context.
- Note what version of the codebase you are using, and any specifics of your environment and setup that may be relevant.
**For Requesting New Tasks**:
- Provide a 1-2 sentence description of what the task is and what it evaluates.
- Provide a link to the paper introducing the task.
- Provide a link to where the dataset can be found.
- Provide a link to a paper containing results on an open-source model on the task, for use in comparisons and implementation validation.
- If applicable, link to any codebase that has implemented the task (especially the original publication's codebase, if existent).
## How Can I Get Involved?
To quickly get started, we maintain a list of good first issues, which can be found [on our project board](https://github.com/orgs/EleutherAI/projects/25/views/8) or by [filtering GH Issues](https://github.com/EleutherAI/lm-evaluation-harness/issues?q=is%3Aopen+label%3A%22good+first+issue%22+label%3A%22help+wanted%22). These are typically smaller code changes or self-contained features which can be added without extensive familiarity with library internals, and we recommend new contributors consider taking a stab at one of these first if they are feeling uncertain where to begin.
There are a number of distinct ways to contribute to LM Evaluation Harness, and all are extremely helpful! A sampling of ways to contribute include:
- **Implementing and verifying new evaluation tasks**: Is there a task you'd like to see LM Evaluation Harness support? Consider opening an issue requesting it, or helping add it! Verifying and cross-checking task implementations with their original versions is also a very valuable form of assistance in ensuring standardized evaluation.
- **Improving documentation** - Improvements to the documentation, or noting pain points / gaps in documentation, are helpful in order for us to improve the user experience of the library and clarity + coverage of documentation.
- **Testing and devops** - We are very grateful for any assistance in adding tests for the library that can be run for new PRs, and other devops workflows.
- **Adding new modeling / inference library integrations** - We hope to support a broad range of commonly-used inference libraries popular among the community, and welcome PRs for new integrations, so long as they are documented properly and maintainable.
- **Proposing or Contributing New Features** - We want LM Evaluation Harness to support a broad range of evaluation usecases. If you have a feature that is not currently supported but desired, feel free to open an issue describing the feature and, if applicable, how you intend to implement it. We would be happy to give feedback on the cleanest way to implement new functionalities and are happy to coordinate with interested contributors via GH discussions or via discord.
We hope that this has been helpful, and appreciate your interest in contributing! Further questions can be directed to [our Discord](discord.gg/eleutherai).
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment