Commit c4b0c0cb authored by Baber

Merge branch 'main' into metrics

# Conflicts:
#	lm_eval/models/vllm_causallms.py
#	pyproject.toml
parents 6b20ae8c de496b80
@@ -2,7 +2,6 @@ tag:
   - afrobench_sentiment_tasks
   - nollysenti_prompt_2
 dataset_path: Davlan/nollysenti
-dataset_kwargs: {trust_remote_code: True}
 output_type: multiple_choice
 validation_split: validation
 test_split: test
@@ -2,7 +2,6 @@ tag:
   - afrobench_sentiment_tasks
   - nollysenti_prompt_3
 dataset_path: Davlan/nollysenti
-dataset_kwargs: {trust_remote_code: True}
 output_type: multiple_choice
 validation_split: validation
 test_split: test
@@ -2,7 +2,6 @@ tag:
   - afrobench_sentiment_tasks
   - nollysenti_prompt_4
 dataset_path: Davlan/nollysenti
-dataset_kwargs: {trust_remote_code: True}
 output_type: multiple_choice
 validation_split: validation
 test_split: test
@@ -2,7 +2,6 @@ tag:
   - afrobench_sentiment_tasks
   - nollysenti_prompt_5
 dataset_path: Davlan/nollysenti
-dataset_kwargs: {trust_remote_code: True}
 output_type: multiple_choice
 validation_split: validation
 test_split: test
@@ -4,7 +4,6 @@ tag:
   - ntrex_afr-eng_prompt_1
   - afrobench_MT_tasks
 dataset_path: masakhane/ntrex_african
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: test
 fewshot_split: test
@@ -4,7 +4,6 @@ tag:
   - ntrex_eng-afr_prompt_1
   - afrobench_MT_tasks
 dataset_path: masakhane/ntrex_african
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: test
 fewshot_split: test
@@ -3,7 +3,6 @@ tag:
   - ntrex_afr-eng_prompt_2
   - afrobench_MT_tasks
 dataset_path: masakhane/ntrex_african
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: test
 fewshot_split: test
@@ -3,7 +3,6 @@ tag:
   - ntrex_eng-afr_prompt_2
   - afrobench_MT_tasks
 dataset_path: masakhane/ntrex_african
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: test
 fewshot_split: test
@@ -3,7 +3,6 @@ tag:
   - ntrex_afr-eng_prompt_3
   - afrobench_MT_tasks
 dataset_path: masakhane/ntrex_african
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: test
 fewshot_split: test
@@ -3,7 +3,6 @@ tag:
   - ntrex_eng-afr_prompt_3
   - afrobench_MT_tasks
 dataset_path: masakhane/ntrex_african
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: test
 fewshot_split: test
@@ -3,7 +3,6 @@ tag:
   - salt_prompt_1
   - afrobench_MT_tasks
 dataset_path: Sunbird/salt
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: dev
 fewshot_split: dev
@@ -3,7 +3,6 @@ tag:
   - salt_prompt_2
   - afrobench_MT_tasks
 dataset_path: Sunbird/salt
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: dev
 fewshot_split: dev
@@ -3,7 +3,6 @@ tag:
   - salt_prompt_3
   - afrobench_MT_tasks
 dataset_path: Sunbird/salt
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: dev
 fewshot_split: dev
# AIME

The AIME (American Invitational Mathematics Examination) tasks evaluate competition-level math word problems whose answers are integers; responses are scored with exact match after answer extraction and normalization.
### Citation
```text
@dataset{aime_1983_2024,
author = {Hemish Veeraboina},
title = {AIME Problem Set 1983-2024},
year = {2024},
publisher = {Kaggle},
url = {https://www.kaggle.com/datasets/hemishveeraboina/aime-problem-set-1983-2024}
}
@dataset{aime_2024,
author = {Maxwell Jia},
title = {AIME Problem Set 2024},
year = {2024},
publisher = {Huggingface},
url = {https://huggingface.co/datasets/Maxwell-Jia/AIME_2024}
}
@dataset{aime_2025,
author = {math-ai},
title = {AIME Problem Set 2025},
year = {2025},
publisher = {Huggingface},
url = {https://huggingface.co/datasets/math-ai/aime25}
}
```
### Groups, Tags, and Tasks
#### Groups
* `math_word_problems`
#### Tasks
* `aime`: AIME 1983-2024 problems
* `aime24`: AIME 2024 problems
* `aime25`: AIME 2025 problems
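
These tasks can be run through the usual harness entry points; below is a minimal Python sketch (the model checkpoint is only a placeholder, and the `lm_eval` CLI works equivalently):

```python
# Sketch: run the aime24 task end to end; any HF causal LM can stand in
# for the placeholder checkpoint below.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=Qwen/Qwen2.5-Math-1.5B-Instruct,dtype=bfloat16",
    tasks=["aime24"],
    batch_size=1,
)
print(results["results"]["aime24"])  # includes the exact_match score
```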
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
tag:
- math_word_problems
task: aime
dataset_path: gneubig/aime-1983-2024
# dataset_name: null
output_type: generate_until
training_split: train
fewshot_split: train
test_split: train
doc_to_text: "Question: {{Question}}\nAnswer:"
doc_to_target: "{{Answer}}"
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
generation_kwargs:
until:
- "Question:"
- "</s>"
- "<|im_end|>"
- "<|eot_id|>"
do_sample: false
temperature: 0.0
max_gen_toks: 32768
repeats: 1
num_fewshot: 0
metadata:
version: 0.0
tag:
- math_word_problems
task: aime24
dataset_path: Maxwell-Jia/AIME_2024
# dataset_name: null
output_type: generate_until
training_split: train
fewshot_split: train
test_split: train
doc_to_text: "Question: {{Problem}}\nAnswer:"
doc_to_target: "{{Answer}}"
process_results: !function utils.process_results
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
generation_kwargs:
until:
- "Question:"
- "</s>"
- "<|im_end|>"
- "<|eot_id|>"
do_sample: false
temperature: 0.0
max_gen_toks: 32768
repeats: 1
num_fewshot: 0
metadata:
version: 0.0
tag:
- math_word_problems
task: aime25
dataset_path: math-ai/aime25
# dataset_name: null
output_type: generate_until
training_split: test
fewshot_split: test
test_split: test
doc_to_text: "Question: {{problem}}\nAnswer:"
doc_to_target: "{{answer}}"
process_results: !function utils.process_results
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
generation_kwargs:
until:
- "Question:"
- "</s>"
- "<|im_end|>"
- "<|eot_id|>"
do_sample: false
temperature: 0.0
max_gen_toks: 32768
repeats: 1
num_fewshot: 0
metadata:
version: 0.0
import re
from typing import Dict, List
def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
retval = 0
response = results[0]
# Try to extract answer from $...$ format first
indices = [pos for pos, char in enumerate(response) if char == "$"]
if len(indices) <= 1:
answer = response
else:
answer = response[indices[0] + 1 : indices[-1]]
# Extract from \\boxed{} if present
boxed_answer = last_boxed_only_string(response)
if boxed_answer is not None:
try:
boxed_content = remove_boxed(boxed_answer)
if boxed_content is not None:
answer = boxed_content
except (AssertionError, IndexError):
pass
# Check if answer matches target
answer_key = next(k for k in doc.keys() if k.lower() == "answer")
target = str(doc[answer_key])
if is_equiv(answer, target):
retval = 1
return {"exact_match": retval}
# string normalization from https://github.com/EleutherAI/lm-evaluation-harness/blob/master/lm_eval/tasks/hendrycks_math.py
def is_equiv(str1, str2, verbose=False):
if str1 is None and str2 is None:
print("WARNING: Both None")
return True
if str1 is None or str2 is None:
return False
try:
ss1 = strip_string(str1)
ss2 = strip_string(str2)
if verbose:
print(ss1, ss2)
return ss1 == ss2
except Exception:
return str1 == str2
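# Examples of the equivalences this normalization yields:
#   is_equiv("\\frac12", "1/2")  -> True   (both strip to "\\frac{1}{2}")
#   is_equiv("0.5", "1/2")       -> True   ("0.5" is special-cased in strip_string)
#   is_equiv("072", "72")        -> False  (pure string comparison, no numeric parsing)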
def remove_boxed(s):
if "\\boxed " in s:
left = "\\boxed "
assert s[: len(left)] == left
return s[len(left) :]
left = "\\boxed{"
assert s[: len(left)] == left
assert s[-1] == "}"
return s[len(left) : -1]
def last_boxed_only_string(string):
idx = string.rfind("\\boxed")
if "\\boxed " in string:
return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
if idx < 0:
idx = string.rfind("\\fbox")
if idx < 0:
return None
i = idx
right_brace_idx = None
num_left_braces_open = 0
while i < len(string):
if string[i] == "{":
num_left_braces_open += 1
if string[i] == "}":
num_left_braces_open -= 1
if num_left_braces_open == 0:
right_brace_idx = i
break
i += 1
if right_brace_idx is None:
retval = None
else:
retval = string[idx : right_brace_idx + 1]
return retval
def fix_fracs(string):
substrs = string.split("\\frac")
new_str = substrs[0]
if len(substrs) > 1:
substrs = substrs[1:]
for substr in substrs:
new_str += "\\frac"
if substr[0] == "{":
new_str += substr
else:
try:
assert len(substr) >= 2
except AssertionError:
return string
a = substr[0]
b = substr[1]
if b != "{":
if len(substr) > 2:
post_substr = substr[2:]
new_str += "{" + a + "}{" + b + "}" + post_substr
else:
new_str += "{" + a + "}{" + b + "}"
else:
if len(substr) > 2:
post_substr = substr[2:]
new_str += "{" + a + "}" + b + post_substr
else:
new_str += "{" + a + "}" + b
string = new_str
return string
def fix_a_slash_b(string):
if len(string.split("/")) != 2:
return string
a = string.split("/")[0]
b = string.split("/")[1]
try:
a = int(a)
b = int(b)
assert string == "{}/{}".format(a, b)
new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
return new_string
except AssertionError:
return string
def remove_right_units(string):
# "\\text{ " only ever occurs (at least in the val set) when describing units
if "\\text{ " in string:
splits = string.split("\\text{ ")
assert len(splits) == 2
return splits[0]
else:
return string
def fix_sqrt(string):
if "\\sqrt" not in string:
return string
splits = string.split("\\sqrt")
new_string = splits[0]
for split in splits[1:]:
if split[0] != "{":
a = split[0]
new_substr = "\\sqrt{" + a + "}" + split[1:]
else:
new_substr = "\\sqrt" + split
new_string += new_substr
return new_string
def strip_string(string):
# linebreaks
string = string.replace("\n", "")
# remove inverse spaces
string = string.replace("\\!", "")
# replace \\ with \
string = string.replace("\\\\", "\\")
# replace tfrac and dfrac with frac
string = string.replace("tfrac", "frac")
string = string.replace("dfrac", "frac")
# remove \left and \right
string = string.replace("\\left", "")
string = string.replace("\\right", "")
# Remove circ (degrees)
string = string.replace("^{\\circ}", "")
string = string.replace("^\\circ", "")
# remove dollar signs
string = string.replace("\\$", "")
# remove units (on the right)
string = remove_right_units(string)
# remove percentage
string = string.replace("\\%", "")
string = string.replace("\%", "") # noqa: W605
# " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
string = string.replace(" .", " 0.")
string = string.replace("{.", "{0.")
# if empty, return empty string
if len(string) == 0:
return string
if string[0] == ".":
string = "0" + string
# to consider: get rid of e.g. "k = " or "q = " at beginning
if len(string.split("=")) == 2:
if len(string.split("=")[0]) <= 2:
string = string.split("=")[1]
# fix sqrt3 --> sqrt{3}
string = fix_sqrt(string)
# remove spaces
string = string.replace(" ", "")
# \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
string = fix_fracs(string)
# manually change 0.5 --> \frac{1}{2}
if string == "0.5":
string = "\\frac{1}{2}"
# NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
string = fix_a_slash_b(string)
return string
# Babilong
### Paper
Title: BABILong: Testing the Limits of LLMs with Long Context Reasoning-in-a-Haystack
Abstract: https://arxiv.org/abs/2406.10149
In recent years, the input context sizes of large language models (LLMs) have increased dramatically. However, existing evaluation methods have not kept pace, failing to comprehensively assess the efficiency of models in handling long contexts. To bridge this gap, we introduce the BABILong benchmark, designed to test language models' ability to reason across facts distributed in extremely long documents. BABILong includes a diverse set of 20 reasoning tasks, including fact chaining, simple induction, deduction, counting, and handling lists/sets. These tasks are challenging on their own, and even more demanding when the required facts are scattered across long natural text. Our evaluations show that popular LLMs effectively utilize only 10-20% of the context and their performance declines sharply with increased reasoning complexity. Among alternatives to in-context reasoning, Retrieval-Augmented Generation methods achieve a modest 60% accuracy on single-fact question answering, independent of context length. Among context extension methods, the highest performance is demonstrated by recurrent memory transformers after fine-tuning, enabling the processing of lengths up to 50 million tokens. The BABILong benchmark is extendable to any length to support the evaluation of new upcoming models with increased capabilities, and we provide splits up to 10 million token lengths.
Homepage: https://github.com/booydar/babilong
### Citation
```
@article{kuratov2024babilong,
title={BABILong: Testing the Limits of LLMs with Long Context Reasoning-in-a-Haystack},
author={Kuratov, Yuri and Bulatov, Aydar and Anokhin, Petr and Rodkin, Ivan and Sorokin, Dmitry and Burtsev, Mikhail},
journal={arXiv preprint arXiv:2406.10149},
year={2024}
}
```
### Groups and Tasks
#### Groups
* `babilong`: All BABILong tasks at 0k context length
* `babilong_longctx`: BABILong tasks qa1 through qa5 at context lengths up to 128k
#### Tasks
The benchmark covers 20 reasoning tasks, each with 1000 samples per context length:
**QA Tasks (qa1-qa20):**
* `babilong_qa1`: Single supporting fact QA
* `babilong_qa2`: Two supporting facts QA
* `babilong_qa3`: Three supporting facts QA
* `babilong_qa4`: Two argument relations
* `babilong_qa5`: Three argument relations
* `babilong_qa6`: Yes/No questions
* `babilong_qa7`: Counting
* `babilong_qa8`: Lists and sets
* `babilong_qa9`: Simple negation
* `babilong_qa10`: Indefinite knowledge
* `babilong_qa11`: Track person through temporal references
* `babilong_qa12`: Conjunction
* `babilong_qa13`: Compound coreference
* `babilong_qa14`: Time reasoning
* `babilong_qa15`: Basic deduction
* `babilong_qa16`: Basic induction
* `babilong_qa17`: Positional reasoning
* `babilong_qa18`: Size reasoning
* `babilong_qa19`: Path finding
* `babilong_qa20`: Motivation deduction
> [!NOTE]
> When using the BABILong tasks, please note:
> 1. This implementation uses the dataset with 1000 samples per context length. You can change the dataset path to `RMT-team/babilong` in `common_utils.py` to use the variant with 100 samples per length, which supports context lengths up to 10M tokens.
> 2. For tasks qa1-qa5, the supported lengths are 0k, 1k, 2k, 4k, 8k, 16k, 32k, 64k, and 128k tokens. Tasks qa6-qa20 are only available at 0k.
> 3. The default maximum sequence length is 0k. To compute metrics at other maximum sequence lengths, specify the additional lengths via the metadata parameter, e.g.
> `--metadata '{"max_seq_lengths":"0k,1k,2k,4k,8k,16k,32k,128k"}'`. Each config takes only one context length at a time. The metadata parameter can also be passed to the `TaskManager` (`metadata: dict`), as sketched below.
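
For example, a minimal Python sketch of that flow (the checkpoint name below is only a placeholder):

```python
# Sketch: evaluate babilong_qa1 at several context lengths by passing
# max_seq_lengths through the task metadata (mirrors the --metadata CLI flag).
import lm_eval
from lm_eval.tasks import TaskManager

task_manager = TaskManager(metadata={"max_seq_lengths": "0k,1k,2k,4k"})
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-1.4b",  # placeholder checkpoint
    tasks=["babilong_qa1"],
    task_manager=task_manager,
)
print(results["results"])
```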
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
dataset_path: RMT-team/babilong-1k-samples
output_type: generate_until
doc_to_target: "{{target}}"
target_delimiter: " "
num_fewshot: 2
process_results: !function common_utils.process_results
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
generation_kwargs:
do_sample: false
temperature: 0.0
max_gen_toks: 16
until: []
metadata:
version: 0.0