Commit abd17276 authored by Baber

Merge branch 'smolrefact' into tasklist

# Conflicts:
#	lm_eval/__main__.py
#	lm_eval/api/group.py
#	lm_eval/api/task.py
#	lm_eval/evaluator_utils.py
#	lm_eval/tasks/__init__.py
#	lm_eval/utils.py
#	pyproject.toml
parents 00afd536 70314843
```diff
@@ -3,7 +3,6 @@ tag:
 - salt_prompt_1
 - afrobench_MT_tasks
 dataset_path: Sunbird/salt
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: dev
 fewshot_split: dev
@@ -3,7 +3,6 @@ tag:
 - salt_prompt_2
 - afrobench_MT_tasks
 dataset_path: Sunbird/salt
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: dev
 fewshot_split: dev
@@ -3,7 +3,6 @@ tag:
 - salt_prompt_3
 - afrobench_MT_tasks
 dataset_path: Sunbird/salt
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: dev
 fewshot_split: dev
```
# AIME
### Citation
```text
@dataset{aime_1983_2024,
author = {Hemish Veeraboina},
title = {AIME Problem Set 1983-2024},
year = {2024},
publisher = {Kaggle},
url = {https://www.kaggle.com/datasets/hemishveeraboina/aime-problem-set-1983-2024}
}
@dataset{aime_2024,
author = {Maxwell Jia},
title = {AIME Problem Set 2024},
year = {2024},
    publisher = {Hugging Face},
url = {https://huggingface.co/datasets/Maxwell-Jia/AIME_2024}
}
@dataset{aime_2025,
author = {math-ai},
title = {AIME Problem Set 2025},
year = {2025},
    publisher = {Hugging Face},
url = {https://huggingface.co/datasets/math-ai/aime25}
}
```
### Groups, Tags, and Tasks
#### Tags
* `math_word_problems`
#### Tasks
* `aime`: `AIME 1983-2024 problems`
* `aime24`: `AIME 2024 problems`
* `aime25`: `AIME 2025 problems`
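These tasks can be selected by name through the harness. Below is a minimal sketch using the `lm_eval.simple_evaluate` Python entry point; the model checkpoint is only an illustrative placeholder, not part of this task definition:

```python
# Hedged sketch: run the aime24 task programmatically via lm-evaluation-harness.
# The pretrained checkpoint below is a placeholder chosen for illustration.
from lm_eval import simple_evaluate

results = simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["aime24"],
)
print(results["results"]["aime24"])
```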
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
tag:
  - math_word_problems
task: aime
dataset_path: gneubig/aime-1983-2024
# dataset_name: null
output_type: generate_until
training_split: train
fewshot_split: train
test_split: train
doc_to_text: "Question: {{Question}}\nAnswer:"
doc_to_target: "{{Answer}}"
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
generation_kwargs:
  until:
    - "Question:"
    - "</s>"
    - "<|im_end|>"
    - "<|eot_id|>"
  do_sample: false
  temperature: 0.0
  max_gen_toks: 32768
repeats: 1
num_fewshot: 0
metadata:
  version: 0.0
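For intuition, `doc_to_text` above is a Jinja template applied to each dataset row. A minimal sketch of the rendering, using `jinja2` directly rather than the harness's own templating machinery (the row values here are made up):

```python
from jinja2 import Template

# Hypothetical dataset row; real rows come from gneubig/aime-1983-2024.
doc = {"Question": "What is 2 + 2?", "Answer": "4"}
prompt = Template("Question: {{Question}}\nAnswer:").render(**doc)
print(prompt)  # -> "Question: What is 2 + 2?\nAnswer:"
```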
tag:
  - math_word_problems
task: aime24
dataset_path: Maxwell-Jia/AIME_2024
# dataset_name: null
output_type: generate_until
training_split: train
fewshot_split: train
test_split: train
doc_to_text: "Question: {{Problem}}\nAnswer:"
doc_to_target: "{{Answer}}"
process_results: !function utils.process_results
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
generation_kwargs:
  until:
    - "Question:"
    - "</s>"
    - "<|im_end|>"
    - "<|eot_id|>"
  do_sample: false
  temperature: 0.0
  max_gen_toks: 32768
repeats: 1
num_fewshot: 0
metadata:
  version: 0.0
tag:
  - math_word_problems
task: aime25
dataset_path: math-ai/aime25
# dataset_name: null
output_type: generate_until
training_split: test
fewshot_split: test
test_split: test
doc_to_text: "Question: {{problem}}\nAnswer:"
doc_to_target: "{{answer}}"
process_results: !function utils.process_results
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
generation_kwargs:
  until:
    - "Question:"
    - "</s>"
    - "<|im_end|>"
    - "<|eot_id|>"
  do_sample: false
  temperature: 0.0
  max_gen_toks: 32768
repeats: 1
num_fewshot: 0
metadata:
  version: 0.0
from typing import Dict, List


def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
    retval = 0
    response = results[0]
    # Try to extract the answer from a $...$ span first
    indices = [pos for pos, char in enumerate(response) if char == "$"]
    if len(indices) <= 1:
        answer = response
    else:
        answer = response[indices[0] + 1 : indices[-1]]
    # Prefer the content of \boxed{...} if present
    boxed_answer = last_boxed_only_string(response)
    if boxed_answer is not None:
        try:
            boxed_content = remove_boxed(boxed_answer)
            if boxed_content is not None:
                answer = boxed_content
        except (AssertionError, IndexError):
            pass
    # Compare the extracted answer against the target ("Answer" or "answer" column)
    answer_key = next(k for k in doc.keys() if k.lower() == "answer")
    target = str(doc[answer_key])
    if is_equiv(answer, target):
        retval = 1
    return {"exact_match": retval}
# string normalization from https://github.com/EleutherAI/lm-evaluation-harness/blob/master/lm_eval/tasks/hendrycks_math.py
def is_equiv(str1, str2, verbose=False):
    if str1 is None and str2 is None:
        print("WARNING: Both None")
        return True
    if str1 is None or str2 is None:
        return False
    try:
        ss1 = strip_string(str1)
        ss2 = strip_string(str2)
        if verbose:
            print(ss1, ss2)
        return ss1 == ss2
    except Exception:
        return str1 == str2


def remove_boxed(s):
    if "\\boxed " in s:
        left = "\\boxed "
        assert s[: len(left)] == left
        return s[len(left) :]
    left = "\\boxed{"
    assert s[: len(left)] == left
    assert s[-1] == "}"
    return s[len(left) : -1]


def last_boxed_only_string(string):
    idx = string.rfind("\\boxed")
    if "\\boxed " in string:
        return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
    if idx < 0:
        idx = string.rfind("\\fbox")
        if idx < 0:
            return None
    i = idx
    right_brace_idx = None
    num_left_braces_open = 0
    while i < len(string):
        if string[i] == "{":
            num_left_braces_open += 1
        if string[i] == "}":
            num_left_braces_open -= 1
            if num_left_braces_open == 0:
                right_brace_idx = i
                break
        i += 1
    if right_brace_idx is None:
        retval = None
    else:
        retval = string[idx : right_brace_idx + 1]
    return retval
def fix_fracs(string):
    substrs = string.split("\\frac")
    new_str = substrs[0]
    if len(substrs) > 1:
        substrs = substrs[1:]
        for substr in substrs:
            new_str += "\\frac"
            if substr[0] == "{":
                new_str += substr
            else:
                try:
                    assert len(substr) >= 2
                except AssertionError:
                    return string
                a = substr[0]
                b = substr[1]
                if b != "{":
                    if len(substr) > 2:
                        post_substr = substr[2:]
                        new_str += "{" + a + "}{" + b + "}" + post_substr
                    else:
                        new_str += "{" + a + "}{" + b + "}"
                else:
                    if len(substr) > 2:
                        post_substr = substr[2:]
                        new_str += "{" + a + "}" + b + post_substr
                    else:
                        new_str += "{" + a + "}" + b
    string = new_str
    return string
def fix_a_slash_b(string):
    if len(string.split("/")) != 2:
        return string
    a = string.split("/")[0]
    b = string.split("/")[1]
    try:
        a = int(a)
        b = int(b)
        assert string == "{}/{}".format(a, b)
        new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
        return new_string
    # int() raises ValueError for non-integer parts, so catch it alongside
    # the assertion failure instead of letting it propagate.
    except (ValueError, AssertionError):
        return string
def remove_right_units(string):
    # "\\text{ " only ever occurs (at least in the val set) when describing units
    if "\\text{ " in string:
        splits = string.split("\\text{ ")
        assert len(splits) == 2
        return splits[0]
    else:
        return string


def fix_sqrt(string):
    if "\\sqrt" not in string:
        return string
    splits = string.split("\\sqrt")
    new_string = splits[0]
    for split in splits[1:]:
        if split[0] != "{":
            a = split[0]
            new_substr = "\\sqrt{" + a + "}" + split[1:]
        else:
            new_substr = "\\sqrt" + split
        new_string += new_substr
    return new_string
def strip_string(string):
    # linebreaks
    string = string.replace("\n", "")
    # remove inverse spaces
    string = string.replace("\\!", "")
    # replace \\ with \
    string = string.replace("\\\\", "\\")
    # replace tfrac and dfrac with frac
    string = string.replace("tfrac", "frac")
    string = string.replace("dfrac", "frac")
    # remove \left and \right
    string = string.replace("\\left", "")
    string = string.replace("\\right", "")
    # Remove circ (degrees)
    string = string.replace("^{\\circ}", "")
    string = string.replace("^\\circ", "")
    # remove dollar signs
    string = string.replace("\\$", "")
    # remove units (on the right)
    string = remove_right_units(string)
    # remove percentage
    string = string.replace("\\%", "")
    string = string.replace("\%", "")  # noqa: W605
    # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
    string = string.replace(" .", " 0.")
    string = string.replace("{.", "{0.")
    # if empty, return empty string
    if len(string) == 0:
        return string
    if string[0] == ".":
        string = "0" + string
    # to consider: get rid of e.g. "k = " or "q = " at beginning
    if len(string.split("=")) == 2:
        if len(string.split("=")[0]) <= 2:
            string = string.split("=")[1]
    # fix sqrt3 --> sqrt{3}
    string = fix_sqrt(string)
    # remove spaces
    string = string.replace(" ", "")
    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
    string = fix_fracs(string)
    # manually change 0.5 --> \frac{1}{2}
    if string == "0.5":
        string = "\\frac{1}{2}"
    # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
    string = fix_a_slash_b(string)
    return string
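A quick, hypothetical sanity check of the helpers above, assuming this file is importable as `utils` (this is not part of the harness test suite):

```python
# Hypothetical usage of the normalization helpers; the module name is assumed.
from utils import is_equiv, last_boxed_only_string, remove_boxed

response = "So the final answer is $\\boxed{204}$."
assert remove_boxed(last_boxed_only_string(response)) == "204"
# strip_string canonicalizes both sides, so 0.5 and \frac{1}{2} compare equal.
assert is_equiv("0.5", "\\frac{1}{2}")
```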
# BABILong
### Paper
Title: BABILong: Testing the Limits of LLMs with Long Context Reasoning-in-a-Haystack
Abstract: https://arxiv.org/abs/2406.10149
In recent years, the input context sizes of large language models (LLMs) have increased dramatically. However, existing evaluation methods have not kept pace, failing to comprehensively assess the efficiency of models in handling long contexts. To bridge this gap, we introduce the BABILong benchmark, designed to test language models' ability to reason across facts distributed in extremely long documents. BABILong includes a diverse set of 20 reasoning tasks, including fact chaining, simple induction, deduction, counting, and handling lists/sets. These tasks are challenging on their own, and even more demanding when the required facts are scattered across long natural text. Our evaluations show that popular LLMs effectively utilize only 10-20% of the context and their performance declines sharply with increased reasoning complexity. Among alternatives to in-context reasoning, Retrieval-Augmented Generation methods achieve a modest 60% accuracy on single-fact question answering, independent of context length. Among context extension methods, the highest performance is demonstrated by recurrent memory transformers after fine-tuning, enabling the processing of lengths up to 50 million tokens. The BABILong benchmark is extendable to any length to support the evaluation of new upcoming models with increased capabilities, and we provide splits up to 10 million token lengths.
Homepage: https://github.com/booydar/babilong
### Citation
```
@article{kuratov2024babilong,
    title={BABILong: Testing the Limits of LLMs with Long Context Reasoning-in-a-Haystack},
author={Kuratov, Yuri and Bulatov, Aydar and Anokhin, Petr and Rodkin, Ivan and Sorokin, Dmitry and Burtsev, Mikhail},
journal={arXiv preprint arXiv:2406.10149},
year={2024}
}
```
### Groups and Tasks
#### Groups
* `babilong`: All Babilong tasks at 0k context length
* `babilong_longctx`: Babilong tasks qa1 through qa5 at context lengths up to 128k
#### Tasks
The benchmark includes 1000 samples for each of 20 reasoning tasks at various context lengths:
**QA Tasks (qa1-qa20):**
* `babilong_qa1`: Single supporting fact QA
* `babilong_qa2`: Two supporting facts QA
* `babilong_qa3`: Three supporting facts QA
* `babilong_qa4`: Two argument relations
* `babilong_qa5`: Three argument relations
* `babilong_qa6`: Yes/No questions
* `babilong_qa7`: Counting
* `babilong_qa8`: Lists and sets
* `babilong_qa9`: Simple negation
* `babilong_qa10`: Indefinite knowledge
* `babilong_qa11`: Track person through temporal references
* `babilong_qa12`: Conjunction
* `babilong_qa13`: Compound coreference
* `babilong_qa14`: Time reasoning
* `babilong_qa15`: Basic deduction
* `babilong_qa16`: Basic induction
* `babilong_qa17`: Positional reasoning
* `babilong_qa18`: Size reasoning
* `babilong_qa19`: Path finding
* `babilong_qa20`: Motivation deduction
> [!NOTE]
> When using babilong tasks, please note:
> 1. This implementation uses 1000 samples per context length. To use the dataset with 100 samples per length, which supports context lengths up to 10M tokens, change the dataset path to `RMT-team/babilong` in `common_utils.py`.
> 2. Supported lengths are 0k, 1k, 2k, 4k, 8k, 16k, 32k, 64k, and 128k tokens for tasks qa1-qa5. Tasks qa6-qa20 are only available at 0k.
> 3. The default maximum sequence length is 0k. To compute metrics at other maximum sequence lengths, pass them via the metadata parameter, e.g.
> `--metadata '{"max_seq_lengths":"0k,1k,2k,4k,8k,16k,32k,128k"}'`. Each config takes only one context length at a time. The same metadata dict can also be passed to the TaskManager (see the sketch below).
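A minimal sketch of passing the same metadata through the Python API; the model and task choices are placeholders, and the keyword names assume a recent harness version:

```python
# Hedged sketch: select a longer max sequence length for babilong via metadata.
from lm_eval import simple_evaluate
from lm_eval.tasks import TaskManager

task_manager = TaskManager(metadata={"max_seq_lengths": "8k"})
results = simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",  # placeholder checkpoint
    tasks=["babilong_qa1"],
    task_manager=task_manager,
)
```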
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
dataset_path: RMT-team/babilong-1k-samples
output_type: generate_until
doc_to_target: "{{target}}"
target_delimiter: " "
num_fewshot: 2
process_results: !function common_utils.process_results
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
generation_kwargs:
  do_sample: false
  temperature: 0.0
  max_gen_toks: 16
  until: []
metadata:
  version: 0.0
group: babilong
task:
  - babilong_qa1
  - babilong_qa2
  - babilong_qa3
  - babilong_qa4
  - babilong_qa5
  - babilong_qa6
  - babilong_qa7
  - babilong_qa8
  - babilong_qa9
  - babilong_qa10
  - babilong_qa11
  - babilong_qa12
  - babilong_qa13
  - babilong_qa14
  - babilong_qa15
  - babilong_qa16
  - babilong_qa17
  - babilong_qa18
  - babilong_qa19
  - babilong_qa20
aggregate_metric_list:
  - metric: acc
    weight_by_size: True
metadata:
  version: 0.0
group: babilong_longctx
task:
  - babilong_qa1
  - babilong_qa2
  - babilong_qa3
  - babilong_qa4
  - babilong_qa5
aggregate_metric_list:
  - metric: acc
    weight_by_size: True
metadata:
  version: 0.0
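`weight_by_size: True` makes the group score a size-weighted (micro) average over subtasks rather than a plain mean of subtask scores. A toy illustration of the arithmetic, with made-up subtask sizes and accuracies:

```python
# Hypothetical subtask sizes and accuracies, for illustration only.
sizes = [1000, 1000, 500]
accs = [0.80, 0.60, 0.90]

weighted = sum(n * a for n, a in zip(sizes, accs)) / sum(sizes)  # 0.74
unweighted = sum(accs) / len(accs)                               # ~0.767
print(weighted, unweighted)
```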
include: _babilong_common_yaml
task: babilong_qa1
test_split: qa1
custom_dataset: !function common_utils.load_dataset
dataset_kwargs:
  qa_split: qa1
description: "I will give you context with the facts about positions of different persons hidden in some random text and a question. You need to answer the question based only on the information from the facts. If a person was in different locations, use the latest location to answer the question.\nAlways return your answer in the following format:\nThe most recent location of 'person' is 'location'. Do not write anything else after that.\n\n"
doc_to_text: "{{input.strip()}}\n{{question.strip()}}"
fewshot_config:
  sampler: first_n
  samples:
    - input: "Charlie went to the hallway. Judith come back to the kitchen. Charlie travelled to balcony."
      question: "Where is Charlie?"
      target: "The most recent location of Charlie is balcony."
    - input: "Alan moved to the garage. Charlie went to the beach. Alan went to the shop. Rouse travelled to balcony."
      question: "Where is Alan?"
      target: "The most recent location of Alan is shop."
include: _babilong_common_yaml
task: babilong_qa10
test_split: qa10
custom_dataset: !function common_utils.load_dataset
dataset_kwargs:
  qa_split: qa10
description: "I will give you context with the facts about people and their locations hidden in some random text and a question. You need to answer the question based only on the information from the facts.\nIf a person was in different locations, use the latest location the person was in to answer the question.\nYour answer should contain only one word - $yes$ or $no$ or $maybe$. Do not write anything else. Do not explain your answer.\n\n"
doc_to_text: "{{input.strip()}}\n{{question.strip()}}"
fewshot_config:
  sampler: first_n
  samples:
    - input: "Bill is in the kitchen. Julie is either in the school or the cinema."
      question: "Is Bill in the bedroom?"
      target: "no"
    - input: "Fred is in the bedroom. Mary is either in the school or the cinema."
      question: "Is Mary in the school?"
      target: "maybe"
    - input: "Fred is either in the kitchen or the park. Bill moved to the cinema."
      question: "Is Bill in the cinema?"
      target: "yes"
include: _babilong_common_yaml
task: babilong_qa11
test_split: qa11
dataset_name: 0k
description: "I will give you context with the facts about people and their locations hidden in some random text and a question. You need to answer the question based only on the information from the facts.\nIf a person was in different locations, use the latest location the person was in to answer the question.\nYour answer should contain only one word - location. Do not write anything else after that. Do not explain your answer.\n\n"
doc_to_text: "{{input.strip()}}\n{{question.strip()}}"
fewshot_config:
  sampler: first_n
  samples:
    - input: "Daniel journeyed to the hallway. After that he journeyed to the garden."
      question: "Where is Daniel?"
      target: "garden"
    - input: "Mary moved to the office. Afterwards she journeyed to the kitchen. Daniel went to the hallway. Then he journeyed to the garden."
      question: "Where is Mary?"
      target: "kitchen"
    - input: "Sandra moved to the kitchen. After that she went back to the hallway. Sandra moved to the bedroom. Then she went to the hallway. Mary moved to the bedroom. Afterwards she travelled to the bathroom."
      question: "Where is Sandra?"
      target: "hallway"
include: _babilong_common_yaml
task: babilong_qa12
test_split: qa12
dataset_name: 0k
description: "I will give you context with the facts about people and their locations hidden in some random text and a question. You need to answer the question based only on the information from the facts.\nIf a person was in different locations, use the latest location the person was in to answer the question.\nYour answer should contain only one word - location. Do not write anything else after that. Do not explain your answer.\n\n"
doc_to_text: "{{input.strip()}}\n{{question.strip()}}"
fewshot_config:
  sampler: first_n
  samples:
    - input: "Mary and Daniel travelled to the bathroom. John and Daniel travelled to the office."
      question: "Where is Daniel?"
      target: "office"
    - input: "Sandra and Mary went back to the office. Daniel and Sandra went to the bedroom. Sandra and Mary travelled to the hallway. John and Mary went to the kitchen."
      question: "Where is Mary?"
      target: "kitchen"
    - input: "Daniel and Sandra went back to the hallway. Daniel and John moved to the office. Daniel and John moved to the garden. Daniel and Mary went back to the bathroom. Daniel and John went back to the kitchen. Daniel and Sandra went to the bathroom."
      question: "Where is John?"
      target: "kitchen"
include: _babilong_common_yaml
task: babilong_qa13
test_split: qa13
dataset_name: 0k
description: "I will give you context with the facts about people and their locations hidden in some random text and a question. You need to answer the question based only on the information from the facts.\nIf a person was in different locations, use the latest location the person was in to answer the question.\nYour answer should contain only one word - location. Do not write anything else after that. Do not explain your answer.\n\n"
doc_to_text: "{{input.strip()}}\n{{question.strip()}}"
fewshot_config:
  sampler: first_n
  samples:
    - input: "Mary and Daniel travelled to the bathroom. Then they journeyed to the hallway."
      question: "Where is Daniel?"
      target: "hallway"
    - input: "Daniel and Sandra travelled to the kitchen. After that they journeyed to the hallway. Mary and Daniel travelled to the bedroom. After that they travelled to the hallway."
      question: "Where is Sandra?"
      target: "hallway"
    - input: "John and Mary moved to the bathroom. Then they travelled to the office. John and Mary went to the kitchen. Afterwards they went to the bedroom. John and Sandra moved to the bathroom. Following that they went back to the kitchen."
      question: "Where is Mary?"
      target: "bedroom"
include: _babilong_common_yaml
task: babilong_qa14
test_split: qa14
dataset_name: 0k
description: "I will give you context with the facts about people and their locations hidden in some random text and a question. You need to answer the question based only on the information from the facts.\nIf a person was in different locations, use the latest location the person was in to answer the question.\nYour answer should contain only one word - location. Do not write anything else after that. Do not explain your answer.\n\n"
doc_to_text: "{{input.strip()}}\n{{question.strip()}}"
fewshot_config:
  sampler: first_n
  samples:
    - input: "Bill went back to the cinema yesterday. Julie went to the school this morning. Fred went to the park yesterday. Yesterday Julie went to the office."
      question: "Where was Julie before the school?"
      target: "office"
    - input: "This morning Fred went to the kitchen. Fred journeyed to the bedroom yesterday. Mary travelled to the bedroom this morning. Yesterday Mary went to the cinema."
      question: "Where was Mary before the bedroom?"
      target: "cinema"
    - input: "Yesterday Julie went back to the park. Julie went to the bedroom this morning. Bill journeyed to the cinema yesterday. This morning Bill went back to the park. This evening Julie went to the school. This afternoon Julie went back to the park."
      question: "Where was Julie before the bedroom?"
      target: "park"
include: _babilong_common_yaml
task: babilong_qa15
test_split: qa15
dataset_name: 0k
description: "I will give you context with the facts about animals, their names and relations. The facts and a question are hidden in some random text. You need to answer the question based only on the information from the facts.\nYour answer should contain only one word - an animal species. Do not write anything else after that. Do not explain your answer.\n\n"
doc_to_text: "{{input.strip()}}\n{{question.strip()}}"
fewshot_config:
  sampler: first_n
  samples:
    - input: "Mice are afraid of wolves. Gertrude is a mouse. Cats are afraid of sheep. Winona is a mouse. Sheep are afraid of wolves. Emily is a mouse. Jessica is a wolf."
      question: "What is gertrude afraid of?"
      target: "wolf"
    - input: "Mice are afraid of wolves. Gertrude is a mouse. Cats are afraid of sheep. Winona is a mouse. Sheep are afraid of wolves. Emily is a mouse. Jessica is a wolf."
      question: "What is jessica afraid of?"
      target: "cat"
    - input: "Mice are afraid of cats. Wolves are afraid of sheep. Emily is a wolf. Cats are afraid of sheep. Gertrude is a wolf. Sheep are afraid of cats. Winona is a wolf."
      question: "What is emily afraid of?"
      target: "sheep"
include: _babilong_common_yaml
task: babilong_qa16
test_split: qa16
dataset_name: 0k
description: "I will give you context with the facts about animals, their names and colors. The facts and a question are hidden in some random text. You need to answer the question based only on the information from the facts.\nYour answer should contain only one word - a color. Do not write anything else after that.\nDo not explain your answer.\n\n"
doc_to_text: "{{input.strip()}}\n{{question.strip()}}"
fewshot_config:
  sampler: first_n
  samples:
    - input: "Lily is a frog. Bernhard is a frog. Bernhard is green. Brian is a lion. Brian is white. Julius is a swan. Julius is green. Lily is green. Greg is a swan."
      question: "What color is Greg?"
      target: "green"
    - input: "Julius is a lion. Lily is a rhino. Bernhard is a swan. Lily is white. Bernhard is green. Greg is a rhino. Greg is gray. Julius is white. Brian is a lion."
      question: "What color is Brian?"
      target: "white"
    - input: "Brian is a rhino. Julius is a lion. Bernhard is a lion. Greg is a swan. Brian is gray. Greg is white. Lily is a rhino. Bernhard is yellow. Lily is gray."
      question: "What color is Julius?"
      target: "yellow"