Commit b58e5556 authored by Baber
Browse files

Merge branch 'main' into tasklist

# Conflicts:
#	pyproject.toml
parents 6e1866f5 4f8195f1
......@@ -12,9 +12,9 @@ def prompt_func(mode, lang):
"prompt_3": f"You are an assistant able to classify topics in texts. \n\n"
f"Given the categories technology, religion, politics, sports, health, entertainment, or business; what is "
f"the topic of the {lang} statement below? Return only the category. "
"\n\ntext: {{headline}} \category:\n\n",
"\n\ntext: {{headline}} \\category:\n\n",
"prompt_4": "Label the following text as technology, religion, politics, sports, health, entertainment, or geography. Provide only the category as your "
"response. \n\ntext: {{headline}} \category: \n\n",
"response. \n\ntext: {{headline}} \\category: \n\n",
"prompt_5": f"You are tasked with performing topic classification on the following {lang} text. "
f"For each input, classify the topic as technology, business, politics, sports, health, entertainment, or religion. "
f"Use the following guidelines: \n\n "
......@@ -27,7 +27,7 @@ def prompt_func(mode, lang):
f"business: The text covers economy, business, or related topics. \n\n"
f"If the text contains multiple topics, choose the dominant topic. "
f"For ambiguous or unclear topics, select the category that best reflects the overall content. "
"Please provide a single classification for each input.\n\ntext: {{headline}} \category: \n\n",
"Please provide a single classification for each input.\n\ntext: {{headline}} \\category: \n\n",
}
return prompt_map[mode]
......
......@@ -17,9 +17,9 @@ def prompt_func(mode, lang):
"prompt_3": f"You are an assistant able to classify topics in texts. \n\n"
f"Given the categories science/technology, travel, politics, sports, health, entertainment, or geography; what is "
f"the topic of the {lang} statement below? Return only the category. "
"\n\ntext: {{text}} \category:\n\n",
"\n\ntext: {{text}} \\category:\n\n",
"prompt_4": "Label the following text as science/technology, travel, politics, sports, health, entertainment, or geography. Provide only the category as your "
"response. \n\ntext: {{text}} \category: \n\n",
"response. \n\ntext: {{text}} \\category: \n\n",
"prompt_5": f"You are tasked with performing topic classification on the following {lang} text. "
f"For each input, classify the topic as science/technology, travel, politics, sports, health, entertainment, or geography. "
f"Use the following guidelines: \n\n "
......@@ -32,7 +32,7 @@ def prompt_func(mode, lang):
f"geography: The text involves geographical information, locations, or related topics. \n\n"
f"If the text contains multiple topics, choose the dominant topic. "
f"For ambiguous or unclear topics, select the category that best reflects the overall content. "
"Please provide a single classification for each input.\n\ntext: {{text}} \category: \n\n",
"Please provide a single classification for each input.\n\ntext: {{text}} \\category: \n\n",
}
return prompt_map[mode]
......
......@@ -4,8 +4,6 @@ tag:
task: null
dataset_path: csebuetnlp/xlsum
dataset_name: null
dataset_kwargs:
trust_remote_code: true
output_type: generate_until
generation_kwargs:
until:
......
......@@ -4,8 +4,6 @@ tag:
task: null
dataset_path: csebuetnlp/xlsum
dataset_name: null
dataset_kwargs:
trust_remote_code: true
output_type: generate_until
generation_kwargs:
until:
......
......@@ -4,8 +4,6 @@ tag:
task: null
dataset_path: csebuetnlp/xlsum
dataset_name: null
dataset_kwargs:
trust_remote_code: true
output_type: generate_until
generation_kwargs:
until:
......
......@@ -47,7 +47,7 @@ def parse_math_answer(raw_string):
return retval
def get_answer_with_dollar_sign(s):
first_pattern = "\$(.*)\$"
first_pattern = r"\$(.*)\$"
last_match = None
matches = re.findall(first_pattern, s)
if matches:
......@@ -63,7 +63,7 @@ def parse_math_answer(raw_string):
if "\\n" in last_match:
last_match = last_match.split("\\n")[0]
else:
pattern = "(?:\\$)?\d+(?:\.\d+)?(?![\w\d])"
pattern = "(?:\\$)?\\d+(?:\\.\\d+)?(?![\\w\\d])"
matches = re.findall(pattern, s)
if matches:
last_match = matches[-1]
......@@ -186,7 +186,7 @@ def _strip_string(string):
# remove percentage
string = string.replace("\\%", "")
string = string.replace("\%", "")
string = string.replace(r"\%", "")
# " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
string = string.replace(" .", " 0.")
......
......@@ -15,5 +15,3 @@ metric_list:
higher_is_better: true
metadata:
version: 2.0
dataset_kwargs:
trust_remote_code: true
include: arithmetic_1dc.yaml
task: arithmetic_2da
dataset_name: arithmetic_2da
dataset_kwargs:
trust_remote_code: true
include: arithmetic_1dc.yaml
task: arithmetic_2dm
dataset_name: arithmetic_2dm
dataset_kwargs:
trust_remote_code: true
include: arithmetic_1dc.yaml
task: arithmetic_2ds
dataset_name: arithmetic_2ds
dataset_kwargs:
trust_remote_code: true
include: arithmetic_1dc.yaml
task: arithmetic_3da
dataset_name: arithmetic_3da
dataset_kwargs:
trust_remote_code: true
include: arithmetic_1dc.yaml
task: arithmetic_3ds
dataset_name: arithmetic_3ds
dataset_kwargs:
trust_remote_code: true
include: arithmetic_1dc.yaml
task: arithmetic_4da
dataset_name: arithmetic_4da
dataset_kwargs:
trust_remote_code: true
include: arithmetic_1dc.yaml
task: arithmetic_4ds
dataset_name: arithmetic_4ds
dataset_kwargs:
trust_remote_code: true
include: arithmetic_1dc.yaml
task: arithmetic_5da
dataset_name: arithmetic_5da
dataset_kwargs:
trust_remote_code: true
include: arithmetic_1dc.yaml
task: arithmetic_5ds
dataset_name: arithmetic_5ds
dataset_kwargs:
trust_remote_code: true
......@@ -41,13 +41,13 @@ fewshot_config:
target: Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15
dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. The final answer is 8
filter_list:
- filter:
- filter:
- function: regex
group_select: -1
regex_pattern: The final answer is ((-?[$0-9.,]{2,})|(-?[0-9]+))
- function: take_first
name: strict-match
- filter:
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
......@@ -62,11 +62,11 @@ generation_kwargs:
- </s>
- <|im_end|>
tag:
- chain_of_thought
- chain_of_thought
metadata:
version: 1.0
metric_list:
- aggregation: mean
- aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: false
......@@ -84,5 +84,3 @@ validation_split: validation
test_split: validation
should_decontaminate: true
doc_to_decontamination_query: "{{body}} {{question}}"
dataset_kwargs:
trust_remote_code: true
......@@ -12,5 +12,3 @@ metric_list:
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -23,5 +23,3 @@ metric_list:
higher_is_better: true
metadata:
version: 0.1
dataset_kwargs:
trust_remote_code: true
......@@ -53,4 +53,7 @@ None.
- [ ] Majority voting "without CoT"
### Changelog
no version change: changed dataset to `SaylorTwift/bbh`. Do not expect any change in the results.
- no version change: changed dataset to `SaylorTwift/bbh`. Do not expect any change in the results.
- `bbh_cot_fewshot` v.4.0; 2025-07-14:
  - PR #3140. Removed duplicate "Let's think step by step" from the fewshots.
  - Set `target_delimiter` to "" as the fewshot samples end with a newline character.
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment