Unverified commit 9f020aaf authored by Jess, committed by GitHub
Browse files

Merge pull request #3 from JessicaOjo/afrimgsm

add afrimgsm-direct
parents 816832f8 c4f634c6
!!@@##@@!! -- Example 0
If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?
Answer:5
Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
Answer:
!!@@##@@!! -- Example 1
Olivia has $23. She bought five bagels for $3 each. How much money does she have left?
Answer:8
A robe takes 2 bolts of blue fiber and half that much white fiber. How many bolts in total does it take?
Answer:
!!@@##@@!! -- Example 2
There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?
Answer:29
Josh decides to try flipping a house. He buys a house for $80,000 and then puts in $50,000 in repairs. This increased the value of the house by 150%. How much profit did he make?
Answer:
......@@ -58,6 +58,20 @@ def f1_score(items):
return np.max(fscore)
@register_aggregation("squad_f1")
def squad_f1_score(items):
    """Aggregate (reference, prediction) pairs into a corpus-level SQuAD F1.

    Args:
        items: iterable of ``(ref, pred)`` pairs where ``ref`` is the gold
            answer and ``pred`` is the model's (filtered) prediction; both
            are cast to ``str`` before scoring.

    Returns:
        The F1 score (float, 0-100 scale) reported by the HuggingFace
        ``squad`` metric.
    """
    gold_squad, pred_squad = [], []
    for index, (ref, pred) in enumerate(items):
        pred_dict = {'prediction_text': str(pred), 'id': str(index)}
        # The HF squad metric expects 'text' to be a LIST of acceptable
        # answer strings. Passing a bare string makes the metric iterate
        # over it character-by-character, silently corrupting the score.
        ref_dict = {'answers': {'answer_start': [0], 'text': [str(ref)]}, 'id': str(index)}
        gold_squad.append(ref_dict)
        pred_squad.append(pred_dict)
    squad_metric = hf_evaluate.load("squad")
    results_squad = squad_metric.compute(predictions=pred_squad, references=gold_squad)
    return results_squad['f1']
@register_aggregation("matthews_corrcoef")
def matthews_corrcoef(items):
unzipped_list = list(zip(*items))
......@@ -178,6 +192,16 @@ def exact_match_fn(**kwargs):
return exact_match.compute(**kwargs)
@register_metric(
    metric="squad",
    higher_is_better=True,
    output_type="generate_until",
    aggregation="squad_f1",
)
def squad_fn(items):
    """Identity passthrough: all scoring is deferred to the 'squad_f1' aggregation."""
    return items
@register_metric(
metric="perplexity",
higher_is_better=False,
......
......@@ -1294,6 +1294,7 @@ class ConfigurableTask(Task):
**({"f1": (gold, pred)} if "f1" in use_metric else {}),
**({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
**({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
**({"squad": (gold, pred)} if "squad" in use_metric else {}),
**({"exact_match": exact_match} if "exact_match" in use_metric else {}),
**(
{"brier_score": (gold, prob_norm)}
......@@ -1371,7 +1372,7 @@ class ConfigurableTask(Task):
predictions=[result],
**self._metric_fn_kwargs[metric],
)
except TypeError: # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
except TypeError as error: # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
result_score = self._metric_fn_list[metric]([gold, result])
if isinstance(result_score, dict):
# TODO: this handles the case where HF evaluate returns a dict.
......
......@@ -44,13 +44,53 @@ class RegexFilter(Filter):
filtered.append(match)
return filtered
# print(resps)
filtered_resps = list(map(lambda x: filter_set(x), resps))
# print(filtered_resps)
return filtered_resps
@register_filter("regex-numbers")
class RegexFilter(Filter):
    """Extract a number-like answer from each model response via regex.

    NOTE(review): this class re-uses the name ``RegexFilter`` and therefore
    shadows the class registered as "regex" earlier in this module at the
    module level. It is only looked up through the filter registry, so the
    name is kept here for interface stability, but renaming it (e.g. to
    ``RegexNumbersFilter``) would avoid the clash.
    """

    def __init__(
        self,
        regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
        group_select=0,
        # Default was the int ``0``, contradicting the ``str`` annotation;
        # use the string "0" so the fallback has the same type as extracted
        # matches (downstream comparisons cast to str, so this is safe).
        fallback: str = "0",
    ) -> None:
        """
        pass a string `regex` to run `re.compile(r"regex")` on.
        `fallback` defines the output returned if no matches for the regex are located.
        """
        self.regex_pattern = regex_pattern
        self.regex = re.compile(regex_pattern)
        self.group_select = group_select
        self.fallback = fallback

    def apply(self, resps, docs):
        # here, we assume we have a list, in which each element is
        # a list of model responses for some particular input/target pair.
        # so we process each of these (same input/target response sets)
        # independently (and keep them a list.)
        def filter_set(inst):
            filtered = []
            for resp in inst:
                match = self.regex.findall(resp)
                if match:
                    match = match[self.group_select]
                    if isinstance(match, tuple):
                        # multiple capture groups: keep the first non-empty one
                        match = [m for m in match if m][0]
                    # Normalize: strip whitespace, drop thousands separators
                    # and periods. NOTE(review): this also removes decimal
                    # points ("3.5" -> "35") -- presumably gold answers are
                    # integers; confirm against the dataset.
                    match = match.strip().replace(',', '').replace('.', '')
                else:
                    match = self.fallback
                filtered.append(match)
            return filtered

        return [filter_set(inst) for inst in resps]
@register_filter("remove_whitespace")
class WhitespaceFilter(Filter):
""" """
......
#!/bin/bash
# Sweep a set of instruction-tuned models over the AfriMGSM "direct" tasks
# at several few-shot settings, writing results under results/<shot>/<model>.

models=(
  "masakhane/African-ultrachat-alpaca"
  "masakhane/zephyr-7b-gemma-sft-african-alpaca"
  "masakhane/zephyr-7b-gemma-sft-african-ultrachat-5k"
  "google/flan-t5-xxl"
  "bigscience/mt0-xxl-mt"
  "CohereForAI/aya-101"
  "bigscience/bloomz-7b1-mt"
  "meta-llama/Llama-2-7b-chat-hf"
  "meta-llama/Meta-Llama-3-8B-Instruct"
  "meta-llama/Meta-Llama-3-70B-Instruct"
  "google/gemma-1.1-7b-it"
  "RWKV/v5-EagleX-v2-7B-HF"
  "RWKV/rwkv-6-world-7b"
)

task=afrimgsm_direct_amh,afrimgsm_direct_ibo,afrimgsm_direct_fra,afrimgsm_direct_sna,afrimgsm_direct_lin,afrimgsm_direct_wol,afrimgsm_direct_ewe,afrimgsm_direct_lug,afrimgsm_direct_xho,afrimgsm_direct_kin,afrimgsm_direct_twi,afrimgsm_direct_zul,afrimgsm_direct_orm,afrimgsm_direct_yor,afrimgsm_direct_hau,afrimgsm_direct_sot,afrimgsm_direct_swa

for model in "${models[@]}"
do
  echo "Evaluating model: $model"
  for fewshot in 0 2 4 6 8
  do
    # ${model##*/} strips the HF org prefix, e.g. "google/flan-t5-xxl" -> "flan-t5-xxl"
    export OUTPUT_DIR=results/$fewshot/${model##*/}
    mkdir -p "$OUTPUT_DIR"
    # BUG FIX: OUTPUT_DIR was created but never passed to lm_eval, so
    # results were not written to the per-model/per-shot directory.
    lm_eval --model hf \
      --model_args "pretrained=${model}" \
      --tasks "$task" \
      --device cuda:0 \
      --batch_size 16 \
      --num_fewshot "$fewshot" \
      --output_path "$OUTPUT_DIR" \
      --verbosity DEBUG
  done
done
\ No newline at end of file
# Shared configuration for the AfriMGSM "direct" task family; the
# per-language task files include this via `include: afrimgsm_common_yaml`.
group: mgsm_direct
task: afrimgsm_direct
dataset_path: masakhane/afrimgsm
output_type: generate_until
training_split: train
test_split: test
fewshot_split: train
# Empty delimiter: the prompt template already ends in "Answer:".
target_delimiter: ""
# Prefer the free-text `answer` field; otherwise fall back to the numeric
# answer. NOTE(review): this fallback uses `answer_number|string` while the
# generated per-language files override with `answer_number|int` -- confirm
# which is intended.
doc_to_target: '{% if answer is not none %}{{answer}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}'
generation_kwargs:
  do_sample: false
  until:
    - 'Question:'
    - </s>
    - <|im_end|>
filter_list:
  # Extract the last number-like match from the generation, then keep only
  # the first response per document.
  - filter:
      - function: regex-numbers
        group_select: -1
        regex_pattern: (-?[0-9.,]{2,})|(-?[0-9]+)
      - function: take_first
    name: flexible-extract
metric_list:
  - metric: squad
    aggregation: squad_f1
    average: weighted
    hf_evaluate: False
    higher_is_better: True
metadata:
  version: 1.0
# NOTE(review): the chunks below are separate auto-generated per-language
# task files concatenated by the diff view; each begins at a
# "# Generated by utils.py" line. All override doc_to_target with
# `answer_number|int` (vs `|string` in the common file) except the `eng`
# file, which inherits everything from the include -- confirm both
# discrepancies are intentional.
# Generated by utils.py
dataset_name: amh
doc_to_target: '{% if answer is not none %}{{answer}}{% else %}{{answer_number|int}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}'
generation_kwargs:
  do_sample: false
  until:
    - 'Question:'
    - </s>
    - <|im_end|>
include: afrimgsm_common_yaml
task: afrimgsm_direct_amh
# Generated by utils.py
dataset_name: eng
include: afrimgsm_common_yaml
task: afrimgsm_direct_eng
# Generated by utils.py
dataset_name: ewe
doc_to_target: '{% if answer is not none %}{{answer}}{% else %}{{answer_number|int}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}'
generation_kwargs:
  do_sample: false
  until:
    - 'Question:'
    - </s>
    - <|im_end|>
include: afrimgsm_common_yaml
task: afrimgsm_direct_ewe
# Generated by utils.py
dataset_name: fra
doc_to_target: '{% if answer is not none %}{{answer}}{% else %}{{answer_number|int}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}'
generation_kwargs:
  do_sample: false
  until:
    - 'Question:'
    - </s>
    - <|im_end|>
include: afrimgsm_common_yaml
task: afrimgsm_direct_fra
# Generated by utils.py
dataset_name: hau
doc_to_target: '{% if answer is not none %}{{answer}}{% else %}{{answer_number|int}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}'
generation_kwargs:
  do_sample: false
  until:
    - 'Question:'
    - </s>
    - <|im_end|>
include: afrimgsm_common_yaml
task: afrimgsm_direct_hau
# Generated by utils.py
dataset_name: ibo
doc_to_target: '{% if answer is not none %}{{answer}}{% else %}{{answer_number|int}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}'
generation_kwargs:
  do_sample: false
  until:
    - 'Question:'
    - </s>
    - <|im_end|>
include: afrimgsm_common_yaml
task: afrimgsm_direct_ibo
# Generated by utils.py
dataset_name: kin
doc_to_target: '{% if answer is not none %}{{answer}}{% else %}{{answer_number|int}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}'
generation_kwargs:
  do_sample: false
  until:
    - 'Question:'
    - </s>
    - <|im_end|>
include: afrimgsm_common_yaml
task: afrimgsm_direct_kin
# Generated by utils.py
dataset_name: lin
doc_to_target: '{% if answer is not none %}{{answer}}{% else %}{{answer_number|int}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}'
generation_kwargs:
  do_sample: false
  until:
    - 'Question:'
    - </s>
    - <|im_end|>
include: afrimgsm_common_yaml
task: afrimgsm_direct_lin
# Generated by utils.py
dataset_name: lug
doc_to_target: '{% if answer is not none %}{{answer}}{% else %}{{answer_number|int}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}'
generation_kwargs:
  do_sample: false
  until:
    - 'Question:'
    - </s>
    - <|im_end|>
include: afrimgsm_common_yaml
task: afrimgsm_direct_lug
# Generated by utils.py
dataset_name: orm
doc_to_target: '{% if answer is not none %}{{answer}}{% else %}{{answer_number|int}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}'
generation_kwargs:
  do_sample: false
  until:
    - 'Question:'
    - </s>
    - <|im_end|>
include: afrimgsm_common_yaml
task: afrimgsm_direct_orm
# Generated by utils.py
dataset_name: sna
doc_to_target: '{% if answer is not none %}{{answer}}{% else %}{{answer_number|int}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}'
generation_kwargs:
  do_sample: false
  until:
    - 'Question:'
    - </s>
    - <|im_end|>
include: afrimgsm_common_yaml
task: afrimgsm_direct_sna
# Generated by utils.py
dataset_name: sot
doc_to_target: '{% if answer is not none %}{{answer}}{% else %}{{answer_number|int}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}'
generation_kwargs:
  do_sample: false
  until:
    - 'Question:'
    - </s>
    - <|im_end|>
include: afrimgsm_common_yaml
task: afrimgsm_direct_sot
# Generated by utils.py
dataset_name: swa
doc_to_target: '{% if answer is not none %}{{answer}}{% else %}{{answer_number|int}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}'
generation_kwargs:
  do_sample: false
  until:
    - 'Question:'
    - </s>
    - <|im_end|>
include: afrimgsm_common_yaml
task: afrimgsm_direct_swa
# Generated by utils.py
dataset_name: twi
doc_to_target: '{% if answer is not none %}{{answer}}{% else %}{{answer_number|int}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}'
generation_kwargs:
  do_sample: false
  until:
    - 'Question:'
    - </s>
    - <|im_end|>
include: afrimgsm_common_yaml
task: afrimgsm_direct_twi
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment