"test/vscode:/vscode.git/clone" did not exist on "3a79613c28319030a5fe7fe22284b178f56984e1"
Commit f7b81bd4 authored by lintangsutawika
Browse files

modifications for current evals on t5v2

parent 032e879b
...@@ -399,7 +399,7 @@ def evaluate( ...@@ -399,7 +399,7 @@ def evaluate(
if type(items[0]) == tuple: if type(items[0]) == tuple:
numitem = len(items[0]) numitem = len(items[0])
if isinstance(items[0], (str, list)): if isinstance(items[0], (str, list, tuple)):
# handle the string case # handle the string case
gathered_items = [None] * lm.accelerator.num_processes gathered_items = [None] * lm.accelerator.num_processes
torch.distributed.all_gather_object(gathered_items, items) torch.distributed.all_gather_object(gathered_items, items)
...@@ -492,6 +492,8 @@ def evaluate( ...@@ -492,6 +492,8 @@ def evaluate(
]: ]:
stderr = "_stderr,".join(metric.split(",")) stderr = "_stderr,".join(metric.split(","))
stderr_score = results[task][stderr] stderr_score = results[task][stderr]
if isinstance(stderr_score, str):
stderr_score = 0
var_score = stderr_score**2 var_score = stderr_score**2
metric_score = results[task][metric] metric_score = results[task][metric]
......
...@@ -17,6 +17,7 @@ generation_kwargs: ...@@ -17,6 +17,7 @@ generation_kwargs:
- "</s>" - "</s>"
- "Q" - "Q"
- "\n\n" - "\n\n"
- "<0x0A>"
do_sample: false do_sample: false
temperature: 0.0 temperature: 0.0
filter_list: filter_list:
......
...@@ -14,6 +14,7 @@ generation_kwargs: ...@@ -14,6 +14,7 @@ generation_kwargs:
- "</s>" - "</s>"
- "Q" - "Q"
- "\n\n" - "\n\n"
- "<0x0A>"
do_sample: false do_sample: false
temperature: 0.0 temperature: 0.0
filter_list: filter_list:
......
...@@ -14,6 +14,7 @@ generation_kwargs: ...@@ -14,6 +14,7 @@ generation_kwargs:
- "</s>" - "</s>"
- "Q" - "Q"
- "\n\n" - "\n\n"
- "<0x0A>"
do_sample: false do_sample: false
temperature: 0.0 temperature: 0.0
num_fewshot: 0 num_fewshot: 0
......
...@@ -14,6 +14,7 @@ generation_kwargs: ...@@ -14,6 +14,7 @@ generation_kwargs:
- "</s>" - "</s>"
- "Q:" - "Q:"
- "\n\n" - "\n\n"
- "<0x0A>"
do_sample: false do_sample: false
temperature: 0.0 temperature: 0.0
num_fewshot: 0 num_fewshot: 0
......
group: flan_anli group: flan_anli
task: task:
- include: yaml_templates/held_in_template_yaml - include: yaml_templates/held_in_template_yaml
task: anli_r1 task: r1
dataset_path: anli dataset_path: anli
use_prompt: prompt_templates/anli.yaml:* use_prompt: prompt_templates/anli.yaml:*
validation_split: dev_r1 validation_split: dev_r1
- include: yaml_templates/held_in_template_yaml - include: yaml_templates/held_in_template_yaml
task: anli_r2 task: r2
dataset_path: anli dataset_path: anli
use_prompt: prompt_templates/anli.yaml:* use_prompt: prompt_templates/anli.yaml:*
validation_split: dev_r2 validation_split: dev_r2
- include: yaml_templates/held_in_template_yaml - include: yaml_templates/held_in_template_yaml
task: anli_r3 task: r3
dataset_path: anli dataset_path: anli
use_prompt: prompt_templates/anli.yaml:* use_prompt: prompt_templates/anli.yaml:*
validation_split: dev_r3 validation_split: dev_r3
group: flan_arc group: flan_arc
task: task:
- include: yaml_templates/held_in_template_yaml - include: yaml_templates/held_in_template_yaml
task: arc_easy
dataset_path: ai2_arc dataset_path: ai2_arc
dataset_name: ARC-Easy dataset_name: ARC-Easy
use_prompt: prompt_templates/arc.yaml:* use_prompt: prompt_templates/arc.yaml:*
validation_split: validation validation_split: validation
- include: yaml_templates/held_in_template_yaml - include: yaml_templates/held_in_template_yaml
task: arc_challenge
dataset_path: ai2_arc dataset_path: ai2_arc
dataset_name: ARC-Challenge dataset_name: ARC-Challenge
use_prompt: prompt_templates/arc.yaml:* use_prompt: prompt_templates/arc.yaml:*
......
group: flan_held_in group: flan_held_in
task: task:
- flan_boolq - include: yaml_templates/held_in_template_yaml
- flan_rte task: r1
- flan_anli dataset_path: anli
- flan_arc use_prompt: prompt_templates/anli.yaml:*
validation_split: dev_r1
- include: yaml_templates/held_in_template_yaml
task: r2
dataset_path: anli
use_prompt: prompt_templates/anli.yaml:*
validation_split: dev_r2
- include: yaml_templates/held_in_template_yaml
task: r3
dataset_path: anli
use_prompt: prompt_templates/anli.yaml:*
validation_split: dev_r3
- include: yaml_templates/held_in_template_yaml
dataset_path: ai2_arc
dataset_name: ARC-Easy
use_prompt: prompt_templates/arc.yaml:*
validation_split: validation
- include: yaml_templates/held_in_template_yaml
dataset_path: ai2_arc
dataset_name: ARC-Challenge
use_prompt: prompt_templates/arc.yaml:*
validation_split: validation
- include: yaml_templates/held_in_template_yaml
dataset_path: super_glue
dataset_name: boolq
use_prompt: prompt_templates/boolq.yaml:*
validation_split: validation
- include: yaml_templates/held_in_template_yaml
dataset_path: super_glue
dataset_name: rte
use_prompt: prompt_templates/rte.yaml:*
validation_split: validation
group: flan_held_in
task:
- include: flan/yaml_templates/held_in_template_yaml
dataset_path: super_glue
dataset_name: boolq
use_prompt: flan/prompt_templates/boolq.yaml:*
validation_split: validation
- include: flan/yaml_templates/held_in_template_yaml
dataset_path: super_glue
dataset_name: rte
use_prompt: flan/prompt_templates/rte.yaml:*
validation_split: validation
- include: flan/yaml_templates/held_in_template_yaml
task: anli_r1
dataset_path: anli
use_prompt: flan/prompt_templates/anli.yaml:*
validation_split: dev_r1
- include: flan/yaml_templates/held_in_template_yaml
task: anli_r2
dataset_path: anli
use_prompt: flan/prompt_templates/anli.yaml:*
validation_split: dev_r2
- include: flan/yaml_templates/held_in_template_yaml
task: anli_r3
dataset_path: anli
use_prompt: flan/prompt_templates/anli.yaml:*
validation_split: dev_r3
- include: flan/yaml_templates/held_in_template_yaml
task: arc_easy
dataset_path: ai2_arc
dataset_name: ARC-Easy
use_prompt: flan/prompt_templates/arc.yaml:*
validation_split: validation
- include: flan/yaml_templates/held_in_template_yaml
task: arc_challenge
dataset_path: ai2_arc
dataset_name: ARC-Challenge
use_prompt: flan/prompt_templates/arc.yaml:*
validation_split: validation
group: flan_held_out group: flan_held_out
task: task:
# BBH # BBH
- bbh_flan_zeroshot - bbh_zeroshot
- bbh_flan_fewshot - bbh_fewshot
- bbh_flan_cot_fewshot - bbh_cot_fewshot
- bbh_flan_cot_zeroshot - bbh_cot_zeroshot
# MMLU # MMLU
- mmlu - mmlu
- mmlu_flan_n_shot_generative - mmlu_flan_n_shot_generative
......
...@@ -8,6 +8,7 @@ doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}" ...@@ -8,6 +8,7 @@ doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
generation_kwargs: generation_kwargs:
until: until:
- "</s>" - "</s>"
- "<0x0A>"
metric_list: metric_list:
- metric: exact_match - metric: exact_match
aggregation: mean aggregation: mean
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment