"include/vscode:/vscode.git/clone" did not exist on "b3084bca82917d7b50881eb5fd2dcfbdf58c231a"
Commit e5161a6d authored by lintangsutawika's avatar lintangsutawika
Browse files

changes

parent 57f08e40
...@@ -61,21 +61,30 @@ def include_benchmarks(task_dir, benchmark_dir="benchmarks"): ...@@ -61,21 +61,30 @@ def include_benchmarks(task_dir, benchmark_dir="benchmarks"):
if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0): if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0):
for f in file_list: for f in file_list:
if f.endswith(".yaml"): if f.endswith(".yaml"):
benchmark_path = os.path.join(root, f) try:
benchmark_path = os.path.join(root, f)
with open(benchmark_path, "rb") as file:
yaml_config = yaml.full_load(file) with open(benchmark_path, "rb") as file:
yaml_config = yaml.full_load(file)
assert "group" in yaml_config
group = yaml_config["group"] assert "group" in yaml_config
task_list = yaml_config["task"] group = yaml_config["group"]
task_names = utils.pattern_match(task_list, ALL_TASKS) task_list = yaml_config["task"]
for task in task_names: task_names = utils.pattern_match(task_list, ALL_TASKS)
if task in TASK_REGISTRY: for task in task_names:
if group in GROUP_REGISTRY: if task in TASK_REGISTRY:
GROUP_REGISTRY[group].append(task) if group in GROUP_REGISTRY:
else: GROUP_REGISTRY[group].append(task)
GROUP_REGISTRY[group] = [task] else:
GROUP_REGISTRY[group] = [task]
ALL_TASKS.add(group)
except Exception as error:
eval_logger.warning(
"Failed to load benchmark in\n"
f" {benchmark_path}\n"
" Benchmark will not be added to registry\n"
f" Error: {error}"
)
task_dir = os.path.dirname(os.path.abspath(__file__)) + "/" task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
......
...@@ -8,6 +8,6 @@ task: ...@@ -8,6 +8,6 @@ task:
- winogrande - winogrande
- arc_challenge - arc_challenge
- arc_easy - arc_easy
- logiqa # - logiqa
- blimp_* # - blimp_*
- hendrycksTest* # - hendrycksTest*
group:
- super-glue-lm-eval-v1
task: winogrande task: winogrande
dataset_path: winogrande dataset_path: winogrande
dataset_name: winogrande_xl dataset_name: winogrande_xl
output_type: multiple_choice output_type: multiple_choice
should_decontaminate: true
doc_to_decontamination_query: "{{sentence}}"
training_split: train training_split: train
validation_split: validation validation_split: validation
doc_to_text: !function preprocess_winogrande.doc_to_text
doc_to_target: !function preprocess_winogrande.doc_to_target
doc_to_choice: !function preprocess_winogrande.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: sentence
metric_list: metric_list:
- metric: exact_match - metric: acc
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
ignore_case: true
ignore_punctuation: true
import re
from lm_eval.utils import general_detokenize
def partial_context(doc, option):
    """Return the sentence up to its "_" placeholder, followed by *option*.

    Args:
        doc: Mapping whose "sentence" entry is a string containing a "_"
            placeholder.
        option: Text substituted at the position of the placeholder.

    Returns:
        Everything before the first "_" in ``doc["sentence"]`` with
        *option* appended. Text after the placeholder is dropped.

    Raises:
        ValueError: If ``doc["sentence"]`` contains no "_" (raised by
            ``str.index``).
    """
    sentence = doc["sentence"]
    # Substitute the pronoun in the sentence with the specified option
    # and ignore everything after.
    pronoun_loc = sentence.index("_")
    return sentence[:pronoun_loc] + option
def partial_target(doc):
    """Return the text that follows the "_" placeholder in the sentence.

    Args:
        doc: Mapping whose "sentence" entry is a string containing a "_"
            placeholder.

    Returns:
        A single space followed by the whitespace-stripped text after the
        first "_" in ``doc["sentence"]``. When nothing follows the
        placeholder the result is just ``" "``.

    Raises:
        ValueError: If ``doc["sentence"]`` contains no "_" (raised by
            ``str.index``).
    """
    sentence = doc["sentence"]
    # The target is everything after the document specified pronoun;
    # the +1 skips over the "_" character itself.
    pronoun_loc = sentence.index("_") + 1
    return " " + sentence[pronoun_loc:].strip()
# Configuration for the `winogrande` task (multiple-choice output).
task: winogrande
dataset_path: winogrande
dataset_name: winogrande_xl
output_type: multiple_choice
training_split: train
validation_split: validation
# Prompt text, target and answer choices are each produced by a callable
# referenced from `preprocess_winogrande` through the `!function` tag.
doc_to_text: !function preprocess_winogrande.doc_to_text
doc_to_target: !function preprocess_winogrande.doc_to_target
doc_to_choice: !function preprocess_winogrande.doc_to_choice
# Each list entry describes one metric; `aggregation` and
# `higher_is_better` are fields of that entry, so they are nested under it.
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment