Unverified Commit 4f27f0b9 authored by Hailey Schoelkopf's avatar Hailey Schoelkopf Committed by GitHub
Browse files

Merge pull request #774 from EleutherAI/speedup-hellaswag

[Refactor] Speedup hellaswag context building
parents dbf2c083 8189eb18
...@@ -7,9 +7,10 @@ output_type: multiple_choice ...@@ -7,9 +7,10 @@ output_type: multiple_choice
training_split: train training_split: train
validation_split: validation validation_split: validation
test_split: null test_split: null
doc_to_text: "{% set text = activity_label ~ ': ' ~ ctx_a ~ ' ' ~ ctx_b.capitalize() %}{{text|trim|replace(' [title]', '. ')|regex_replace('\\[.*?\\]', '')|replace(' ', ' ')}}" process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{label}}" doc_to_target: "{{label}}"
doc_to_choice: "{{endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', ' ', ' ')|list}}" doc_to_choice: "{{choices}}"
metric_list: metric_list:
- metric: acc - metric: acc
aggregation: mean aggregation: mean
......
import datasets
import re
def preprocess(text):
text = text.strip()
# NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
text = text.replace(" [title]", ". ")
text = re.sub("\\[.*?\\]", "", text)
text = text.replace(" ", " ")
return text
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
def _process_doc(doc):
ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
out_doc = {
"query": preprocess(doc["activity_label"] + ": " + ctx),
"choices": [preprocess(ending) for ending in doc["endings"]],
"gold": int(doc["label"]),
}
return out_doc
return dataset.map(_process_doc)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment