Commit 470059f6 authored by lintangsutawika

merge conflict

parents b8d7d6c3 9d030712
# Generated by utils.py
dataset_name: what_is_the_tao_zero_shot
include: ../multiple_choice_template_yaml
task: bigbench_what_is_the_tao_multiple_choice
# Generated by utils.py
dataset_name: which_wiki_edit_zero_shot
include: ../multiple_choice_template_yaml
task: bigbench_which_wiki_edit_multiple_choice
# Generated by utils.py
dataset_name: winowhy_zero_shot
include: ../multiple_choice_template_yaml
task: bigbench_winowhy_multiple_choice
# Generated by utils.py
dataset_name: word_sorting_zero_shot
include: ../multiple_choice_template_yaml
task: bigbench_word_sorting_multiple_choice
# Generated by utils.py
dataset_name: word_unscrambling_zero_shot
include: ../multiple_choice_template_yaml
task: bigbench_word_unscrambling_multiple_choice
group: bigbench_multiple_choice
dataset_path: hails/bigbench
dataset_kwargs:
# num_shots: 0 # TODO: the number of shots for the `bigbench` HF dataset should be controlled through this key, not through the harness's usual fewshot mechanism
# subtask_name: null
output_type: multiple_choice
test_split: default
doc_to_text: inputs
doc_to_target: "{{multiple_choice_targets.index(targets[0])}}"
doc_to_choice: "{{multiple_choice_targets}}"
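# Illustrative example (assumed field shapes): for a doc such as
#   targets: ["yes"]
#   multiple_choice_targets: ["no", "yes"]
# doc_to_target renders to 1 (the index of the gold answer among the choices)
# and doc_to_choice to the full option list ["no", "yes"].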
metric_list:
- metric: acc
# TODO: brier score and other metrics
"""
A utility script that pushes all BIG-bench subtasks, in their form from the `bigbench` HF dataset,
into `{org name}/bigbench`.
Prior to running, log into the HF Hub for the target org via `huggingface-cli login`.
Requires
`pip install "bigbench @ https://storage.googleapis.com/public_research_data/bigbench/bigbench-0.0.1.tar.gz"`.
The script is included here so that the evaluation harness itself can avoid a dependency on `bigbench`.
"""
from tqdm import tqdm
import datasets
import bigbench.api.util as bb_utils
all_task_names = bb_utils.get_all_json_task_names()
num_shots = [0]
for shots in num_shots:
for task_name in tqdm(all_task_names):
try:
print(f"Loading '{task_name}' with num_shots={shots}...")
task_ds = datasets.load_dataset("bigbench", name=task_name, num_shots=shots)
print(f"Pushing '{task_name}' with num_shots={shots}...")
task_ds.push_to_hub("hails/bigbench", task_name + "_zero_shot")
del task_ds
        except Exception:
            # Fail loudly: a broken subtask should abort the whole run.
            raise
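# Illustrative usage (assumed names): once pushed, a subtask can be loaded back
# without the `bigbench` dependency, e.g.
#   ds = datasets.load_dataset("hails/bigbench", "winowhy_zero_shot")
# The task YAMLs above then read examples from the dataset's "default" split.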
#!/usr/bin/python
import os
import re
import sys
import math
import subprocess
import xml.sax.saxutils
from typing import List, Pattern, Tuple, Union, Dict, Any, Optional
"""
This script was adapted from the original version by hieuhoang1972 which is part of MOSES.
"""
# $Id: bleu.py 1307 2007-03-14 22:22:36Z hieuhoang1972 $
"""Provides:
cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test().
cook_test(test, item, n=4): Transform a test sentence as a string (together with the cooked reference sentences produced by cook_refs()) into a form usable by score_cooked().
score_cooked(allcomps, n=4): Score a list of cooked test sentences.
The reason for breaking the BLEU computation into three phases cook_refs(), cook_test(), and score_cooked() is to allow the caller to calculate BLEU scores for multiple test sets as efficiently as possible.
"""
# Added to bypass NIST-style pre-processing of hyp and ref files -- wade
nonorm = 0
preserve_case = False
eff_ref_len = "shortest"
normalize1: List[Tuple[Union[Pattern[str], str], str]] = [
("<skipped>", ""), # strip "skipped" tags
(r"-\n", ""), # strip end-of-line hyphenation and join lines
(r"\n", " "), # join lines
# (r'(\d)\s+(?=\d)', r'\1'), # join digits
]
normalize1 = [(re.compile(pattern), replace) for (pattern, replace) in normalize1]
normalize2: List[Tuple[Union[Pattern[str], str], str]] = [
(
r"([\{-\~\[-\` -\&\(-\+\:-\@\/])",
r" \1 ",
), # tokenize punctuation. apostrophe is missing
(
r"([^0-9])([\.,])",
r"\1 \2 ",
), # tokenize period and comma unless preceded by a digit
(
r"([\.,])([^0-9])",
r" \1 \2",
), # tokenize period and comma unless followed by a digit
(r"([0-9])(-)", r"\1 \2 "), # tokenize dash when preceded by a digit
]
normalize2 = [(re.compile(pattern), replace) for (pattern, replace) in normalize2]
def normalize(s):
"""Normalize and tokenize text. This is lifted from NIST mteval-v11a.pl."""
# Added to bypass NIST-style pre-processing of hyp and ref files -- wade
if nonorm:
return s.split()
if type(s) is not str:
s = " ".join(s)
# language-independent part:
for (pattern, replace) in normalize1:
s = re.sub(pattern, replace, s)
s = xml.sax.saxutils.unescape(s, {"&quot;": '"'})
# language-dependent part (assuming Western languages):
s = " %s " % s
if not preserve_case:
s = s.lower() # this might not be identical to the original
for (pattern, replace) in normalize2:
s = re.sub(pattern, replace, s)
return s.split()
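# For example (illustrative), normalize("Hello, world.") lower-cases and
# tokenizes punctuation, yielding ["hello", ",", "world", "."].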
def count_ngrams(words, n=4):
counts: Dict[Any, int] = {}
for k in range(1, n + 1):
for i in range(len(words) - k + 1):
ngram = tuple(words[i : i + k])
counts[ngram] = counts.get(ngram, 0) + 1
return counts
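# For example (illustrative), count_ngrams("the cat the".split(), n=2) returns
# {("the",): 2, ("cat",): 1, ("the", "cat"): 1, ("cat", "the"): 1}.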
def cook_refs(refs, n=4):
"""Takes a list of reference sentences for a single segment
and returns an object that encapsulates everything that BLEU
needs to know about them."""
refs = [normalize(ref) for ref in refs]
    maxcounts: Dict[Tuple[str, ...], int] = {}
for ref in refs:
counts = count_ngrams(ref, n)
for (ngram, count) in counts.items():
maxcounts[ngram] = max(maxcounts.get(ngram, 0), count)
return ([len(ref) for ref in refs], maxcounts)
def cook_test(test, item, n=4):
"""Takes a test sentence and returns an object that
encapsulates everything that BLEU needs to know about it."""
(reflens, refmaxcounts) = item
test = normalize(test)
result: Dict[str, Any] = {}
result["testlen"] = len(test)
# Calculate effective reference sentence length.
if eff_ref_len == "shortest":
result["reflen"] = min(reflens)
elif eff_ref_len == "average":
result["reflen"] = float(sum(reflens)) / len(reflens)
elif eff_ref_len == "closest":
min_diff: Optional[int] = None
for reflen in reflens:
if min_diff is None or abs(reflen - len(test)) < min_diff:
min_diff = abs(reflen - len(test))
result["reflen"] = reflen
result["guess"] = [max(len(test) - k + 1, 0) for k in range(1, n + 1)]
result["correct"] = [0] * n
counts = count_ngrams(test, n)
for (ngram, count) in counts.items():
result["correct"][len(ngram) - 1] += min(refmaxcounts.get(ngram, 0), count)
return result
def score_cooked(allcomps, n=4, ground=0, smooth=1):
totalcomps: Dict[str, Any] = {
"testlen": 0,
"reflen": 0,
"guess": [0] * n,
"correct": [0] * n,
}
for comps in allcomps:
for key in ["testlen", "reflen"]:
totalcomps[key] += comps[key]
for key in ["guess", "correct"]:
for k in range(n):
totalcomps[key][k] += comps[key][k]
logbleu = 0.0
all_bleus: List[float] = []
for k in range(n):
correct = totalcomps["correct"][k]
guess = totalcomps["guess"][k]
addsmooth = 0
if smooth == 1 and k > 0:
addsmooth = 1
logbleu += math.log(correct + addsmooth + sys.float_info.min) - math.log(
guess + addsmooth + sys.float_info.min
)
if guess == 0:
all_bleus.append(-10000000.0)
else:
all_bleus.append(math.log(correct + sys.float_info.min) - math.log(guess))
logbleu /= float(n)
all_bleus.insert(0, logbleu)
brevPenalty = min(
0, 1 - float(totalcomps["reflen"] + 1) / (totalcomps["testlen"] + 1)
)
for i in range(len(all_bleus)):
if i == 0:
all_bleus[i] += brevPenalty
all_bleus[i] = math.exp(all_bleus[i])
return all_bleus
def bleu(refs, candidate, ground=0, smooth=1):
refs = cook_refs(refs)
test = cook_test(candidate, refs)
return score_cooked([test], ground=ground, smooth=smooth)
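# Illustrative usage: bleu() returns a list whose first entry is the overall
# smoothed BLEU (with the brevity penalty applied), followed by the per-order
# n-gram precisions; an exact match scores 1.0, e.g.
#   bleu(["the cat sat on the mat"], "the cat sat on the mat")[0]  # -> 1.0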
def splitPuncts(line):
return " ".join(re.findall(r"[\w]+|[^\s\w]", line))
def computeMaps(predictions, goldfile):
    predictionMap: Dict[str, list] = {}
    goldMap: Dict[str, list] = {}
    for row in predictions:
        cols = row.strip().split("\t")
        if len(cols) == 1:
            (rid, pred) = (cols[0], "")
        else:
            (rid, pred) = (cols[0], cols[1])
        predictionMap[rid] = [splitPuncts(pred.strip().lower())]
    # Read the gold file via a context manager so the handle is always closed.
    with open(goldfile, "r") as gf:
        for row in gf:
            (rid, pred) = row.split("\t")
            if rid in predictionMap:  # Only insert if the id exists for the method
                if rid not in goldMap:
                    goldMap[rid] = []
                goldMap[rid].append(splitPuncts(pred.strip().lower()))
    sys.stderr.write("Total: " + str(len(goldMap)) + "\n")
    return (goldMap, predictionMap)
# m1 is the reference map
# m2 is the prediction map
def bleuFromMaps(m1, m2):
score = [0] * 5
num = 0.0
for key in m1:
if key in m2:
bl = bleu(m1[key], m2[key][0])
score = [score[i] + bl[i] for i in range(0, len(bl))]
num += 1
return [s * 100.0 / num for s in score]
def smoothed_bleu_4(references, predictions, **kwargs):
predictionMap = {}
goldMap = {}
for rid, pred in enumerate(predictions):
predictionMap[rid] = [splitPuncts(pred.strip().lower())]
for rid, row in enumerate(references):
goldMap[rid] = [splitPuncts(row.strip().lower())]
return bleuFromMaps(goldMap, predictionMap)[0]
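# Illustrative call (assumed inputs): the task YAMLs below register this via
# `metric: !function bleu.smoothed_bleu_4`; given parallel lists, e.g.
#   smoothed_bleu_4(["adds two numbers ."], ["add two numbers ."])
# it returns the corpus-level smoothed BLEU-4 scaled by 100.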
if __name__ == "__main__":
reference_file = sys.argv[1]
predictions = []
for row in sys.stdin:
predictions.append(row)
(goldMap, predictionMap) = computeMaps(predictions, reference_file)
print(bleuFromMaps(goldMap, predictionMap)[0])
group:
- codexglue_code2text
task: code2text_go
dataset_path: CM/codexglue_code2text_go
training_split: train
validation_split: validation
test_split: test
output_type: generate_until
generation_kwargs:
num_beams: 10
max_length: 128
until:
- "</s>"
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
metric_list:
- metric: !function bleu.smoothed_bleu_4
aggregation: mean
higher_is_better: True
group:
- codexglue_code2text
task: code2text_java
dataset_path: CM/codexglue_code2text_java
training_split: train
validation_split: validation
test_split: test
output_type: generate_until
generation_kwargs:
num_beams: 10
max_length: 128
until:
- "</s>"
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
metric_list:
- metric: !function bleu.smoothed_bleu_4
aggregation: mean
higher_is_better: True
group:
- codexglue_code2text
task: code2text_javascript
dataset_path: CM/codexglue_code2text_javascript
training_split: train
validation_split: validation
test_split: test
output_type: generate_until
generation_kwargs:
num_beams: 10
max_length: 128
until:
- "</s>"
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
metric_list:
- metric: !function bleu.smoothed_bleu_4
aggregation: mean
higher_is_better: True
group:
- codexglue_code2text
task: code2text_php
dataset_path: CM/codexglue_code2text_php
training_split: train
validation_split: validation
test_split: test
output_type: generate_until
generation_kwargs:
num_beams: 10
max_length: 128
until:
- "</s>"
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
metric_list:
- metric: !function bleu.smoothed_bleu_4
aggregation: mean
higher_is_better: True
group:
- codexglue_code2text
task: code2text_python
dataset_path: CM/codexglue_code2text_python
training_split: train
validation_split: validation
test_split: test
output_type: generate_until
generation_kwargs:
num_beams: 10
max_length: 128
until:
- "</s>"
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
metric_list:
- metric: !function bleu.smoothed_bleu_4
aggregation: mean
higher_is_better: True
group:
- codexglue_code2text
task: code2text_ruby
dataset_path: CM/codexglue_code2text_ruby
training_split: train
validation_split: validation
test_split: test
output_type: generate_until
generation_kwargs:
num_beams: 10
max_length: 128
until:
- "</s>"
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
metric_list:
- metric: !function bleu.smoothed_bleu_4
aggregation: mean
higher_is_better: True
def doc_to_text(doc):
    # Flatten the tokenized source code into a single whitespace-normalized line.
    inputs = " ".join(doc["code_tokens"]).replace("\n", " ")
    inputs = " ".join(inputs.strip().split())
    return inputs
def doc_to_target(doc):
    # Flatten the tokenized docstring into a single whitespace-normalized line.
    targets = " ".join(doc["docstring_tokens"]).replace("\n", "")
    targets = " ".join(targets.strip().split())
    return targets
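# Illustrative (assumed doc shape): for
#   doc = {"code_tokens": ["def", "add", "(", "a", ",", "b", ")", ":"],
#          "docstring_tokens": ["Add", "two", "numbers", "."]}
# doc_to_text(doc) -> "def add ( a , b ) :"
# doc_to_target(doc) -> "Add two numbers ."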
task: coqa
dataset_path: EleutherAI/coqa
-output_type: greedy_until
+output_type: generate_until
training_split: train
validation_split: validation
doc_to_text: !function utils.doc_to_text
......
task: drop
dataset_path: EleutherAI/drop
-output_type: greedy_until
+output_type: generate_until
training_split: train
validation_split: validation
process_docs: !function utils.process_docs
......
@@ -3,7 +3,7 @@ group:
task: gsm8k_cot
dataset_path: gsm8k
dataset_name: main
-output_type: greedy_until
+output_type: generate_until
test_split: test
doc_to_text: "Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\n\nA: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.\n\n\
Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\n\nA: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.\n\n\
@@ -14,8 +14,7 @@ Q: There were nine computers in the server room. Five more computers were instal
Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?\n\nA: Michael started with 58 golf balls. After losing 23 on tuesday, he had 58 - 23 = 35. After losing 2 more, he had 35 - 2 = 33 golf balls. The answer is 33.\n\n\
Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?\n\nA: Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15 dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. The answer is 8.\n\n\
Q: {{question}}\n\nA:"
-doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}"
-gold_alias: "{{answer.split('### ')[-1].rstrip()}}" # this post-processes the reference that we'll score against
+doc_to_target: " {{answer.split('### ')[-1].rstrip()}}"
metric_list:
- metric: exact_match
aggregation: mean
@@ -25,6 +24,8 @@ metric_list:
regexes_to_ignore:
- ","
- "\\$"
- "(?s).*#### "
- "\n\n"
generation_kwargs:
until:
- "Q:"
@@ -37,5 +38,5 @@ filter_list:
- name: "get-answer"
filter:
- function: "regex"
-regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
+regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)."
- function: "take_first"
group:
- math_word_problems
-task: gsm8k_yaml
+task: gsm8k
dataset_path: gsm8k
dataset_name: main
-output_type: greedy_until
+output_type: generate_until
training_split: train
fewshot_split: train
test_split: test
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}"
-gold_alias: "{{answer.split('### ')[-1].rstrip()}}" # this post-processes the reference that we'll score against
metric_list:
- metric: exact_match
aggregation: mean
@@ -19,7 +18,7 @@ metric_list:
regexes_to_ignore:
- ","
- "\\$"
- ".*### "
- "(?s).*#### "
generation_kwargs:
until:
- "\n\n"
@@ -28,9 +27,9 @@ generation_kwargs:
temperature: 0.0
repeats: 1
num_fewshot: 5
-# filter_list:
-# - name: "get-answer"
-# filter:
-# - function: "regex"
-# regex_pattern: "### (\\-?[0-9\\.\\,]+)"
-# - function: "take_first"
+filter_list:
+- name: "get-answer"
+  filter:
+  - function: "regex"
+    regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
+  - function: "take_first"
@@ -9,7 +9,6 @@
# template_aliases: #"{% set answer_choices = range(1, 11)|list %}"
# doc_to_text: 'Activity: "{{activity}}"\nRating:'
# doc_to_target: "{{answer_choices[label]}}"
-# gold_alias: "{{label}}" # this will be cast to an int.
# metric_list:
# - metric: acc
# TODO: we want this to be implemented as a winograd_schema task type, actually