Commit 470059f6 authored by lintangsutawika

merge conflict

parents b8d7d6c3 9d030712
# Generated by utils.py
dataset_name: what_is_the_tao_zero_shot
include: ../multiple_choice_template_yaml
task: bigbench_what_is_the_tao_multiple_choice
# Generated by utils.py
dataset_name: which_wiki_edit_zero_shot
include: ../multiple_choice_template_yaml
task: bigbench_which_wiki_edit_multiple_choice
# Generated by utils.py
dataset_name: winowhy_zero_shot
include: ../multiple_choice_template_yaml
task: bigbench_winowhy_multiple_choice
# Generated by utils.py
dataset_name: word_sorting_zero_shot
include: ../multiple_choice_template_yaml
task: bigbench_word_sorting_multiple_choice
# Generated by utils.py
dataset_name: word_unscrambling_zero_shot
include: ../multiple_choice_template_yaml
task: bigbench_word_unscrambling_multiple_choice
group: bigbench_multiple_choice
dataset_path: hails/bigbench
dataset_kwargs:
# num_shots: 0 # TODO: the number of shots for the `bigbench` HF dataset should be controlled through this key, not through the harness's usual fewshot mechanism
# subtask_name: null
output_type: multiple_choice
test_split: default
doc_to_text: inputs
doc_to_target: "{{multiple_choice_targets.index(targets[0])}}"
doc_to_choice: "{{multiple_choice_targets}}"
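# Illustrative example (assumed field shapes): for a doc such as
#   targets: ["yes"]
#   multiple_choice_targets: ["no", "yes"]
# doc_to_target renders to 1 (the index of the gold answer among the choices)
# and doc_to_choice to the full option list ["no", "yes"].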
metric_list:
- metric: acc
# TODO: brier score and other metrics
"""
A utility script that pushes all BIG-bench subtasks, in their form from the `bigbench` HF dataset,
into `{org name}/bigbench`.
Prior to running, log into the HF Hub for the target org via `huggingface-cli login`.
Requires
`pip install "bigbench @ https://storage.googleapis.com/public_research_data/bigbench/bigbench-0.0.1.tar.gz"`.
The script is included here so that the evaluation harness itself can avoid a dependency on `bigbench`.
"""
from tqdm import tqdm
import datasets
import bigbench.api.util as bb_utils
all_task_names = bb_utils.get_all_json_task_names()
num_shots = [0]
for shots in num_shots:
for task_name in tqdm(all_task_names):
try:
print(f"Loading '{task_name}' with num_shots={shots}...")
task_ds = datasets.load_dataset("bigbench", name=task_name, num_shots=shots)
print(f"Pushing '{task_name}' with num_shots={shots}...")
task_ds.push_to_hub("hails/bigbench", task_name + "_zero_shot")
del task_ds
        except Exception:
            # Fail loudly: a broken subtask should abort the whole run.
            raise
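# Illustrative usage (assumed names): once pushed, a subtask can be loaded back
# without the `bigbench` dependency, e.g.
#   ds = datasets.load_dataset("hails/bigbench", "winowhy_zero_shot")
# The task YAMLs above then read examples from the dataset's "default" split.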
#!/usr/bin/python
import os
import re
import sys
import math
import subprocess
import xml.sax.saxutils
from typing import List, Pattern, Tuple, Union, Dict, Any, Optional
"""
This script was adapted from the original version by hieuhoang1972 which is part of MOSES.
"""
# $Id: bleu.py 1307 2007-03-14 22:22:36Z hieuhoang1972 $
"""Provides:
cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test().
cook_test(test, item, n=4): Transform a test sentence as a string (together with the cooked reference sentences produced by cook_refs()) into a form usable by score_cooked().
score_cooked(allcomps, n=4): Score a list of cooked test sentences.
The reason for breaking the BLEU computation into three phases cook_refs(), cook_test(), and score_cooked() is to allow the caller to calculate BLEU scores for multiple test sets as efficiently as possible.
"""
# Added to bypass NIST-style pre-processing of hyp and ref files -- wade
nonorm = 0
preserve_case = False
eff_ref_len = "shortest"
normalize1: List[Tuple[Union[Pattern[str], str], str]] = [
("<skipped>", ""), # strip "skipped" tags
(r"-\n", ""), # strip end-of-line hyphenation and join lines
(r"\n", " "), # join lines
# (r'(\d)\s+(?=\d)', r'\1'), # join digits
]
normalize1 = [(re.compile(pattern), replace) for (pattern, replace) in normalize1]
normalize2: List[Tuple[Union[Pattern[str], str], str]] = [
(
r"([\{-\~\[-\` -\&\(-\+\:-\@\/])",
r" \1 ",
), # tokenize punctuation. apostrophe is missing
(
r"([^0-9])([\.,])",
r"\1 \2 ",
), # tokenize period and comma unless preceded by a digit
(
r"([\.,])([^0-9])",
r" \1 \2",
), # tokenize period and comma unless followed by a digit
(r"([0-9])(-)", r"\1 \2 "), # tokenize dash when preceded by a digit
]
normalize2 = [(re.compile(pattern), replace) for (pattern, replace) in normalize2]
def normalize(s):
"""Normalize and tokenize text. This is lifted from NIST mteval-v11a.pl."""
# Added to bypass NIST-style pre-processing of hyp and ref files -- wade
if nonorm:
return s.split()
if type(s) is not str:
s = " ".join(s)
# language-independent part:
for (pattern, replace) in normalize1:
s = re.sub(pattern, replace, s)
s = xml.sax.saxutils.unescape(s, {"&quot;": '"'})
# language-dependent part (assuming Western languages):
s = " %s " % s
if not preserve_case:
s = s.lower() # this might not be identical to the original
for (pattern, replace) in normalize2:
s = re.sub(pattern, replace, s)
return s.split()
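# For example (illustrative), normalize("Hello, world.") lower-cases and
# tokenizes punctuation, yielding ["hello", ",", "world", "."].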
def count_ngrams(words, n=4):
counts: Dict[Any, int] = {}
for k in range(1, n + 1):
for i in range(len(words) - k + 1):
ngram = tuple(words[i : i + k])
counts[ngram] = counts.get(ngram, 0) + 1
return counts
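# For example (illustrative), count_ngrams("the cat the".split(), n=2) returns
# {("the",): 2, ("cat",): 1, ("the", "cat"): 1, ("cat", "the"): 1}.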
def cook_refs(refs, n=4):
"""Takes a list of reference sentences for a single segment
and returns an object that encapsulates everything that BLEU
needs to know about them."""
refs = [normalize(ref) for ref in refs]
    maxcounts: Dict[Tuple[str, ...], int] = {}
for ref in refs:
counts = count_ngrams(ref, n)
for (ngram, count) in counts.items():
maxcounts[ngram] = max(maxcounts.get(ngram, 0), count)
return ([len(ref) for ref in refs], maxcounts)
def cook_test(test, item, n=4):
"""Takes a test sentence and returns an object that
encapsulates everything that BLEU needs to know about it."""
(reflens, refmaxcounts) = item
test = normalize(test)
result: Dict[str, Any] = {}
result["testlen"] = len(test)
# Calculate effective reference sentence length.
if eff_ref_len == "shortest":
result["reflen"] = min(reflens)
elif eff_ref_len == "average":
result["reflen"] = float(sum(reflens)) / len(reflens)
elif eff_ref_len == "closest":
min_diff: Optional[int] = None
for reflen in reflens:
if min_diff is None or abs(reflen - len(test)) < min_diff:
min_diff = abs(reflen - len(test))
result["reflen"] = reflen
result["guess"] = [max(len(test) - k + 1, 0) for k in range(1, n + 1)]
result["correct"] = [0] * n
counts = count_ngrams(test, n)
for (ngram, count) in counts.items():
result["correct"][len(ngram) - 1] += min(refmaxcounts.get(ngram, 0), count)
return result
def score_cooked(allcomps, n=4, ground=0, smooth=1):
totalcomps: Dict[str, Any] = {
"testlen": 0,
"reflen": 0,
"guess": [0] * n,
"correct": [0] * n,
}
for comps in allcomps:
for key in ["testlen", "reflen"]:
totalcomps[key] += comps[key]
for key in ["guess", "correct"]:
for k in range(n):
totalcomps[key][k] += comps[key][k]
logbleu = 0.0
all_bleus: List[float] = []
for k in range(n):
correct = totalcomps["correct"][k]
guess = totalcomps["guess"][k]
addsmooth = 0
if smooth == 1 and k > 0:
addsmooth = 1
logbleu += math.log(correct + addsmooth + sys.float_info.min) - math.log(
guess + addsmooth + sys.float_info.min
)
if guess == 0:
all_bleus.append(-10000000.0)
else:
all_bleus.append(math.log(correct + sys.float_info.min) - math.log(guess))
logbleu /= float(n)
all_bleus.insert(0, logbleu)
brevPenalty = min(
0, 1 - float(totalcomps["reflen"] + 1) / (totalcomps["testlen"] + 1)
)
for i in range(len(all_bleus)):
if i == 0:
all_bleus[i] += brevPenalty
all_bleus[i] = math.exp(all_bleus[i])
return all_bleus
def bleu(refs, candidate, ground=0, smooth=1):
refs = cook_refs(refs)
test = cook_test(candidate, refs)
return score_cooked([test], ground=ground, smooth=smooth)
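# Illustrative usage: bleu() returns a list whose first entry is the overall
# smoothed BLEU (with the brevity penalty applied), followed by the per-order
# n-gram precisions; an exact match scores 1.0, e.g.
#   bleu(["the cat sat on the mat"], "the cat sat on the mat")[0]  # -> 1.0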
def splitPuncts(line):
return " ".join(re.findall(r"[\w]+|[^\s\w]", line))
def computeMaps(predictions, goldfile):
    predictionMap: Dict[str, list] = {}
    goldMap: Dict[str, list] = {}
    for row in predictions:
        cols = row.strip().split("\t")
        if len(cols) == 1:
            (rid, pred) = (cols[0], "")
        else:
            (rid, pred) = (cols[0], cols[1])
        predictionMap[rid] = [splitPuncts(pred.strip().lower())]
    # Read the gold file via a context manager so the handle is always closed.
    with open(goldfile, "r") as gf:
        for row in gf:
            (rid, pred) = row.split("\t")
            if rid in predictionMap:  # Only insert if the id exists for the method
                if rid not in goldMap:
                    goldMap[rid] = []
                goldMap[rid].append(splitPuncts(pred.strip().lower()))
    sys.stderr.write("Total: " + str(len(goldMap)) + "\n")
    return (goldMap, predictionMap)
# m1 is the reference map
# m2 is the prediction map
def bleuFromMaps(m1, m2):
score = [0] * 5
num = 0.0
for key in m1:
if key in m2:
bl = bleu(m1[key], m2[key][0])
score = [score[i] + bl[i] for i in range(0, len(bl))]
num += 1
return [s * 100.0 / num for s in score]
def smoothed_bleu_4(references, predictions, **kwargs):
predictionMap = {}
goldMap = {}
for rid, pred in enumerate(predictions):
predictionMap[rid] = [splitPuncts(pred.strip().lower())]
for rid, row in enumerate(references):
goldMap[rid] = [splitPuncts(row.strip().lower())]
return bleuFromMaps(goldMap, predictionMap)[0]
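# Illustrative call (assumed inputs): the task YAMLs below register this via
# `metric: !function bleu.smoothed_bleu_4`; given parallel lists, e.g.
#   smoothed_bleu_4(["adds two numbers ."], ["add two numbers ."])
# it returns the corpus-level smoothed BLEU-4 scaled by 100.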
if __name__ == "__main__":
reference_file = sys.argv[1]
predictions = []
for row in sys.stdin:
predictions.append(row)
(goldMap, predictionMap) = computeMaps(predictions, reference_file)
print(bleuFromMaps(goldMap, predictionMap)[0])
group:
- codexglue_code2text
task: code2text_go
dataset_path: CM/codexglue_code2text_go
training_split: train
validation_split: validation
test_split: test
output_type: generate_until
generation_kwargs:
num_beams: 10
max_length: 128
until:
- "</s>"
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
metric_list:
- metric: !function bleu.smoothed_bleu_4
aggregation: mean
higher_is_better: True
group:
- codexglue_code2text
task: code2text_java
dataset_path: CM/codexglue_code2text_java
training_split: train
validation_split: validation
test_split: test
output_type: generate_until
generation_kwargs:
num_beams: 10
max_length: 128
until:
- "</s>"
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
metric_list:
- metric: !function bleu.smoothed_bleu_4
aggregation: mean
higher_is_better: True
group:
- codexglue_code2text
task: code2text_javascript
dataset_path: CM/codexglue_code2text_javascript
training_split: train
validation_split: validation
test_split: test
output_type: generate_until
generation_kwargs:
num_beams: 10
max_length: 128
until:
- "</s>"
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
metric_list:
- metric: !function bleu.smoothed_bleu_4
aggregation: mean
higher_is_better: True
group:
- codexglue_code2text
task: code2text_php
dataset_path: CM/codexglue_code2text_php
training_split: train
validation_split: validation
test_split: test
output_type: generate_until
generation_kwargs:
num_beams: 10
max_length: 128
until:
- "</s>"
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
metric_list:
- metric: !function bleu.smoothed_bleu_4
aggregation: mean
higher_is_better: True
group:
- codexglue_code2text
task: code2text_python
dataset_path: CM/codexglue_code2text_python
training_split: train
validation_split: validation
test_split: test
output_type: generate_until
generation_kwargs:
num_beams: 10
max_length: 128
until:
- "</s>"
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
metric_list:
- metric: !function bleu.smoothed_bleu_4
aggregation: mean
higher_is_better: True
group:
- codexglue_code2text
task: code2text_ruby
dataset_path: CM/codexglue_code2text_ruby
training_split: train
validation_split: validation
test_split: test
output_type: generate_until
generation_kwargs:
num_beams: 10
max_length: 128
until:
- "</s>"
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
metric_list:
- metric: !function bleu.smoothed_bleu_4
aggregation: mean
higher_is_better: True
def doc_to_text(doc):
    # Flatten the tokenized source code into a single whitespace-normalized line.
    inputs = " ".join(doc["code_tokens"]).replace("\n", " ")
    inputs = " ".join(inputs.strip().split())
    return inputs
def doc_to_target(doc):
    # Flatten the tokenized docstring into a single whitespace-normalized line.
    targets = " ".join(doc["docstring_tokens"]).replace("\n", "")
    targets = " ".join(targets.strip().split())
    return targets
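# Illustrative (assumed doc shape): for
#   doc = {"code_tokens": ["def", "add", "(", "a", ",", "b", ")", ":"],
#          "docstring_tokens": ["Add", "two", "numbers", "."]}
# doc_to_text(doc) -> "def add ( a , b ) :"
# doc_to_target(doc) -> "Add two numbers ."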
task: coqa
dataset_path: EleutherAI/coqa
-output_type: greedy_until
+output_type: generate_until
training_split: train
validation_split: validation
doc_to_text: !function utils.doc_to_text
......
task: drop
dataset_path: EleutherAI/drop
-output_type: greedy_until
+output_type: generate_until
training_split: train
validation_split: validation
process_docs: !function utils.process_docs
......
@@ -3,7 +3,7 @@ group:
task: gsm8k_cot
dataset_path: gsm8k
dataset_name: main
-output_type: greedy_until
+output_type: generate_until
test_split: test
doc_to_text: "Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\n\nA: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.\n\n\
Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\n\nA: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.\n\n\
@@ -14,8 +14,7 @@ Q: There were nine computers in the server room. Five more computers were instal
Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?\n\nA: Michael started with 58 golf balls. After losing 23 on tuesday, he had 58 - 23 = 35. After losing 2 more, he had 35 - 2 = 33 golf balls. The answer is 33.\n\n\
Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?\n\nA: Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15 dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. The answer is 8.\n\n\
Q: {{question}}\n\nA:"
-doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}"
-gold_alias: "{{answer.split('### ')[-1].rstrip()}}" # this post-processes the reference that we'll score against
+doc_to_target: " {{answer.split('### ')[-1].rstrip()}}"
metric_list:
- metric: exact_match
aggregation: mean
@@ -25,6 +24,8 @@ metric_list:
regexes_to_ignore:
- ","
- "\\$"
- "(?s).*#### "
- "\n\n"
generation_kwargs:
until:
- "Q:"
@@ -37,5 +38,5 @@ filter_list:
- name: "get-answer"
filter:
- function: "regex"
-regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
+regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)."
- function: "take_first"
group:
- math_word_problems
-task: gsm8k_yaml
+task: gsm8k
dataset_path: gsm8k
dataset_name: main
-output_type: greedy_until
+output_type: generate_until
training_split: train
fewshot_split: train
test_split: test
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}"
-gold_alias: "{{answer.split('### ')[-1].rstrip()}}" # this post-processes the reference that we'll score against
metric_list:
- metric: exact_match
aggregation: mean
@@ -19,7 +18,7 @@ metric_list:
regexes_to_ignore:
- ","
- "\\$"
- ".*### "
- "(?s).*#### "
generation_kwargs:
until:
- "\n\n"
@@ -28,9 +27,9 @@ generation_kwargs:
temperature: 0.0
repeats: 1
num_fewshot: 5
-# filter_list:
-# - name: "get-answer"
-# filter:
-# - function: "regex"
-# regex_pattern: "### (\\-?[0-9\\.\\,]+)"
-# - function: "take_first"
+filter_list:
+- name: "get-answer"
+  filter:
+  - function: "regex"
+    regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
+  - function: "take_first"
@@ -9,7 +9,6 @@
# template_aliases: #"{% set answer_choices = range(1, 11)|list %}"
# doc_to_text: 'Activity: "{{activity}}"\nRating:'
# doc_to_target: "{{answer_choices[label]}}"
-# gold_alias: "{{label}}" # this will be cast to an int.
# metric_list:
# - metric: acc
# TODO: we want this to be implemented as a winograd_schema task type, actually