Commit e1ae8a2f authored by Herbie Bradley

Merge remote-tracking branch 'origin/big-refactor' into calibration

parents 50e99bd7 30936bc7
"dataset_name": "marxist_theory"
"description": "以下是关于马克思主义理论的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_marxist_theory"
"dataset_name": "modern_chinese"
"description": "以下是关于现代汉语的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_modern_chinese"
"dataset_name": "nutrition"
"description": "以下是关于营养学的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_nutrition"
"dataset_name": "philosophy"
"description": "以下是关于哲学的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_philosophy"
"dataset_name": "professional_accounting"
"description": "以下是关于专业会计的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_professional_accounting"
"dataset_name": "professional_law"
"description": "以下是关于专业法学的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_professional_law"
"dataset_name": "professional_medicine"
"description": "以下是关于专业医学的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_professional_medicine"
"dataset_name": "professional_psychology"
"description": "以下是关于专业心理学的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_professional_psychology"
"dataset_name": "public_relations"
"description": "以下是关于公共关系的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_public_relations"
"dataset_name": "security_study"
"description": "以下是关于安全研究的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_security_study"
"dataset_name": "sociology"
"description": "以下是关于社会学的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_sociology"
"dataset_name": "sports_science"
"description": "以下是关于体育学的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_sports_science"
"dataset_name": "traditional_chinese_medicine"
"description": "以下是关于中医中药的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_traditional_chinese_medicine"
"dataset_name": "virology"
"description": "以下是关于病毒学的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_virology"
"dataset_name": "world_history"
"description": "以下是关于世界历史的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_world_history"
"dataset_name": "world_religions"
"description": "以下是关于世界宗教的单项选择题,请直接给出正确答案的选项。\n\n"
"include": "_default_template_yaml"
"task": "cmmlu_world_religions"
#!/usr/bin/python
import re
import sys
import math
import xml.sax.saxutils
from typing import List, Pattern, Tuple, Union, Dict, Any, Optional
"""
This script was adapted from the original version by hieuhoang1972, which is part of MOSES.
"""
# $Id: bleu.py 1307 2007-03-14 22:22:36Z hieuhoang1972 $
"""Provides:
cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test().
cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked().
score_cooked(alltest, n=4): Score a list of cooked test sentences.
score_set(s, testid, refids, n=4): Interface with dataset.py; calculate BLEU score of testid against refids.
The reason for breaking the BLEU computation into three phases cook_refs(), cook_test(), and score_cooked() is to allow the caller to calculate BLEU scores for multiple test sets as efficiently as possible.
"""
# Added to bypass NIST-style pre-processing of hyp and ref files -- wade
nonorm = 0
preserve_case = False
eff_ref_len = "shortest"
normalize1: List[Tuple[Union[Pattern[str], str], str]] = [
("<skipped>", ""), # strip "skipped" tags
(r"-\n", ""), # strip end-of-line hyphenation and join lines
(r"\n", " "), # join lines
# (r'(\d)\s+(?=\d)', r'\1'), # join digits
]
normalize1 = [(re.compile(pattern), replace) for (pattern, replace) in normalize1]
normalize2: List[Tuple[Union[Pattern[str], str], str]] = [
(
r"([\{-\~\[-\` -\&\(-\+\:-\@\/])",
r" \1 ",
), # tokenize punctuation. apostrophe is missing
(
r"([^0-9])([\.,])",
r"\1 \2 ",
), # tokenize period and comma unless preceded by a digit
(
r"([\.,])([^0-9])",
r" \1 \2",
), # tokenize period and comma unless followed by a digit
(r"([0-9])(-)", r"\1 \2 "), # tokenize dash when preceded by a digit
]
normalize2 = [(re.compile(pattern), replace) for (pattern, replace) in normalize2]
def normalize(s):
"""Normalize and tokenize text. This is lifted from NIST mteval-v11a.pl."""
# Added to bypass NIST-style pre-processing of hyp and ref files -- wade
if nonorm:
return s.split()
if type(s) is not str:
s = " ".join(s)
# language-independent part:
for (pattern, replace) in normalize1:
s = re.sub(pattern, replace, s)
s = xml.sax.saxutils.unescape(s, {"&quot;": '"'})
# language-dependent part (assuming Western languages):
s = " %s " % s
if not preserve_case:
s = s.lower() # this might not be identical to the original
for (pattern, replace) in normalize2:
s = re.sub(pattern, replace, s)
return s.split()
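# Illustrative example: with the default flags, normalize("Hello, world!")
# lowercases, splits punctuation off, and returns the token list
# ["hello", ",", "world", "!"].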
def count_ngrams(words, n=4):
counts: Dict[Any, int] = {}
for k in range(1, n + 1):
for i in range(len(words) - k + 1):
ngram = tuple(words[i : i + k])
counts[ngram] = counts.get(ngram, 0) + 1
return counts
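# Illustrative example: count_ngrams(["the", "cat", "sat"], n=2) returns
# {("the",): 1, ("cat",): 1, ("sat",): 1, ("the", "cat"): 1, ("cat", "sat"): 1}.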
def cook_refs(refs, n=4):
"""Takes a list of reference sentences for a single segment
and returns an object that encapsulates everything that BLEU
needs to know about them."""
refs = [normalize(ref) for ref in refs]
maxcounts: Dict[Tuple[str], int] = {}
for ref in refs:
counts = count_ngrams(ref, n)
for (ngram, count) in counts.items():
maxcounts[ngram] = max(maxcounts.get(ngram, 0), count)
return ([len(ref) for ref in refs], maxcounts)
def cook_test(test, item, n=4):
"""Takes a test sentence and returns an object that
encapsulates everything that BLEU needs to know about it."""
(reflens, refmaxcounts) = item
test = normalize(test)
result: Dict[str, Any] = {}
result["testlen"] = len(test)
# Calculate effective reference sentence length.
if eff_ref_len == "shortest":
result["reflen"] = min(reflens)
elif eff_ref_len == "average":
result["reflen"] = float(sum(reflens)) / len(reflens)
elif eff_ref_len == "closest":
min_diff: Optional[int] = None
for reflen in reflens:
if min_diff is None or abs(reflen - len(test)) < min_diff:
min_diff = abs(reflen - len(test))
result["reflen"] = reflen
result["guess"] = [max(len(test) - k + 1, 0) for k in range(1, n + 1)]
result["correct"] = [0] * n
counts = count_ngrams(test, n)
for (ngram, count) in counts.items():
result["correct"][len(ngram) - 1] += min(refmaxcounts.get(ngram, 0), count)
return result
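# Illustrative example: cook_test("the cat", cook_refs(["the cat sat"]))
# yields {"testlen": 2, "reflen": 3, "guess": [2, 1, 0, 0],
# "correct": [2, 1, 0, 0]}; every test unigram and bigram appears in the
# reference.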
def score_cooked(allcomps, n=4, ground=0, smooth=1):
totalcomps: Dict[str, Any] = {
"testlen": 0,
"reflen": 0,
"guess": [0] * n,
"correct": [0] * n,
}
for comps in allcomps:
for key in ["testlen", "reflen"]:
totalcomps[key] += comps[key]
for key in ["guess", "correct"]:
for k in range(n):
totalcomps[key][k] += comps[key][k]
logbleu = 0.0
all_bleus: List[float] = []
for k in range(n):
correct = totalcomps["correct"][k]
guess = totalcomps["guess"][k]
addsmooth = 0
if smooth == 1 and k > 0:
addsmooth = 1
logbleu += math.log(correct + addsmooth + sys.float_info.min) - math.log(
guess + addsmooth + sys.float_info.min
)
if guess == 0:
all_bleus.append(-10000000.0)
else:
all_bleus.append(math.log(correct + sys.float_info.min) - math.log(guess))
logbleu /= float(n)
all_bleus.insert(0, logbleu)
brevPenalty = min(
0, 1 - float(totalcomps["reflen"] + 1) / (totalcomps["testlen"] + 1)
)
for i in range(len(all_bleus)):
if i == 0:
all_bleus[i] += brevPenalty
all_bleus[i] = math.exp(all_bleus[i])
return all_bleus
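# The returned list has n + 1 entries: index 0 is the length-penalized
# geometric mean of the (smoothed) n-gram precisions, and indices 1..n are
# the raw unigram through 4-gram precisions.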
def bleu(refs, candidate, ground=0, smooth=1):
refs = cook_refs(refs)
test = cook_test(candidate, refs)
return score_cooked([test], ground=ground, smooth=smooth)
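# Illustrative example: a perfect three-token match such as
# bleu(["the cat sat"], "the cat sat") gives 1.0 at index 0, since every
# available n-gram (there are no 4-grams here) is matched and the brevity
# penalty is zero.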
def splitPuncts(line):
return " ".join(re.findall(r"[\w]+|[^\s\w]", line))
def computeMaps(predictions, goldfile):
    predictionMap: Dict[str, list] = {}
    goldMap: Dict[str, list] = {}
    for row in predictions:
        cols = row.strip().split("\t")
        if len(cols) == 1:
            (rid, pred) = (cols[0], "")
        else:
            (rid, pred) = (cols[0], cols[1])
        predictionMap[rid] = [splitPuncts(pred.strip().lower())]
    with open(goldfile, "r") as gf:
        for row in gf:
            (rid, pred) = row.split("\t")
            if rid in predictionMap:  # Only insert if the id exists for the method
                if rid not in goldMap:
                    goldMap[rid] = []
                goldMap[rid].append(splitPuncts(pred.strip().lower()))
    sys.stderr.write("Total: " + str(len(goldMap)) + "\n")
    return (goldMap, predictionMap)
# m1 is the reference map
# m2 is the prediction map
def bleuFromMaps(m1, m2):
score = [0] * 5
num = 0.0
for key in m1:
if key in m2:
bl = bleu(m1[key], m2[key][0])
score = [score[i] + bl[i] for i in range(0, len(bl))]
num += 1
return [s * 100.0 / num for s in score]
def smoothed_bleu_4(references, predictions, **kwargs):
predictionMap = {}
goldMap = {}
for rid, pred in enumerate(predictions):
predictionMap[rid] = [splitPuncts(pred.strip().lower())]
for rid, row in enumerate(references):
goldMap[rid] = [splitPuncts(row.strip().lower())]
return bleuFromMaps(goldMap, predictionMap)[0]
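# Illustrative example: identical reference and prediction lists score 100.0,
# e.g. smoothed_bleu_4(["add two numbers"], ["add two numbers"]) == 100.0.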
if __name__ == "__main__":
reference_file = sys.argv[1]
predictions = []
for row in sys.stdin:
predictions.append(row)
(goldMap, predictionMap) = computeMaps(predictions, reference_file)
print(bleuFromMaps(goldMap, predictionMap)[0])
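
The YAML tasks below bind this module's smoothed_bleu_4 as their scoring function via !function bleu.smoothed_bleu_4. A minimal standalone sketch of calling it directly; the example strings are illustrative, not taken from the datasets:

# usage_sketch.py (hypothetical)
from bleu import smoothed_bleu_4

references = ["returns the sum of two numbers"]
predictions = ["return the sum of two numbers"]
# The two lists are index-aligned; the result is smoothed BLEU-4 on a 0-100 scale.
print(smoothed_bleu_4(references, predictions))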
group:
- codexglue_code2text
task: code2text_go
dataset_path: CM/codexglue_code2text_go
training_split: train
validation_split: validation
test_split: test
output_type: generate_until
generation_kwargs:
num_beams: 10
max_length: 128
until:
- "</s>"
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
metric_list:
- metric: !function bleu.smoothed_bleu_4
aggregation: mean
higher_is_better: True
group:
- codexglue_code2text
task: code2text_java
dataset_path: CM/codexglue_code2text_java
training_split: train
validation_split: validation
test_split: test
output_type: generate_until
generation_kwargs:
num_beams: 10
max_length: 128
until:
- "</s>"
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
metric_list:
- metric: !function bleu.smoothed_bleu_4
aggregation: mean
higher_is_better: True
group:
- codexglue_code2text
task: code2text_javascript
dataset_path: CM/codexglue_code2text_javascript
training_split: train
validation_split: validation
test_split: test
output_type: generate_until
generation_kwargs:
num_beams: 10
max_length: 128
until:
- "</s>"
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
metric_list:
- metric: !function bleu.smoothed_bleu_4
aggregation: mean
higher_is_better: True
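
Each of these tasks builds its prompt and reference through !function hooks into utils.py, which is not shown in this diff. A plausible sketch, assuming the standard CodeXGLUE fields code_tokens and docstring_tokens (both field names are assumptions):

# utils.py (hypothetical sketch)
def doc_to_text(doc):
    # Flatten the tokenized source code into a single-line prompt.
    return " ".join(" ".join(doc["code_tokens"]).replace("\n", " ").split())


def doc_to_target(doc):
    # Flatten the tokenized docstring into the reference summary.
    return " ".join(" ".join(doc["docstring_tokens"]).replace("\n", " ").split())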