add codexglue

76227f0d · lintangsutawika · ac50adb5 · 76227f0d · 76227f0d
Commit 76227f0d authored Sep 21, 2023 by lintangsutawika
Showing with 126 additions and 104 deletions

lm_eval/tasks/code_x_glue/code-text/bleu.py lm_eval/tasks/code_x_glue/code-text/bleu.py +120 -98

lm_eval/tasks/code_x_glue/code-text/utils.py lm_eval/tasks/code_x_glue/code-text/utils.py +6 -6

No files found.
--- a/lm_eval/tasks/code_x_glue/code-text/bleu.py
+++ b/lm_eval/tasks/code_x_glue/code-text/bleu.py
 #!/usr/bin/python

-'''
-This script was adapted from the original version by hieuhoang1972 which is part of MOSES. 
-'''
+"""
+This script was adapted from the original version by hieuhoang1972 which is part of MOSES.
+"""

 # $Id: bleu.py 1307 2007-03-14 22:22:36Z hieuhoang1972 $

-'''Provides:
+"""Provides:

 cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test().
 cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked().
@@ -15,7 +15,7 @@ score_cooked(alltest, n=4): Score a list of cooked test sentences.
 score_set(s, testid, refids, n=4): Interface with dataset.py; calculate BLEU score of testid against refids.

 The reason for breaking the BLEU computation into three phases cook_refs(), cook_test(), and score_cooked() is to allow the caller to calculate BLEU scores for multiple test sets as efficiently as possible.
-'''
+"""

 import sys, math, re, xml.sax.saxutils
 import subprocess
@@ -28,167 +28,188 @@ preserve_case = False
 eff_ref_len = "shortest"

 normalize1 = [
-    ('<skipped>', ''),         # strip "skipped" tags
-    (r'-\n', ''),              # strip end-of-line hyphenation and join lines
-    (r'\n', ' '),              # join lines
-#    (r'(\d)\s+(?=\d)', r'\1'), # join digits
+    ("<skipped>", ""),  # strip "skipped" tags
+    (r"-\n", ""),  # strip end-of-line hyphenation and join lines
+    (r"\n", " "),  # join lines
+    #    (r'(\d)\s+(?=\d)', r'\1'), # join digits
 ]
 normalize1 = [(re.compile(pattern), replace) for (pattern, replace) in normalize1]

 normalize2 = [
-    (r'([\{-\~\[-\` -\&\(-\+\:-\@\/])',r' \1 '), # tokenize punctuation. apostrophe is missing
-    (r'([^0-9])([\.,])',r'\1 \2 '),              # tokenize period and comma unless preceded by a digit
-    (r'([\.,])([^0-9])',r' \1 \2'),              # tokenize period and comma unless followed by a digit
-    (r'([0-9])(-)',r'\1 \2 ')                    # tokenize dash when preceded by a digit
+    (
+        r"([\{-\~\[-\` -\&\(-\+\:-\@\/])",
+        r" \1 ",
+    ),  # tokenize punctuation. apostrophe is missing
+    (
+        r"([^0-9])([\.,])",
+        r"\1 \2 ",
+    ),  # tokenize period and comma unless preceded by a digit
+    (
+        r"([\.,])([^0-9])",
+        r" \1 \2",
+    ),  # tokenize period and comma unless followed by a digit
+    (r"([0-9])(-)", r"\1 \2 "),  # tokenize dash when preceded by a digit
 ]
 normalize2 = [(re.compile(pattern), replace) for (pattern, replace) in normalize2]

+
 def normalize(s):
-    '''Normalize and tokenize text. This is lifted from NIST mteval-v11a.pl.'''
+    """Normalize and tokenize text. This is lifted from NIST mteval-v11a.pl."""
    # Added to bypass NIST-style pre-processing of hyp and ref files -- wade
-    if (nonorm):
+    if nonorm:
        return s.split()
    if type(s) is not str:
        s = " ".join(s)
    # language-independent part:
    for (pattern, replace) in normalize1:
        s = re.sub(pattern, replace, s)
-    s = xml.sax.saxutils.unescape(s, {'&quot;':'"'})
+    s = xml.sax.saxutils.unescape(s, {"&quot;": '"'})
    # language-dependent part (assuming Western languages):
    s = " %s " % s
    if not preserve_case:
-        s = s.lower()         # this might not be identical to the original
+        s = s.lower()  # this might not be identical to the original
    for (pattern, replace) in normalize2:
        s = re.sub(pattern, replace, s)
    return s.split()

+
 def count_ngrams(words, n=4):
    counts = {}
-    for k in range(1,n+1):
-        for i in range(len(words)-k+1):
-            ngram = tuple(words[i:i+k])
-            counts[ngram] = counts.get(ngram, 0)+1
+    for k in range(1, n + 1):
+        for i in range(len(words) - k + 1):
+            ngram = tuple(words[i : i + k])
+            counts[ngram] = counts.get(ngram, 0) + 1
    return counts

+
 def cook_refs(refs, n=4):
-    '''Takes a list of reference sentences for a single segment
+    """Takes a list of reference sentences for a single segment
    and returns an object that encapsulates everything that BLEU
-    needs to know about them.'''
-    
+    needs to know about them."""
+
    refs = [normalize(ref) for ref in refs]
    maxcounts = {}
    for ref in refs:
        counts = count_ngrams(ref, n)
-        for (ngram,count) in counts.items():
-            maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
+        for (ngram, count) in counts.items():
+            maxcounts[ngram] = max(maxcounts.get(ngram, 0), count)
    return ([len(ref) for ref in refs], maxcounts)

+
 def cook_test(test, item, n=4):
-    '''Takes a test sentence and returns an object that
-    encapsulates everything that BLEU needs to know about it.'''
-    (reflens, refmaxcounts)=item
+    """Takes a test sentence and returns an object that
+    encapsulates everything that BLEU needs to know about it."""
+    (reflens, refmaxcounts) = item
    test = normalize(test)
    result = {}
    result["testlen"] = len(test)

    # Calculate effective reference sentence length.
-    
+
    if eff_ref_len == "shortest":
        result["reflen"] = min(reflens)
    elif eff_ref_len == "average":
-        result["reflen"] = float(sum(reflens))/len(reflens)
+        result["reflen"] = float(sum(reflens)) / len(reflens)
    elif eff_ref_len == "closest":
        min_diff = None
        for reflen in reflens:
-            if min_diff is None or abs(reflen-len(test)) < min_diff:
-                min_diff = abs(reflen-len(test))
-                result['reflen'] = reflen
+            if min_diff is None or abs(reflen - len(test)) < min_diff:
+                min_diff = abs(reflen - len(test))
+                result["reflen"] = reflen

-    result["guess"] = [max(len(test)-k+1,0) for k in range(1,n+1)]
+    result["guess"] = [max(len(test) - k + 1, 0) for k in range(1, n + 1)]

-    result['correct'] = [0]*n
+    result["correct"] = [0] * n
    counts = count_ngrams(test, n)
    for (ngram, count) in counts.items():
-        result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count)
+        result["correct"][len(ngram) - 1] += min(refmaxcounts.get(ngram, 0), count)

    return result

+
 def score_cooked(allcomps, n=4, ground=0, smooth=1):
-    totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n}
+    totalcomps = {"testlen": 0, "reflen": 0, "guess": [0] * n, "correct": [0] * n}
    for comps in allcomps:
-        for key in ['testlen','reflen']:
+        for key in ["testlen", "reflen"]:
            totalcomps[key] += comps[key]
-        for key in ['guess','correct']:
+        for key in ["guess", "correct"]:
            for k in range(n):
                totalcomps[key][k] += comps[key][k]
    logbleu = 0.0
    all_bleus = []
    for k in range(n):
-      correct = totalcomps['correct'][k]
-      guess = totalcomps['guess'][k]
-      addsmooth = 0
-      if smooth == 1 and k > 0:
-        addsmooth = 1
-      logbleu += math.log(correct + addsmooth + sys.float_info.min)-math.log(guess + addsmooth+ sys.float_info.min)
-      if guess == 0:
-        all_bleus.append(-10000000)
-      else:
-        all_bleus.append(math.log(correct + sys.float_info.min)-math.log( guess ))
+        correct = totalcomps["correct"][k]
+        guess = totalcomps["guess"][k]
+        addsmooth = 0
+        if smooth == 1 and k > 0:
+            addsmooth = 1
+        logbleu += math.log(correct + addsmooth + sys.float_info.min) - math.log(
+            guess + addsmooth + sys.float_info.min
+        )
+        if guess == 0:
+            all_bleus.append(-10000000)
+        else:
+            all_bleus.append(math.log(correct + sys.float_info.min) - math.log(guess))

    logbleu /= float(n)
    all_bleus.insert(0, logbleu)

-    brevPenalty = min(0,1-float(totalcomps['reflen'] + 1)/(totalcomps['testlen'] + 1))
+    brevPenalty = min(
+        0, 1 - float(totalcomps["reflen"] + 1) / (totalcomps["testlen"] + 1)
+    )
    for i in range(len(all_bleus)):
-      if i ==0:
-        all_bleus[i] += brevPenalty
-      all_bleus[i] = math.exp(all_bleus[i])
+        if i == 0:
+            all_bleus[i] += brevPenalty
+        all_bleus[i] = math.exp(all_bleus[i])
    return all_bleus

-def bleu(refs,  candidate, ground=0, smooth=1):
+
+def bleu(refs, candidate, ground=0, smooth=1):
    refs = cook_refs(refs)
    test = cook_test(candidate, refs)
    return score_cooked([test], ground=ground, smooth=smooth)

+
 def splitPuncts(line):
-  return ' '.join(re.findall(r"[\w]+|[^\s\w]", line))
+    return " ".join(re.findall(r"[\w]+|[^\s\w]", line))
+

 def computeMaps(predictions, goldfile):
-  predictionMap = {}
-  goldMap = {}
-  gf = open(goldfile, 'r')
-
-  for row in predictions:
-    cols = row.strip().split('\t')
-    if len(cols) == 1:
-      (rid, pred) = (cols[0], '') 
-    else:
-      (rid, pred) = (cols[0], cols[1]) 
-    predictionMap[rid] = [splitPuncts(pred.strip().lower())]
-
-  for row in gf:
-    (rid, pred) = row.split('\t') 
-    if rid in predictionMap: # Only insert if the id exists for the method
-      if rid not in goldMap:
-        goldMap[rid] = []
-      goldMap[rid].append(splitPuncts(pred.strip().lower()))
-
-  sys.stderr.write('Total: ' + str(len(goldMap)) + '\n')
-  return (goldMap, predictionMap)
-
-
-#m1 is the reference map
-#m2 is the prediction map
+    predictionMap = {}
+    goldMap = {}
+    gf = open(goldfile, "r")
+
+    for row in predictions:
+        cols = row.strip().split("\t")
+        if len(cols) == 1:
+            (rid, pred) = (cols[0], "")
+        else:
+            (rid, pred) = (cols[0], cols[1])
+        predictionMap[rid] = [splitPuncts(pred.strip().lower())]
+
+    for row in gf:
+        (rid, pred) = row.split("\t")
+        if rid in predictionMap:  # Only insert if the id exists for the method
+            if rid not in goldMap:
+                goldMap[rid] = []
+            goldMap[rid].append(splitPuncts(pred.strip().lower()))
+
+    sys.stderr.write("Total: " + str(len(goldMap)) + "\n")
+    return (goldMap, predictionMap)
+
+
+# m1 is the reference map
+# m2 is the prediction map
 def bleuFromMaps(m1, m2):
-  score = [0] * 5
-  num = 0.0
+    score = [0] * 5
+    num = 0.0

-  for key in m1:
-    if key in m2:
-      bl = bleu(m1[key], m2[key][0])
-      score = [ score[i] + bl[i] for i in range(0, len(bl))]
-      num += 1
-  return [s * 100.0 / num for s in score]
+    for key in m1:
+        if key in m2:
+            bl = bleu(m1[key], m2[key][0])
+            score = [score[i] + bl[i] for i in range(0, len(bl))]
+            num += 1
+    return [s * 100.0 / num for s in score]


 def smoothed_bleu_4(references, predictions, **kwargs):
@@ -197,17 +218,18 @@ def smoothed_bleu_4(references, predictions, **kwargs):
    goldMap = {}

    for rid, pred in enumerate(predictions):
-      predictionMap[rid] = [splitPuncts(pred.strip().lower())]
+        predictionMap[rid] = [splitPuncts(pred.strip().lower())]

    for rid, row in enumerate(references):
-      goldMap[rid] = [splitPuncts(row.strip().lower())]
+        goldMap[rid] = [splitPuncts(row.strip().lower())]

    return bleuFromMaps(goldMap, predictionMap)[0]

-if __name__ == '__main__':
-  reference_file = sys.argv[1]
-  predictions = []
-  for row in sys.stdin:
-    predictions.append(row)
-  (goldMap, predictionMap) = computeMaps(predictions, reference_file) 
-  print (bleuFromMaps(goldMap, predictionMap)[0])
+
+if __name__ == "__main__":
+    reference_file = sys.argv[1]
+    predictions = []
+    for row in sys.stdin:
+        predictions.append(row)
+    (goldMap, predictionMap) = computeMaps(predictions, reference_file)
+    print(bleuFromMaps(goldMap, predictionMap)[0])
--- a/lm_eval/tasks/code_x_glue/code-text/utils.py
+++ b/lm_eval/tasks/code_x_glue/code-text/utils.py
-
 def doc_to_text(doc):

-    inputs = ' '.join(doc['code_tokens']).replace('\n',' ')
-    inputs = ' '.join(inputs.strip().split())
+    inputs = " ".join(doc["code_tokens"]).replace("\n", " ")
+    inputs = " ".join(inputs.strip().split())

    return inputs

+
 def doc_to_target(doc):

-    targets = ' '.join(doc['docstring_tokens']).replace('\n','')
-    targets = ' '.join(targets.strip().split())     
+    targets = " ".join(doc["docstring_tokens"]).replace("\n", "")
+    targets = " ".join(targets.strip().split())

-    return targets
\ No newline at end of file
+    return targets