"""
CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation
https://arxiv.org/abs/2102.04664

Text to text task from CodeXGlue (documentation translation)
"""

import json
import os
import re

from evaluate import load

from bigcode_eval.base import Task

# BibTeX citation for the CodeXGLUE benchmark, reported alongside results.
_CITATION = """
@article{CodeXGLUE,
         title={CodeXGLUE: A Benchmark Dataset and Open Challenge for Code Intelligence},
         year={2020},}
"""

# Maps the dataset config name (e.g. "da_en", also used as DATASET_NAME for
# the HF dataset) to the human-readable source language; the translation
# target is always English.
SOURCE_LANG = {
    "da_en": "danish",
    "zh_en": "chinese",
    "no_en": "norwegian",
    "lv_en": "latvian",
}


def create_all_tasks():
    """Build one task per supported translation direction.

    :return: {task_name: task}
        e.g. {codexglue_text_to_text-da_en: Task, codexglue_text_to_text-zh_en: Task}
    """
    tasks = {}
    # One task per source-language config key (target is always English).
    for translation_task in SOURCE_LANG:
        task_name = f"codexglue_text_to_text-{translation_task}"
        tasks[task_name] = create_task(translation_task)
    return tasks


def create_task(translation_task):
    """Return a Task subclass bound to a single translation direction.

    The returned class forwards ``translation_task`` to the base
    constructor so the harness can instantiate it without arguments.
    """

    class CodexglueTextToTextTask(CodexglueTextToText):
        # Thin wrapper: fixes the translation direction at class-creation time.
        def __init__(self, **kwargs):
            super().__init__(translation_task, **kwargs)

    return CodexglueTextToTextTask


class CodexglueTextToText(Task):
    """Documentation-translation (text-to-text) task from CodeXGLUE.

    Builds a two-shot prompt (source-language documentation -> English) for
    each test document and scores the generations against the English
    references with BLEU.
    """

    DATASET_PATH = "code_x_glue_tt_text_to_text"
    DATASET_NAME = None

    def __init__(self, translation_task, max_order=4, smooth=True):
        # translation_task is a SOURCE_LANG key (e.g. "da_en") and doubles as
        # the HF dataset config name.
        self.DATASET_NAME = translation_task
        stop_words = ["\n"]
        requires_execution = False
        super().__init__(stop_words, requires_execution)
        self.max_order = max_order  # BLEU maximum n-gram order
        self.smooth = smooth  # whether to apply BLEU smoothing
        self._fewshot_cache = None  # lazily-loaded few-shot examples

    def get_dataset(self):
        """Returns dataset for the task or an iterable of any object, that get_prompt can handle"""
        return self.dataset["test"]

    def fewshot_examples(self):
        """Loads and returns the few-shot examples for the task if they exist.

        The JSON file is read once and cached on the instance, since
        get_prompt() is called once per document; callers must treat the
        returned dict as read-only.
        """
        if self._fewshot_cache is None:
            with open(
                "bigcode_eval/tasks/few_shot_examples/codexglue_text_to_text_few_shot_prompts.json",
                "r",
                # explicit encoding: the platform default (e.g. cp1252 on
                # Windows) would mis-decode non-ASCII example text
                encoding="utf-8",
            ) as file:
                self._fewshot_cache = json.load(file)
        return self._fewshot_cache

    @staticmethod
    def two_shot_prompt(entry, text, examples, language):
        """Two shot prompt format as source & target language documentation

        NOTE(review): the backslash line-continuations embed the literal
        indentation whitespace into the prompt string; this matches the
        prompts used by the original evaluation, so the literal is kept
        byte-for-byte.
        """
        prompt = f"\n{language.title()}:\n{examples['source1']}\
                   \nEnglish:\n{examples['target1']}\
                   \n{language.title()}:\n{examples['source2']}\
                   \nEnglish:\n{examples['target2']}\
                   \n{language.title()}:\n{text}\
                   \nEnglish:\n"
        return entry + prompt

    def get_prompt(self, doc):
        """Builds the prompt for the LM to generate from.

        :param doc: dict
            test-split sample with a "source" field holding the
            source-language documentation to translate
        """
        language = SOURCE_LANG[self.DATASET_NAME]
        text = doc["source"]
        entry = f"Translate the following documentation from {language.title()} to English:\n"
        examples = self.fewshot_examples()[language]
        prompt = self.two_shot_prompt(entry, text, examples, language)
        return prompt

    def get_reference(self, doc):
        """Builds the reference solution for the doc (sample from the test dataset)."""
        return doc["target"].strip()

    def postprocess_generation(self, generation, idx):
        """Defines the postprocessing for a LM generation.
        :param generation: str
            code generation from LM (prompt included)
        :param idx: int
            index of doc in the dataset to which the generation belongs
            (not used for this task)
        """
        # The prompt contains exactly three "\nEnglish:\n" markers; with
        # maxsplit=3 the last element is everything the model produced after
        # the final marker, kept intact even if it contains the marker again.
        output = generation.split("\nEnglish:\n", 3)[-1].strip()
        return output

    def process_results(self, generations, references):
        """Takes the list of LM generations and evaluates them against ground truth references,
        returning the metric for the generations.
        :param generations: list(list(str))
            list of lists containing generations
        :param references: list(str)
            list of str containing references
        """
        bleu = load("bleu")
        # only the first candidate per problem is scored
        gens = [gen[0] for gen in generations]
        results = bleu.compute(
            references=references,
            predictions=gens,
            max_order=self.max_order,
            smooth=self.smooth,
        )
        return results