glue.py

# REMINDER: this code needs to be rewritten for the new framework. Remove this comment when the code is fully converted.

import numpy as np
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef
from tqdm import auto as tqdm_lib
from . common import HFTask, simple_accuracy_metric, yesno

def get_accuracy_and_f1(preds, golds):
    golds = np.array(golds)
    preds = np.array(preds)
    acc = float((preds == golds).mean())
    f1 = float(f1_score(y_true=golds, y_pred=preds))
    minor = {
        "acc": acc,
        "f1": f1,
        "acc_and_f1": (acc + f1) / 2,
    }
    return {
        "major": minor["acc_and_f1"],
        "minor": minor,
        "higher_is_better": True,
    }


class CoLA(HFTask):
    DATASET_PATH = "glue"
    DATASET_NAME = "cola"
    
    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        return "Does this sentence make sense?:\tTrue or False?"

    def doc_to_text(self, doc, include_target=True):
        text = "Sentence: {}\nAnswer:".format(doc["sentence"])
        if include_target:
            text += " {}".format({1: "True", 0: "False"}[doc["label"]])
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):

        # TODO: Implement evaluation code using new framework

        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework. 
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
            ctx = self.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )
            preds.append(lm.loglikelihood(ctx, ' True') > lm.loglikelihood(ctx, ' False'))
        golds = np.array(golds)
        preds = np.array(preds)
        mcc = float(matthews_corrcoef(y_true=golds, y_pred=preds))
        return {
            "major": mcc,
            "minor": {"mcc": mcc},
            "higher_is_better": True,
        }


class MNLI(HFTask):
    DATASET_PATH = "glue"
    DATASET_NAME = "mnli"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def validation_docs(self):
        if self.has_validation_docs():
            return self.data["validation_matched"]

    def test_docs(self):
        if self.has_test_docs():
            return self.data["test_matched"]

    def doc_to_text(self, doc, include_target=True):
        text = "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format(
            doc["premise"],
            doc["hypothesis"],
        )
        if include_target:
            # True = entailment
            # False = contradiction
            # Neither = neutral
            text += " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework

        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework. 
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
            ctx = self.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )
            probs = np.array([
                lm.loglikelihood(ctx, ' True'),
                lm.loglikelihood(ctx, ' Neither'),
                lm.loglikelihood(ctx, ' False'),
            ])
            preds.append(np.argmax(probs))
        return simple_accuracy_metric(preds=preds, golds=golds)


class MNLIMismatched(MNLI):

    def validation_docs(self):
        if self.has_validation_docs():
            return self.data["validation_mismatched"]

    def test_docs(self):
        if self.has_test_docs():
            return self.data["test_mismatched"]


class MRPC(HFTask):
    DATASET_PATH = "glue"
    DATASET_NAME = "mrpc"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        return "Indicate if both sentences mean the same thing."

    def doc_to_text(self, doc, include_target=True):
        text = "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
            doc["sentence1"],
            doc["sentence2"],
        )
        if include_target:
            text += " {}".format(yesno(doc["label"]))
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework

        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework. 
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
            ctx = self.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )
            preds.append(lm.loglikelihood(ctx, 'yes') > lm.loglikelihood(ctx, 'no'))
        return get_accuracy_and_f1(preds=preds, golds=golds)

      
class RTE(HFTask):
    DATASET_PATH = "glue"
    DATASET_NAME = "rte"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def doc_to_text(self, doc, include_target=True):
        text = "{}\nquestion:\t{}\tTrue or False?\nanswer:".format(
            doc["sentence1"],
            doc["sentence2"],
        )
        if include_target:
            # 0 = entailment
            # 1 = not_entailment
            text += " {}".format({0: "True", 1: "False"}[doc["label"]])
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework

        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework. 
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
            ctx = self.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )
            preds.append(lm.loglikelihood(ctx, ' False') > lm.loglikelihood(ctx, ' True'))
        return simple_accuracy_metric(preds=preds, golds=golds)


class QNLI(HFTask):
    DATASET_PATH = "glue"
    DATASET_NAME = "qnli"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def doc_to_text(self, doc, include_target=True):
        text = "question:\t{}\nresponse:\t{}\nDoes this answer the question, Yes or No?:".format(
            doc["question"],
            doc["sentence"],
        )
        if include_target:
            # True = entailment
            # False = not entailment
            text += " {}".format({0: "Yes", 1: "No"}[doc["label"]])
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework

        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework. 
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
            ctx = self.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )
            preds.append(lm.loglikelihood(ctx, ' False') > lm.loglikelihood(ctx, ' True'))
        return simple_accuracy_metric(preds=preds, golds=golds)


class QQP(HFTask):
    DATASET_PATH = "glue"
    DATASET_NAME = "qqp"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        return "Indicate if both questions ask the same thing."

    def doc_to_text(self, doc, include_target=True):
        text = "question 1:\t{}\nquestion 2:\t{}\nanswer:".format(
            doc["question1"],
            doc["question2"],
        )
        if include_target:
            text += " {}".format(yesno(doc["label"]))
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework

        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework. 
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
            ctx = self.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )
            preds.append(lm.loglikelihood(ctx, ' yes') > lm.loglikelihood(ctx, ' no'))
        return get_accuracy_and_f1(preds=preds, golds=golds)


class STSB(HFTask):
    DATASET_PATH = "glue"
    DATASET_NAME = "stsb"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        return "Indicate if both sentences mean the same thing from a scale of 0-5, " \
           "where 5 means identical and 0 means unrelated."

    def doc_to_text(self, doc, include_target=True):
        text = "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
            doc["sentence1"],
            doc["sentence2"],
        )
        if include_target:
            text += " {}".format(doc["label"])
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework

        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework. 
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
            ctx = self.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )
            output = lm.generate(context=ctx, max_gen_length=5).strip()
            first_element = output.split()[0]
            if first_element.isnumeric():
                pred = max(min(float(first_element), 5.0), 0.0)
            else:
                pred = 2.5
            import pdb; pdb.set_trace()
            preds.append(pred)
        pearson_corr = float(pearsonr(preds, golds)[0])
        spearman_corr = float(spearmanr(preds, golds)[0])
        minor = {
            "pearson": pearson_corr,
            "spearmanr": spearman_corr,
            "corr": (pearson_corr + spearman_corr) / 2,
        }
        return {
            "major": minor["corr"],
            "minor": minor,
            "higher_is_better": True,
        }


class SST(HFTask):
    DATASET_PATH = "glue"
    DATASET_NAME = "sst2"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def fewshot_description(self):
        return "Indicate if each sentence is Positive or Negative."

    def doc_to_text(self, doc, include_target=True):
        text = "sentence:\t{}\t\nanswer:".format(
            doc["sentence"],
        )
        if include_target:
            text += " {}".format({1: "Positive", 0: "Negative"}[doc["label"]])
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework

        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework. 
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
            ctx = self.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )
            preds.append(lm.loglikelihood(ctx, ' Positive') > lm.loglikelihood(ctx, ' Negative'))
        return simple_accuracy_metric(preds=preds, golds=golds)


class WNLI(HFTask):
    DATASET_PATH = "glue"
    DATASET_NAME = "wnli"
    
    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def doc_to_text(self, doc, include_target=True):
        text = "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format(
            doc["sentence1"],
            doc["sentence2"],
        )
        if include_target:
            # True = entailment
            # False = contradiction
            # Neither = neutral
            text += " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])
        return text

    def evaluate(self, docs, lm, provide_description, num_fewshot):
        # TODO: Implement evaluation code using new framework

        # ***IMPORTANT***: this evaluation function needs to be rewritten for the new framework. 
        # For more info, check out the interface in base.py and the example BoolQ implementation in superglue.py. 
        # Remove this comment when the evaluation code is implemented.
        golds = [doc["label"] for doc in docs]
        preds = []
        for doc in tqdm_lib.tqdm(docs):
            ctx = self.fewshot_context(
                doc=doc,
                provide_description=provide_description,
                num_fewshot=num_fewshot,
            )
            probs = np.array([
                lm.loglikelihood(ctx, ' True'),
                lm.loglikelihood(ctx, ' Neither'),
                lm.loglikelihood(ctx, ' False'),
            ])
            preds.append(np.argmax(probs))
        return simple_accuracy_metric(preds=preds, golds=golds)