# MIT License

# Copyright (c) 2022 Ming Zhong

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import numpy as np
from nltk import sent_tokenize  # requires NLTK's "punkt" sentence tokenizer data (nltk.download("punkt"))

from .scorer import UniEvaluator
from .utils import add_question


class SumEvaluator:
    def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
        """Set up evaluator for text summarization"""
        self.scorer = UniEvaluator(
            model_name_or_path="MingZhong/unieval-sum" if model_name_or_path == "" else model_name_or_path,
            max_length=max_length,
            device=device,
            cache_dir=cache_dir,
        )
        self.task = "summarization"
        self.dimensions = ["coherence", "consistency", "fluency", "relevance"]

    def evaluate(self, data, category, dims=None, overall=True):
        """
        Get the scores of all the given dimensions.

        data: A list of samples. Each sample is a dict with a "source" document
              and a "system_output" summary; a "reference" summary is also
              required when evaluating relevance.

        category: The category to be evaluated.

        dims: A list of dimensions to be evaluated. If dims is None, SumEvaluator will evaluate
              four dimensions: coherence, consistency, fluency, relevance.

        overall: indicates whether the overall score is to be calculated.
                 Overall score can be customized to a combination of scores based on different
                 dimensions. The default here is the average score of all the given dimensions.
        """
        n_data = len(data)
        eval_scores = [{} for _ in range(n_data)]

        if dims is None:
            eval_dims = self.dimensions
        else:
            assert isinstance(dims, list)
            eval_dims = dims

        for dim in eval_dims:
            # Calculate average sentence-level scores for 'consistency' and 'fluency'
            if dim == "consistency" or dim == "fluency":
                src_list, output_list = [], []
                n_sents = []  # the number of sentences in each generated summary
                for i in range(n_data):
                    source = data[i]["source"]
                    system_outputs = sent_tokenize(data[i]["system_output"])
                    n_sents.append(len(system_outputs))
                    for j in range(len(system_outputs)):
                        src_list.append(source)
                        output_list.append(system_outputs[j])
                input_list = add_question(dimension=dim, output=output_list, src=src_list, task=self.task)
                sent_score = self.scorer.score(input_list, self.task, category, dim)

                # Get average score for each sample
                start_idx = 0
                score = []
                for cur_n_sent in n_sents:
                    # prevent denominator from being 0
                    score.append(sum(sent_score[start_idx : start_idx + cur_n_sent]) / (cur_n_sent + 1e-6))
                    start_idx += cur_n_sent

            # Calculate summary-level score for 'coherence' and 'relevance'
            elif dim == "coherence" or dim == "relevance":
                src_list, output_list, ref_list = [], [], []
                for i in range(n_data):
                    src_list.append(data[i]["source"])
                    output_list.append(data[i]["system_output"])
                    if dim == "relevance":
                        ref_list.append(data[i]["reference"])
                input_list = add_question(dimension=dim, output=output_list, src=src_list, ref=ref_list, task=self.task)
                score = self.scorer.score(input_list, self.task, category, dim)

            # Please customize other dimensions here for summarization
            else:
                raise NotImplementedError(
                    "The input format for this dimension is still undefined. Please customize it first."
                )

            for i in range(n_data):
                eval_scores[i][dim] = score[i]

        # Customize your overall score here.
        if overall:
            for i in range(n_data):
                eval_scores[i]["overall"] = np.mean(list(eval_scores[i].values()))

        return eval_scores
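
# A minimal usage sketch for SumEvaluator (illustrative only: the sample texts and
# the "category" value below are hypothetical; `category` is simply forwarded to
# the underlying scorer):
#
#   evaluator = SumEvaluator(model_name_or_path="")  # falls back to MingZhong/unieval-sum
#   data = [{
#       "source": "The full source document ...",
#       "system_output": "The model-generated summary ...",
#       "reference": "A human-written reference summary ...",  # used by 'relevance'
#   }]
#   scores = evaluator.evaluate(data, category="summarization")
#   # scores[0] -> {"coherence": ..., "consistency": ..., "fluency": ...,
#   #               "relevance": ..., "overall": ...}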


class DialogEvaluator:
    def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
        """Set up evaluator for dialogues"""
        self.scorer = UniEvaluator(
            model_name_or_path="MingZhong/unieval-dialog" if model_name_or_path == "" else model_name_or_path,
            max_length=max_length,
            device=device,
            cache_dir=cache_dir,
        )
        self.task = "dialogue"
        self.dimensions = ["naturalness", "coherence", "engagingness", "groundedness", "understandability"]

    def evaluate(self, data, category, dims=None, overall=True):
        """
        Get the scores of all the given dimensions.

        data: A list of samples. Each sample is a dict with the dialogue history
              ("source"), the grounding "context", and the model response
              ("system_output").

        category: The category to be evaluated.

        dims: A list of dimensions to be evaluated. If dims is None, DialogEvaluator will evaluate
              five dimensions: naturalness, coherence, engagingness, groundedness and understandability.

        overall: indicates whether the overall score is to be calculated.
                 Overall score can be customized to a combination of scores based on different
                 dimensions. The default here is the average score of all the given dimensions.
        """
        n_data = len(data)
        eval_scores = [{} for _ in range(n_data)]

        if dims is None:
            eval_dims = self.dimensions
        else:
            assert isinstance(dims, list)
            eval_dims = dims

        for dim in eval_dims:
            # Calculate summation score for 'engagingness'
            if dim == "engagingness":
                src_list, output_list, context_list = [], [], []
                n_sents = []  # the number of sentences in each generated response
                for i in range(n_data):
                    source = data[i]["source"]
                    context = data[i]["context"]
                    system_outputs = sent_tokenize(data[i]["system_output"])
                    n_sents.append(len(system_outputs))
                    for j in range(len(system_outputs)):
                        src_list.append(source)
                        context_list.append(context)
                        output_list.append(system_outputs[j])
                input_list = add_question(
                    dimension=dim, output=output_list, src=src_list, context=context_list, task=self.task
                )
                sent_score = self.scorer.score(input_list, self.task, category, dim)

                # Get the summation score for each sample
                start_idx = 0
                score = []
                for cur_n_sent in n_sents:
                    score.append(sum(sent_score[start_idx : start_idx + cur_n_sent]))
                    start_idx += cur_n_sent

            # Calculate turn-level score for other dimensions
            elif dim in ["naturalness", "coherence", "groundedness", "understandability"]:
                src_list, output_list, context_list = [], [], []
                for i in range(n_data):
                    src_list.append(data[i]["source"])
                    output_list.append(data[i]["system_output"])
                    context_list.append(data[i]["context"])
                input_list = add_question(
                    dimension=dim, output=output_list, src=src_list, context=context_list, task=self.task
                )
                score = self.scorer.score(input_list, self.task, category, dim)

            # Please customize other dimensions here for dialogues
            else:
                raise NotImplementedError(
                    "The input format for this dimension is still undefined. Please customize it first."
                )

            for i in range(n_data):
                eval_scores[i][dim] = score[i]

        # Customize your overall score here.
        if overall:
            for i in range(n_data):
                eval_scores[i]["overall"] = np.mean(list(eval_scores[i].values()))

        return eval_scores
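
# A minimal usage sketch for DialogEvaluator (illustrative only; the field
# contents and the "category" value are hypothetical):
#
#   evaluator = DialogEvaluator(model_name_or_path="")  # falls back to MingZhong/unieval-dialog
#   data = [{
#       "source": "The dialogue history ...",
#       "context": "The facts grounding the response ...",
#       "system_output": "The model-generated response ...",
#   }]
#   scores = evaluator.evaluate(data, category="dialogue")
#   # Note: 'engagingness' is a sum over sentences rather than an average, so it
#   # can exceed 1; the other dimensions are turn-level scores.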


class D2tEvaluator:
    def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
        """Set up evaluator for data-to-text"""
        self.scorer = UniEvaluator(
            model_name_or_path="MingZhong/unieval-sum" if model_name_or_path == "" else model_name_or_path,
            max_length=max_length,
            device=device,
            cache_dir=cache_dir,
        )
        self.task = "data2text"
        self.dimensions = ["naturalness", "informativeness"]

    def evaluate(self, data, category, dims=None, overall=True):
        """
        Get the scores of all the given dimensions.

        data: A list of samples. Each sample is a dict with a "system_output"
              utterance and a "reference" utterance.

        category: The category to be evaluated.

        dims: A list of dimensions to be evaluated. If dims is None, D2tEvaluator will evaluate
              two dimensions: naturalness and informativeness.

        overall: indicates whether the overall score is to be calculated.
                 Overall score can be customized to a combination of scores based on different
                 dimensions. The default here is the average score of all the given dimensions.
        """
        n_data = len(data)
        eval_scores = [{} for _ in range(n_data)]

        if dims is None:
            eval_dims = self.dimensions
        else:
            assert isinstance(dims, list)
            eval_dims = dims

        for dim in eval_dims:
            output_list, ref_list = [], []
            for i in range(n_data):
                output_list.append(data[i]["system_output"])
                ref_list.append(data[i]["reference"])

            input_list = add_question(dimension=dim, output=output_list, ref=ref_list, task=self.task)
            score = self.scorer.score(input_list, self.task, category, dim)

            for i in range(n_data):
                eval_scores[i][dim] = score[i]

        # Customize your overall score here.
        if overall:
            for i in range(n_data):
                eval_scores[i]["overall"] = np.mean(list(eval_scores[i].values()))

        return eval_scores
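
# A minimal usage sketch for D2tEvaluator (illustrative only; the sample texts
# and the "category" value are hypothetical):
#
#   evaluator = D2tEvaluator(model_name_or_path="")  # falls back to MingZhong/unieval-sum
#   data = [{
#       "system_output": "The model-generated description ...",
#       "reference": "The reference description ...",
#   }]
#   scores = evaluator.evaluate(data, category="data2text")
#   # scores[0] -> {"naturalness": ..., "informativeness": ..., "overall": ...}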


class FactEvaluator:
    def __init__(self, model_name_or_path, max_length=1024, device="cuda:0", cache_dir=None):
        """Set up evaluator for factual consistency detection"""
        self.scorer = UniEvaluator(
            model_name_or_path="MingZhong/unieval-fact" if model_name_or_path == "" else model_name_or_path,
            max_length=max_length,
            device=device,
            cache_dir=cache_dir,
        )
        self.task = "fact"
        self.dim = "consistency"

    def evaluate(self, data, category):
        """
        Get the factual consistency score (only 1 dimension for this task).

        data: A list of samples. Each sample is a dict with a "source" document
              and a "system_output" claim.

        category: The category to be evaluated.
        """
        n_data = len(data)
        eval_scores = [{} for _ in range(n_data)]

        # Calculate average sentence-level scores for factual consistency
        src_list, output_list = [], []
        n_sents = []  # the number of sentences in the claim
        for i in range(n_data):
            source = data[i]["source"]
            system_outputs = sent_tokenize(data[i]["system_output"])
            n_sents.append(len(system_outputs))
            for j in range(len(system_outputs)):
                src_list.append(source)
                output_list.append(system_outputs[j])
        input_list = add_question(dimension=self.dim, output=output_list, src=src_list, task=self.task)
        sent_score = self.scorer.score(input_list, self.task, category, self.dim)

        # Get average score for each sample
        start_idx = 0
        score = []
        for cur_n_sent in n_sents:
            # prevent denominator from being 0
            score.append(sum(sent_score[start_idx : start_idx + cur_n_sent]) / (cur_n_sent + 1e-6))
            start_idx += cur_n_sent

        for i in range(n_data):
            eval_scores[i][self.dim] = score[i]

        return eval_scores
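
# A minimal usage sketch for FactEvaluator (illustrative only). The claim is
# split into sentences and the per-sentence consistency scores are averaged:
#
#   evaluator = FactEvaluator(model_name_or_path="")  # falls back to MingZhong/unieval-fact
#   data = [{
#       "source": "The source document ...",
#       "system_output": "The claim to be verified ...",
#   }]
#   scores = evaluator.evaluate(data, category="fact")
#   # scores[0] -> {"consistency": ...}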


def get_evaluator(task, model_name_or_path="", max_length=1024, device="cuda:0", cache_dir=None):
    assert task in ["summarization", "dialogue", "data2text", "fact"]
    if task == "summarization":
        return SumEvaluator(
            model_name_or_path=model_name_or_path, max_length=max_length, device=device, cache_dir=cache_dir
        )
    elif task == "dialogue":
        return DialogEvaluator(
            model_name_or_path=model_name_or_path, max_length=max_length, device=device, cache_dir=cache_dir
        )
    elif task == "data2text":
        return D2tEvaluator(
            model_name_or_path=model_name_or_path, max_length=max_length, device=device, cache_dir=cache_dir
        )
    elif task == "fact":
        return FactEvaluator(
            model_name_or_path=model_name_or_path, max_length=max_length, device=device, cache_dir=cache_dir
        )
    else:
        raise NotImplementedError(
            "Other tasks are not implemented, please customize specific tasks here."
        )
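

# A minimal usage sketch for the factory function (illustrative only; with an
# empty model_name_or_path each evaluator falls back to its task-specific
# UniEval checkpoint):
#
#   evaluator = get_evaluator("summarization", device="cpu")
#   scores = evaluator.evaluate(data, category="summarization")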