text_classification.py 7.12 KB
Newer Older
1
from typing import Dict
2

3
4
import numpy as np

5
from ..file_utils import ExplicitEnum, add_end_docstrings, is_tf_available, is_torch_available
6
from .base import PIPELINE_INIT_ARGS, GenericTensor, Pipeline
7
8
9
10
11
12
13
14
15


if is_tf_available():
    from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING

if is_torch_available():
    from ..models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING


16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
def sigmoid(_outputs):
    """Element-wise logistic sigmoid: 1 / (1 + exp(-x)), applied to a numpy array."""
    neg_exp = np.exp(-_outputs)
    return 1.0 / (1.0 + neg_exp)


def softmax(_outputs):
    """Softmax over the last axis of a numpy array.

    The inputs are shifted by their per-row maximum before exponentiation so that
    large logits do not overflow (the shift cancels out in the normalization).
    """
    stabilized = _outputs - np.max(_outputs, axis=-1, keepdims=True)
    exps = np.exp(stabilized)
    return exps / np.sum(exps, axis=-1, keepdims=True)


class ClassificationFunction(ExplicitEnum):
    """Post-processing functions that can be applied to the model's output logits."""

    SIGMOID = "sigmoid"
    SOFTMAX = "softmax"
    NONE = "none"


32
33
34
@add_end_docstrings(
    PIPELINE_INIT_ARGS,
    r"""
        return_all_scores (`bool`, *optional*, defaults to `False`):
            Whether to return all prediction scores or just the one of the predicted class.
        function_to_apply (`str`, *optional*, defaults to `"default"`):
            The function to apply to the model outputs in order to retrieve the scores. Accepts four different values:

            - `"default"`: if the model has a single label, will apply the sigmoid function on the output. If the
              model has several labels, will apply the softmax function on the output.
            - `"sigmoid"`: Applies the sigmoid function on the output.
            - `"softmax"`: Applies the softmax function on the output.
            - `"none"`: Does not apply any function on the output.
    """,
)
class TextClassificationPipeline(Pipeline):
    """
    Text classification pipeline using any `ModelForSequenceClassification`. See the [sequence classification
    examples](../task_summary#sequence-classification) for more information.

    This text classification pipeline can currently be loaded from [`pipeline`] using the following
    task identifier: `"sentiment-analysis"` (for classifying sequences according to positive or negative
    sentiments).

    If multiple classification labels are available (`model.config.num_labels >= 2`), the pipeline will run a
    softmax over the results. If there is a single label, the pipeline will run a sigmoid over the result.

    The models that this pipeline can use are models that have been fine-tuned on a sequence classification task. See
    the up-to-date list of available models on [huggingface.co/models](https://huggingface.co/models?filter=text-classification).
    """

    # Class-level defaults; per-call values come from `_sanitize_parameters`.
    return_all_scores = False
    function_to_apply = ClassificationFunction.NONE

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # Fail early if the wrapped model is not a sequence-classification architecture
        # for the active framework ("tf" or "pt").
        self.check_model_type(
            TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
            if self.framework == "tf"
            else MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
        )

    def _sanitize_parameters(self, return_all_scores=None, function_to_apply=None, **tokenizer_kwargs):
        """
        Split caller kwargs into (preprocess, forward, postprocess) parameter dicts.

        Unrecognized keyword arguments are forwarded verbatim to the tokenizer at
        preprocess time; `return_all_scores` / `function_to_apply` go to `postprocess`.
        """
        preprocess_params = tokenizer_kwargs

        postprocess_params = {}
        # The model config may provide a default for `return_all_scores`.
        if hasattr(self.model.config, "return_all_scores") and return_all_scores is None:
            return_all_scores = self.model.config.return_all_scores

        if return_all_scores is not None:
            postprocess_params["return_all_scores"] = return_all_scores

        # Accept the string spelling ("sigmoid"/"softmax"/"none") and normalize it to
        # the `ClassificationFunction` enum; raises KeyError on an unknown name.
        if isinstance(function_to_apply, str):
            function_to_apply = ClassificationFunction[function_to_apply.upper()]

        if function_to_apply is not None:
            postprocess_params["function_to_apply"] = function_to_apply
        return preprocess_params, {}, postprocess_params

    def __call__(self, *args, **kwargs):
        """
        Classify the text(s) given as inputs.

        Args:
            args (`str` or `List[str]`):
                One or several texts (or one list of prompts) to classify.
            return_all_scores (`bool`, *optional*, defaults to `False`):
                Whether to return scores for all labels.
            function_to_apply (`str`, *optional*, defaults to `"default"`):
                The function to apply to the model outputs in order to retrieve the scores. Accepts four different
                values:

                If this argument is not specified, then it will apply the following functions according to the number
                of labels:

                - If the model has a single label, will apply the sigmoid function on the output.
                - If the model has several labels, will apply the softmax function on the output.

                Possible values are:

                - `"sigmoid"`: Applies the sigmoid function on the output.
                - `"softmax"`: Applies the softmax function on the output.
                - `"none"`: Does not apply any function on the output.

        Return:
            A list or a list of list of `dict`: Each result comes as list of dictionaries with the following keys:

            - **label** (`str`) -- The label predicted.
            - **score** (`float`) -- The corresponding probability.

            If `self.return_all_scores=True`, one such dictionary is returned per label.
        """
        result = super().__call__(*args, **kwargs)
        # FIX: guard `args` before indexing — inputs may be supplied as a keyword
        # argument, in which case `args` is empty and `args[0]` raised IndexError.
        if args and isinstance(args[0], str):
            # This pipeline is odd, and return a list when single item is run
            return [result]
        else:
            return result

    def preprocess(self, inputs, **tokenizer_kwargs) -> Dict[str, GenericTensor]:
        """Tokenize `inputs` into tensors for the active framework (`self.framework`)."""
        return_tensors = self.framework
        return self.tokenizer(inputs, return_tensors=return_tensors, **tokenizer_kwargs)

    def _forward(self, model_inputs):
        """Run the model on the tokenized inputs and return its raw outputs."""
        return self.model(**model_inputs)

    def postprocess(self, model_outputs, function_to_apply=None, return_all_scores=False):
        """
        Convert raw model logits into `{"label", "score"}` dict(s).

        Args:
            model_outputs: model output mapping containing a `"logits"` tensor; only the
                first row is used (one item per postprocess call).
            function_to_apply: a `ClassificationFunction` member, or `None` to pick a
                default from the model config (problem type / number of labels).
            return_all_scores: if `True`, return one dict per label instead of only the
                top-scoring one.

        Raises:
            ValueError: if `function_to_apply` is not a recognized value.
        """
        if function_to_apply is None:
            # Choose a sensible default from the model config: sigmoid for
            # multi-label / single-logit models, softmax for single-label multi-class.
            if self.model.config.problem_type == "multi_label_classification" or self.model.config.num_labels == 1:
                function_to_apply = ClassificationFunction.SIGMOID
            elif self.model.config.problem_type == "single_label_classification" or self.model.config.num_labels > 1:
                function_to_apply = ClassificationFunction.SOFTMAX
            elif hasattr(self.model.config, "function_to_apply"):
                # FIX: dropped a redundant `and function_to_apply is None` here —
                # it is always true on this branch (nothing reassigned it above).
                function_to_apply = self.model.config.function_to_apply
            else:
                function_to_apply = ClassificationFunction.NONE

        # Single-item pipeline: take the first (only) row of logits and move it to numpy.
        outputs = model_outputs["logits"][0]
        outputs = outputs.numpy()

        if function_to_apply == ClassificationFunction.SIGMOID:
            scores = sigmoid(outputs)
        elif function_to_apply == ClassificationFunction.SOFTMAX:
            scores = softmax(outputs)
        elif function_to_apply == ClassificationFunction.NONE:
            scores = outputs
        else:
            raise ValueError(f"Unrecognized `function_to_apply` argument: {function_to_apply}")

        if return_all_scores:
            return [{"label": self.model.config.id2label[i], "score": score.item()} for i, score in enumerate(scores)]
        else:
            return {"label": self.model.config.id2label[scores.argmax().item()], "score": scores.max().item()}