text_classification.py 9.81 KB
Newer Older
1
import warnings
2
from typing import Dict
3

4
5
import numpy as np

6
from ..utils import ExplicitEnum, add_end_docstrings, is_tf_available, is_torch_available
7
from .base import PIPELINE_INIT_ARGS, GenericTensor, Pipeline
8
9
10
11
12
13
14
15
16


if is_tf_available():
    from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING

if is_torch_available():
    from ..models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING


17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
def sigmoid(_outputs):
    """Element-wise logistic sigmoid of the raw logits (maps R -> (0, 1))."""
    neg_exp = np.exp(-_outputs)
    return 1.0 / (1.0 + neg_exp)


def softmax(_outputs):
    """Numerically stable softmax over the last axis of `_outputs`."""
    # Shift by the per-row maximum first so `np.exp` cannot overflow;
    # the constant cancels out in the ratio, leaving the result unchanged.
    shifted = _outputs - np.max(_outputs, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=-1, keepdims=True)


class ClassificationFunction(ExplicitEnum):
    """Post-processing functions that can be applied to the model logits."""

    # Selected via the `function_to_apply` pipeline argument; string inputs are
    # upper-cased and looked up by name, so values stay lowercase here.
    SIGMOID = "sigmoid"
    SOFTMAX = "softmax"
    NONE = "none"


33
34
35
@add_end_docstrings(
    PIPELINE_INIT_ARGS,
    r"""
        return_all_scores (`bool`, *optional*, defaults to `False`):
            Whether to return all prediction scores or just the one of the predicted class.
        function_to_apply (`str`, *optional*, defaults to `"default"`):
            The function to apply to the model outputs in order to retrieve the scores. Accepts four different values:

            - `"default"`: if the model has a single label, will apply the sigmoid function on the output. If the model
              has several labels, will apply the softmax function on the output.
            - `"sigmoid"`: Applies the sigmoid function on the output.
            - `"softmax"`: Applies the softmax function on the output.
            - `"none"`: Does not apply any function on the output.
    """,
)
class TextClassificationPipeline(Pipeline):
    """
    Text classification pipeline using any `ModelForSequenceClassification`. See the [sequence classification
    examples](../task_summary#sequence-classification) for more information.

    Example:

    ```python
    >>> from transformers import pipeline

    >>> classifier = pipeline(model="distilbert-base-uncased-finetuned-sst-2-english")
    >>> classifier("This movie is disgustingly good !")
    [{'label': 'POSITIVE', 'score': 1.0}]

    >>> classifier("Director tried too much.")
    [{'label': 'NEGATIVE', 'score': 0.996}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This text classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"sentiment-analysis"` (for classifying sequences according to positive or negative sentiments).

    If multiple classification labels are available (`model.config.num_labels >= 2`), the pipeline will run a softmax
    over the results. If there is a single label, the pipeline will run a sigmoid over the result.

    The models that this pipeline can use are models that have been fine-tuned on a sequence classification task. See
    the up-to-date list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=text-classification).
    """

    # Legacy defaults, kept for backward compatibility with pre-`top_k` callers.
    return_all_scores = False
    function_to_apply = ClassificationFunction.NONE

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # Restrict usage to sequence-classification heads for the active framework.
        self.check_model_type(
            TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
            if self.framework == "tf"
            else MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
        )

    def _sanitize_parameters(self, return_all_scores=None, function_to_apply=None, top_k="", **tokenizer_kwargs):
        # Using "" as default argument because we're going to use `top_k=None` in user code to declare
        # "No top_k"
        preprocess_params = tokenizer_kwargs

        postprocess_params = {}
        # A `return_all_scores` stored on the model config acts as the fallback default.
        if hasattr(self.model.config, "return_all_scores") and return_all_scores is None:
            return_all_scores = self.model.config.return_all_scores

        if isinstance(top_k, int) or top_k is None:
            # `top_k` was passed explicitly: use the new (non-legacy) output format.
            postprocess_params["top_k"] = top_k
            postprocess_params["_legacy"] = False
        elif return_all_scores is not None:
            # Deprecated path: translate `return_all_scores` into an equivalent `top_k`.
            warnings.warn(
                "`return_all_scores` is now deprecated, if you want a similar functionality use `top_k=None` instead"
                " of `return_all_scores=True` or `top_k=1` instead of `return_all_scores=False`.",
                UserWarning,
            )
            if return_all_scores:
                postprocess_params["top_k"] = None
            else:
                postprocess_params["top_k"] = 1

        # Accept the function either as a string (case-insensitive) or as an enum member.
        if isinstance(function_to_apply, str):
            function_to_apply = ClassificationFunction[function_to_apply.upper()]

        if function_to_apply is not None:
            postprocess_params["function_to_apply"] = function_to_apply
        return preprocess_params, {}, postprocess_params

    def __call__(self, *args, **kwargs):
        """
        Classify the text(s) given as inputs.

        Args:
            args (`str` or `List[str]` or `Dict[str]`, or `List[Dict[str]]`):
                One or several texts to classify. In order to use text pairs for your classification, you can send a
                dictionary containing `{"text", "text_pair"}` keys, or a list of those.
            top_k (`int`, *optional*, defaults to `1`):
                How many results to return.
            function_to_apply (`str`, *optional*, defaults to `"default"`):
                The function to apply to the model outputs in order to retrieve the scores. Accepts four different
                values:

                If this argument is not specified, then it will apply the following functions according to the number
                of labels:

                - If the model has a single label, will apply the sigmoid function on the output.
                - If the model has several labels, will apply the softmax function on the output.

                Possible values are:

                - `"sigmoid"`: Applies the sigmoid function on the output.
                - `"softmax"`: Applies the softmax function on the output.
                - `"none"`: Does not apply any function on the output.

        Return:
            A list or a list of list of `dict`: Each result comes as list of dictionaries with the following keys:

            - **label** (`str`) -- The label predicted.
            - **score** (`float`) -- The corresponding probability.

            If `top_k` is used, one such dictionary is returned per label.
        """
        result = super().__call__(*args, **kwargs)
        # TODO try and retrieve it in a nicer way from _sanitize_parameters.
        _legacy = "top_k" not in kwargs
        if isinstance(args[0], str) and _legacy:
            # This pipeline is odd, and return a list when single item is run
            return [result]
        else:
            return result

    def preprocess(self, inputs, **tokenizer_kwargs) -> Dict[str, GenericTensor]:
        """Tokenize `inputs` (a string, a `{"text", "text_pair"}` dict, or a legacy nested list)."""
        return_tensors = self.framework
        if isinstance(inputs, dict):
            return self.tokenizer(**inputs, return_tensors=return_tensors, **tokenizer_kwargs)
        elif isinstance(inputs, list) and len(inputs) == 1 and isinstance(inputs[0], list) and len(inputs[0]) == 2:
            # It used to be valid to use a list of list of list for text pairs, keeping this path for BC
            return self.tokenizer(
                text=inputs[0][0], text_pair=inputs[0][1], return_tensors=return_tensors, **tokenizer_kwargs
            )
        elif isinstance(inputs, list):
            # This is likely an invalid usage of the pipeline attempting to pass text pairs.
            raise ValueError(
                "The pipeline received invalid inputs, if you are trying to send text pairs, you can try to send a"
                ' dictionary `{"text": "My text", "text_pair": "My pair"}` in order to send a text pair.'
            )
        return self.tokenizer(inputs, return_tensors=return_tensors, **tokenizer_kwargs)

    def _forward(self, model_inputs):
        """Run the sequence-classification model on the tokenized inputs."""
        return self.model(**model_inputs)

    def postprocess(self, model_outputs, function_to_apply=None, top_k=1, _legacy=True):
        # `_legacy` is used to determine if we're running the naked pipeline and in backward
        # compatibility mode, or if running the pipeline with `pipeline(..., top_k=1)` we're running
        # the more natural result containing the list.
        # Default value before `set_parameters`
        if function_to_apply is None:
            # Pick a default scoring function from the model configuration:
            # sigmoid for multi-label / single-logit models, softmax for
            # single-label multi-class models, otherwise whatever the config
            # declares (falling back to no transformation at all).
            if self.model.config.problem_type == "multi_label_classification" or self.model.config.num_labels == 1:
                function_to_apply = ClassificationFunction.SIGMOID
            elif self.model.config.problem_type == "single_label_classification" or self.model.config.num_labels > 1:
                function_to_apply = ClassificationFunction.SOFTMAX
            elif hasattr(self.model.config, "function_to_apply"):
                function_to_apply = self.model.config.function_to_apply
            else:
                function_to_apply = ClassificationFunction.NONE

        # Single item batch: take the first (only) row of logits.
        outputs = model_outputs["logits"][0]
        outputs = outputs.numpy()

        if function_to_apply == ClassificationFunction.SIGMOID:
            scores = sigmoid(outputs)
        elif function_to_apply == ClassificationFunction.SOFTMAX:
            scores = softmax(outputs)
        elif function_to_apply == ClassificationFunction.NONE:
            scores = outputs
        else:
            raise ValueError(f"Unrecognized `function_to_apply` argument: {function_to_apply}")

        if top_k == 1 and _legacy:
            # Legacy format: a single dict for the best label.
            return {"label": self.model.config.id2label[scores.argmax().item()], "score": scores.max().item()}

        dict_scores = [
            {"label": self.model.config.id2label[i], "score": score.item()} for i, score in enumerate(scores)
        ]
        if not _legacy:
            # New format: best-first ordering, truncated to `top_k` (None keeps all).
            dict_scores.sort(key=lambda x: x["score"], reverse=True)
            if top_k is not None:
                dict_scores = dict_scores[:top_k]
        return dict_scores