Unverified commit 3f44a66c authored by Lysandre Debut, committed by GitHub

Return raw outputs in TextClassificationPipeline (#8328)



* Return raw outputs in TextClassificationPipeline

* Style

* Support for problem type

* Update src/transformers/pipelines/text_classification.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Apply Nicolas' comments

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent d4c834d2
In src/transformers/configuration_utils.py:

```diff
@@ -288,7 +288,7 @@ class PretrainedConfig(PushToHubMixin):
         allowed_problem_types = ("regression", "single_label_classification", "multi_label_classification")
         if self.problem_type is not None and self.problem_type not in allowed_problem_types:
             raise ValueError(
-                f"The config parameter `problem_type` wasnot understood: received {self.problem_type}"
+                f"The config parameter `problem_type` was not understood: received {self.problem_type}"
                 "but only 'regression', 'single_label_classification' and 'multi_label_classification' are valid."
             )
...
```
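For context, this check runs in `PretrainedConfig.__init__`, so an invalid `problem_type` fails fast at config construction. A minimal sketch (using `BertConfig` purely as an example subclass):

```python
from transformers import BertConfig

# Accepted: one of the three allowed problem types.
config = BertConfig(problem_type="multi_label_classification")

# Rejected: anything outside the allowed set raises the ValueError shown above.
try:
    BertConfig(problem_type="multilabel")
except ValueError as err:
    print(err)
```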
In src/transformers/pipelines/text_classification.py:

```diff
+from typing import Optional
+
 import numpy as np

-from ..file_utils import add_end_docstrings, is_tf_available, is_torch_available
+from ..file_utils import ExplicitEnum, add_end_docstrings, is_tf_available, is_torch_available
 from .base import PIPELINE_INIT_ARGS, Pipeline
@@ -11,11 +13,35 @@ if is_torch_available():
     from ..models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING


+def sigmoid(_outputs):
+    return 1.0 / (1.0 + np.exp(-_outputs))
+
+
+def softmax(_outputs):
+    maxes = np.max(_outputs, axis=-1, keepdims=True)
+    shifted_exp = np.exp(_outputs - maxes)
+    return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)
+
+
+class ClassificationFunction(ExplicitEnum):
+    SIGMOID = "sigmoid"
+    SOFTMAX = "softmax"
+    NONE = "none"
```
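These module-level helpers replace the inline math that previously lived in `__call__` (see the last hunk below). Subtracting the per-row max before exponentiating is the standard numerical-stability trick: it leaves the softmax value unchanged but prevents `np.exp` from overflowing on large logits. A standalone check (not part of the diff):

```python
import numpy as np

def softmax(_outputs):
    # Shift by the row max so the largest exponent is exp(0) = 1.
    maxes = np.max(_outputs, axis=-1, keepdims=True)
    shifted_exp = np.exp(_outputs - maxes)
    return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)

logits = np.array([[1000.0, 999.0]])
print(softmax(logits))  # [[0.73105858 0.26894142]]
# The old inline formulation overflows here:
# np.exp(logits) / np.exp(logits).sum(-1, keepdims=True)  ->  [[nan nan]]
```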
```diff
 @add_end_docstrings(
     PIPELINE_INIT_ARGS,
     r"""
         return_all_scores (:obj:`bool`, `optional`, defaults to :obj:`False`):
             Whether to return all prediction scores or just the one of the predicted class.
+        function_to_apply (:obj:`str`, `optional`, defaults to :obj:`"default"`):
+            The function to apply to the model outputs in order to retrieve the scores. Accepts four different
+            values:
+
+            - :obj:`"default"`: if the model has a single label, will apply the sigmoid function on the output. If
+              the model has several labels, will apply the softmax function on the output.
+            - :obj:`"sigmoid"`: Applies the sigmoid function on the output.
+            - :obj:`"softmax"`: Applies the softmax function on the output.
+            - :obj:`"none"`: Does not apply any function on the output.
     """,
 )
 class TextClassificationPipeline(Pipeline):
@@ -35,7 +61,9 @@ class TextClassificationPipeline(Pipeline):
     <https://huggingface.co/models?filter=text-classification>`__.
     """

-    def __init__(self, return_all_scores: bool = False, **kwargs):
+    task = "text-classification"
+
+    def __init__(self, return_all_scores: bool = None, function_to_apply: str = None, **kwargs):
         super().__init__(**kwargs)

         self.check_model_type(
@@ -44,15 +72,45 @@ class TextClassificationPipeline(Pipeline):
             else MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
         )

-        self.return_all_scores = return_all_scores
+        if hasattr(self.model.config, "return_all_scores") and return_all_scores is None:
+            return_all_scores = self.model.config.return_all_scores
+        if hasattr(self.model.config, "function_to_apply") and function_to_apply is None:
+            function_to_apply = self.model.config.function_to_apply
+
+        self.return_all_scores = return_all_scores if return_all_scores is not None else False
+        self.function_to_apply = function_to_apply if function_to_apply is not None else None
```
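The effective precedence introduced here is: argument passed to `__call__` (next hunk), then the `__init__` argument, then a value stored on the model config, then the hard-coded default. A toy restatement of that fallback chain (illustrative only, not code from the diff):

```python
def resolve(call_arg, init_arg, config_value, default):
    """Return the first explicitly set value, mirroring the pipeline's fallback chain."""
    for candidate in (call_arg, init_arg, config_value):
        if candidate is not None:
            return candidate
    return default

assert resolve(None, None, None, False) is False   # nothing set: built-in default
assert resolve(None, None, True, False) is True    # model config overrides the default
assert resolve(False, True, True, False) is False  # call-time argument wins over everything
```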
```diff
-    def __call__(self, *args, **kwargs):
+    def __call__(
+        self,
+        *args,
+        return_all_scores: Optional[bool] = None,
+        function_to_apply: Optional[ClassificationFunction] = None,
+        **kwargs
+    ):
         """
         Classify the text(s) given as inputs.

         Args:
             args (:obj:`str` or :obj:`List[str]`):
                 One or several texts (or one list of prompts) to classify.
+            return_all_scores (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Whether to return scores for all labels.
+            function_to_apply (:obj:`str`, `optional`, defaults to :obj:`"default"`):
+                The function to apply to the model outputs in order to retrieve the scores.
+
+                If this argument is not specified, then it will apply the following functions according to the
+                number of labels:
+
+                - If the model has a single label, will apply the sigmoid function on the output.
+                - If the model has several labels, will apply the softmax function on the output.
+
+                Possible values are:
+
+                - :obj:`"sigmoid"`: Applies the sigmoid function on the output.
+                - :obj:`"softmax"`: Applies the softmax function on the output.
+                - :obj:`"none"`: Does not apply any function on the output.

         Return:
             A list or a list of list of :obj:`dict`: Each result comes as list of dictionaries with the following keys:
```
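In practice the new keyword arguments can be set per call. A hedged usage sketch (the SST-2 checkpoint is just an example; any sequence-classification model works):

```python
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TextClassificationPipeline

name = "distilbert-base-uncased-finetuned-sst-2-english"  # example checkpoint
pipe = TextClassificationPipeline(
    model=AutoModelForSequenceClassification.from_pretrained(name),
    tokenizer=AutoTokenizer.from_pretrained(name),
)

pipe("I love this movie!")                            # two labels, so softmax by default
pipe("I love this movie!", function_to_apply="none")  # raw logits instead of probabilities
pipe("I love this movie!", return_all_scores=True)    # scores for every label, not just the top one
```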
```diff
@@ -64,11 +122,28 @@ class TextClassificationPipeline(Pipeline):
         """
         outputs = super().__call__(*args, **kwargs)

-        if self.model.config.num_labels == 1:
-            scores = 1.0 / (1.0 + np.exp(-outputs))
-        else:
-            scores = np.exp(outputs) / np.exp(outputs).sum(-1, keepdims=True)
+        return_all_scores = return_all_scores if return_all_scores is not None else self.return_all_scores
+        function_to_apply = function_to_apply if function_to_apply is not None else self.function_to_apply
+
+        if function_to_apply is None:
+            if self.model.config.problem_type == "multi_label_classification" or self.model.config.num_labels == 1:
+                function_to_apply = ClassificationFunction.SIGMOID
+            elif self.model.config.problem_type == "single_label_classification" or self.model.config.num_labels > 1:
+                function_to_apply = ClassificationFunction.SOFTMAX
+
+        if isinstance(function_to_apply, str):
+            function_to_apply = ClassificationFunction[function_to_apply.upper()]
+
+        if function_to_apply == ClassificationFunction.SIGMOID:
+            scores = sigmoid(outputs)
+        elif function_to_apply == ClassificationFunction.SOFTMAX:
+            scores = softmax(outputs)
+        elif function_to_apply == ClassificationFunction.NONE:
+            scores = outputs
+        else:
+            raise ValueError(f"Unrecognized `function_to_apply` argument: {function_to_apply}")

-        if self.return_all_scores:
+        if return_all_scores:
             return [
                 [{"label": self.model.config.id2label[i], "score": score.item()} for i, score in enumerate(item)]
                 for item in scores
...
```
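To summarize the new defaulting logic: the function is chosen from `problem_type` first, falling back to `num_labels`. A rough restatement (my summary, not code from the diff; since `num_labels >= 1`, one branch always fires):

```python
def default_function(problem_type, num_labels):
    if problem_type == "multi_label_classification" or num_labels == 1:
        return "sigmoid"  # independent per-label probabilities
    return "softmax"      # one distribution over mutually exclusive labels

assert default_function(None, 1) == "sigmoid"
assert default_function(None, 3) == "softmax"
assert default_function("multi_label_classification", 3) == "sigmoid"
```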