Unverified commit 3f44a66c authored by Lysandre Debut, committed by GitHub

Return raw outputs in TextClassificationPipeline (#8328)



* Return raw outputs in TextClassificationPipeline

* Style

* Support for problem type

* Update src/transformers/pipelines/text_classification.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Apply Nicolas' comments

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent d4c834d2
In src/transformers/configuration_utils.py:

```diff
@@ -288,7 +288,7 @@ class PretrainedConfig(PushToHubMixin):
         allowed_problem_types = ("regression", "single_label_classification", "multi_label_classification")
         if self.problem_type is not None and self.problem_type not in allowed_problem_types:
             raise ValueError(
-                f"The config parameter `problem_type` wasnot understood: received {self.problem_type}"
+                f"The config parameter `problem_type` was not understood: received {self.problem_type}"
                 "but only 'regression', 'single_label_classification' and 'multi_label_classification' are valid."
             )
...
```
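For context, this check runs in `PretrainedConfig.__init__`, so an invalid `problem_type` fails fast at config construction. A minimal sketch (using `BertConfig` purely as an example subclass):

```python
from transformers import BertConfig

# Accepted: one of the three allowed problem types.
config = BertConfig(problem_type="multi_label_classification")

# Rejected: anything outside the allowed set raises the ValueError shown above.
try:
    BertConfig(problem_type="multilabel")
except ValueError as err:
    print(err)
```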
In src/transformers/pipelines/text_classification.py:

```diff
+from typing import Optional
+
 import numpy as np

-from ..file_utils import add_end_docstrings, is_tf_available, is_torch_available
+from ..file_utils import ExplicitEnum, add_end_docstrings, is_tf_available, is_torch_available
 from .base import PIPELINE_INIT_ARGS, Pipeline
@@ -11,11 +13,35 @@ if is_torch_available():
     from ..models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING


+def sigmoid(_outputs):
+    return 1.0 / (1.0 + np.exp(-_outputs))
+
+
+def softmax(_outputs):
+    maxes = np.max(_outputs, axis=-1, keepdims=True)
+    shifted_exp = np.exp(_outputs - maxes)
+    return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)
+
+
+class ClassificationFunction(ExplicitEnum):
+    SIGMOID = "sigmoid"
+    SOFTMAX = "softmax"
+    NONE = "none"
```
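These module-level helpers replace the inline math that previously lived in `__call__` (see the last hunk below). Subtracting the per-row max before exponentiating is the standard numerical-stability trick: it leaves the softmax value unchanged but prevents `np.exp` from overflowing on large logits. A standalone check (not part of the diff):

```python
import numpy as np

def softmax(_outputs):
    # Shift by the row max so the largest exponent is exp(0) = 1.
    maxes = np.max(_outputs, axis=-1, keepdims=True)
    shifted_exp = np.exp(_outputs - maxes)
    return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)

logits = np.array([[1000.0, 999.0]])
print(softmax(logits))  # [[0.73105858 0.26894142]]
# The old inline formulation overflows here:
# np.exp(logits) / np.exp(logits).sum(-1, keepdims=True)  ->  [[nan nan]]
```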
```diff
 @add_end_docstrings(
     PIPELINE_INIT_ARGS,
     r"""
         return_all_scores (:obj:`bool`, `optional`, defaults to :obj:`False`):
             Whether to return all prediction scores or just the one of the predicted class.
+        function_to_apply (:obj:`str`, `optional`, defaults to :obj:`"default"`):
+            The function to apply to the model outputs in order to retrieve the scores. Accepts four different
+            values:
+
+            - :obj:`"default"`: if the model has a single label, will apply the sigmoid function on the output. If
+              the model has several labels, will apply the softmax function on the output.
+            - :obj:`"sigmoid"`: Applies the sigmoid function on the output.
+            - :obj:`"softmax"`: Applies the softmax function on the output.
+            - :obj:`"none"`: Does not apply any function on the output.
     """,
 )
 class TextClassificationPipeline(Pipeline):
@@ -35,7 +61,9 @@ class TextClassificationPipeline(Pipeline):
     <https://huggingface.co/models?filter=text-classification>`__.
     """

-    def __init__(self, return_all_scores: bool = False, **kwargs):
+    task = "text-classification"
+
+    def __init__(self, return_all_scores: bool = None, function_to_apply: str = None, **kwargs):
         super().__init__(**kwargs)

         self.check_model_type(
@@ -44,15 +72,45 @@ class TextClassificationPipeline(Pipeline):
             else MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
         )

-        self.return_all_scores = return_all_scores
+        if hasattr(self.model.config, "return_all_scores") and return_all_scores is None:
+            return_all_scores = self.model.config.return_all_scores
+        if hasattr(self.model.config, "function_to_apply") and function_to_apply is None:
+            function_to_apply = self.model.config.function_to_apply
+
+        self.return_all_scores = return_all_scores if return_all_scores is not None else False
+        self.function_to_apply = function_to_apply if function_to_apply is not None else None
```
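The effective precedence introduced here is: argument passed to `__call__` (next hunk), then the `__init__` argument, then a value stored on the model config, then the hard-coded default. A toy restatement of that fallback chain (illustrative only, not code from the diff):

```python
def resolve(call_arg, init_arg, config_value, default):
    """Return the first explicitly set value, mirroring the pipeline's fallback chain."""
    for candidate in (call_arg, init_arg, config_value):
        if candidate is not None:
            return candidate
    return default

assert resolve(None, None, None, False) is False   # nothing set: built-in default
assert resolve(None, None, True, False) is True    # model config overrides the default
assert resolve(False, True, True, False) is False  # call-time argument wins over everything
```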
```diff
-    def __call__(self, *args, **kwargs):
+    def __call__(
+        self,
+        *args,
+        return_all_scores: Optional[bool] = None,
+        function_to_apply: Optional[ClassificationFunction] = None,
+        **kwargs
+    ):
         """
         Classify the text(s) given as inputs.

         Args:
             args (:obj:`str` or :obj:`List[str]`):
                 One or several texts (or one list of prompts) to classify.
+            return_all_scores (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Whether to return scores for all labels.
+            function_to_apply (:obj:`str`, `optional`, defaults to :obj:`"default"`):
+                The function to apply to the model outputs in order to retrieve the scores.
+
+                If this argument is not specified, then it will apply the following functions according to the
+                number of labels:
+
+                - If the model has a single label, will apply the sigmoid function on the output.
+                - If the model has several labels, will apply the softmax function on the output.
+
+                Possible values are:
+
+                - :obj:`"sigmoid"`: Applies the sigmoid function on the output.
+                - :obj:`"softmax"`: Applies the softmax function on the output.
+                - :obj:`"none"`: Does not apply any function on the output.

         Return:
             A list or a list of list of :obj:`dict`: Each result comes as list of dictionaries with the following keys:
```
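In practice the new keyword arguments can be set per call. A hedged usage sketch (the SST-2 checkpoint is just an example; any sequence-classification model works):

```python
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TextClassificationPipeline

name = "distilbert-base-uncased-finetuned-sst-2-english"  # example checkpoint
pipe = TextClassificationPipeline(
    model=AutoModelForSequenceClassification.from_pretrained(name),
    tokenizer=AutoTokenizer.from_pretrained(name),
)

pipe("I love this movie!")                            # two labels, so softmax by default
pipe("I love this movie!", function_to_apply="none")  # raw logits instead of probabilities
pipe("I love this movie!", return_all_scores=True)    # scores for every label, not just the top one
```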
```diff
@@ -64,11 +122,28 @@ class TextClassificationPipeline(Pipeline):
         """
         outputs = super().__call__(*args, **kwargs)

-        if self.model.config.num_labels == 1:
-            scores = 1.0 / (1.0 + np.exp(-outputs))
-        else:
-            scores = np.exp(outputs) / np.exp(outputs).sum(-1, keepdims=True)
+        return_all_scores = return_all_scores if return_all_scores is not None else self.return_all_scores
+        function_to_apply = function_to_apply if function_to_apply is not None else self.function_to_apply
+
+        if function_to_apply is None:
+            if self.model.config.problem_type == "multi_label_classification" or self.model.config.num_labels == 1:
+                function_to_apply = ClassificationFunction.SIGMOID
+            elif self.model.config.problem_type == "single_label_classification" or self.model.config.num_labels > 1:
+                function_to_apply = ClassificationFunction.SOFTMAX
+
+        if isinstance(function_to_apply, str):
+            function_to_apply = ClassificationFunction[function_to_apply.upper()]
+
+        if function_to_apply == ClassificationFunction.SIGMOID:
+            scores = sigmoid(outputs)
+        elif function_to_apply == ClassificationFunction.SOFTMAX:
+            scores = softmax(outputs)
+        elif function_to_apply == ClassificationFunction.NONE:
+            scores = outputs
+        else:
+            raise ValueError(f"Unrecognized `function_to_apply` argument: {function_to_apply}")

-        if self.return_all_scores:
+        if return_all_scores:
             return [
                 [{"label": self.model.config.id2label[i], "score": score.item()} for i, score in enumerate(item)]
                 for item in scores
...
```
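To summarize the new defaulting logic: the function is chosen from `problem_type` first, falling back to `num_labels`. A rough restatement (my summary, not code from the diff; since `num_labels >= 1`, one branch always fires):

```python
def default_function(problem_type, num_labels):
    if problem_type == "multi_label_classification" or num_labels == 1:
        return "sigmoid"  # independent per-label probabilities
    return "softmax"      # one distribution over mutually exclusive labels

assert default_function(None, 1) == "sigmoid"
assert default_function(None, 3) == "softmax"
assert default_function("multi_label_classification", 3) == "sigmoid"
```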