Unverified Commit ba3264b4 authored by amyeroberts's avatar amyeroberts Committed by GitHub
Browse files

Image Feature Extraction pipeline (#28216)



* Draft pipeline

* Fixup

* Fix docstrings

* Update doctest

* Update pipeline_model_mapping

* Update docstring

* Update tests

* Update src/transformers/pipelines/image_feature_extraction.py
Co-authored-by: default avatarOmar Sanseviero <osanseviero@gmail.com>

* Fix docstrings - review comments

* Remove pipeline mapping for composite vision models

* Add to pipeline tests

* Remove for flava (multimodal)

* safe pil import

* Add requirements for pipeline run

* Account for super slow efficientnet

* Review comments

* Fix tests

* Swap order of kwargs

* Use build_pipeline_init_args

* Add back FE pipeline for Vilt

* Include image_processor_kwargs in docstring

* Mark test as flaky

* Update TODO

* Update tests/pipelines/test_pipelines_image_feature_extraction.py
Co-authored-by: default avatarArthur <48595927+ArthurZucker@users.noreply.github.com>

* Add license header

---------
Co-authored-by: default avatarOmar Sanseviero <osanseviero@gmail.com>
Co-authored-by: default avatarArthur <48595927+ArthurZucker@users.noreply.github.com>
parent 7addc934
...@@ -469,6 +469,12 @@ Pipelines available for multimodal tasks include the following. ...@@ -469,6 +469,12 @@ Pipelines available for multimodal tasks include the following.
- __call__ - __call__
- all - all
### ImageFeatureExtractionPipeline
[[autodoc]] ImageFeatureExtractionPipeline
- __call__
- all
### ImageToTextPipeline ### ImageToTextPipeline
[[autodoc]] ImageToTextPipeline [[autodoc]] ImageToTextPipeline
......
...@@ -25,7 +25,7 @@ Recognition、Masked Language Modeling、Sentiment Analysis、Feature Extraction ...@@ -25,7 +25,7 @@ Recognition、Masked Language Modeling、Sentiment Analysis、Feature Extraction
パイプラインの抽象化には2つのカテゴリーがある: パイプラインの抽象化には2つのカテゴリーがある:
- [`pipeline`] は、他のすべてのパイプラインをカプセル化する最も強力なオブジェクトです。 - [`pipeline`] は、他のすべてのパイプラインをカプセル化する最も強力なオブジェクトです。
- タスク固有のパイプラインは、[オーディオ](#audio)[コンピューター ビジョン](#computer-vision)[自然言語処理](#natural-language-processing)、および [マルチモーダル](#multimodal) タスクで使用できます。 - タスク固有のパイプラインは、[オーディオ](#audio)[コンピューター ビジョン](#computer-vision)[自然言語処理](#natural-language-processing)、および [マルチモーダル](#multimodal) タスクで使用できます。
## The pipeline abstraction ## The pipeline abstraction
...@@ -477,6 +477,12 @@ my_pipeline = pipeline(model="xxxx", pipeline_class=MyPipeline) ...@@ -477,6 +477,12 @@ my_pipeline = pipeline(model="xxxx", pipeline_class=MyPipeline)
- __call__ - __call__
- all - all
### ImageFeatureExtractionPipeline
[[autodoc]] ImageFeatureExtractionPipeline
- __call__
- all
### ImageToTextPipeline ### ImageToTextPipeline
[[autodoc]] ImageToTextPipeline [[autodoc]] ImageToTextPipeline
......
...@@ -435,7 +435,7 @@ See [`TokenClassificationPipeline`] for all details. ...@@ -435,7 +435,7 @@ See [`TokenClassificationPipeline`] for all details.
- __call__ - __call__
- all - all
## 多模态 ## 多模态
可用于多模态任务的pipeline包括以下几种。 可用于多模态任务的pipeline包括以下几种。
...@@ -451,6 +451,12 @@ See [`TokenClassificationPipeline`] for all details. ...@@ -451,6 +451,12 @@ See [`TokenClassificationPipeline`] for all details.
- __call__ - __call__
- all - all
### ImageFeatureExtractionPipeline
[[autodoc]] ImageFeatureExtractionPipeline
- __call__
- all
### ImageToTextPipeline ### ImageToTextPipeline
[[autodoc]] ImageToTextPipeline [[autodoc]] ImageToTextPipeline
......
...@@ -973,6 +973,7 @@ _import_structure = { ...@@ -973,6 +973,7 @@ _import_structure = {
"FeatureExtractionPipeline", "FeatureExtractionPipeline",
"FillMaskPipeline", "FillMaskPipeline",
"ImageClassificationPipeline", "ImageClassificationPipeline",
"ImageFeatureExtractionPipeline",
"ImageSegmentationPipeline", "ImageSegmentationPipeline",
"ImageToImagePipeline", "ImageToImagePipeline",
"ImageToTextPipeline", "ImageToTextPipeline",
...@@ -5709,6 +5710,7 @@ if TYPE_CHECKING: ...@@ -5709,6 +5710,7 @@ if TYPE_CHECKING:
FeatureExtractionPipeline, FeatureExtractionPipeline,
FillMaskPipeline, FillMaskPipeline,
ImageClassificationPipeline, ImageClassificationPipeline,
ImageFeatureExtractionPipeline,
ImageSegmentationPipeline, ImageSegmentationPipeline,
ImageToImagePipeline, ImageToImagePipeline,
ImageToTextPipeline, ImageToTextPipeline,
......
...@@ -66,6 +66,7 @@ from .document_question_answering import DocumentQuestionAnsweringPipeline ...@@ -66,6 +66,7 @@ from .document_question_answering import DocumentQuestionAnsweringPipeline
from .feature_extraction import FeatureExtractionPipeline from .feature_extraction import FeatureExtractionPipeline
from .fill_mask import FillMaskPipeline from .fill_mask import FillMaskPipeline
from .image_classification import ImageClassificationPipeline from .image_classification import ImageClassificationPipeline
from .image_feature_extraction import ImageFeatureExtractionPipeline
from .image_segmentation import ImageSegmentationPipeline from .image_segmentation import ImageSegmentationPipeline
from .image_to_image import ImageToImagePipeline from .image_to_image import ImageToImagePipeline
from .image_to_text import ImageToTextPipeline from .image_to_text import ImageToTextPipeline
...@@ -362,6 +363,18 @@ SUPPORTED_TASKS = { ...@@ -362,6 +363,18 @@ SUPPORTED_TASKS = {
}, },
"type": "image", "type": "image",
}, },
"image-feature-extraction": {
"impl": ImageFeatureExtractionPipeline,
"tf": (TFAutoModel,) if is_tf_available() else (),
"pt": (AutoModel,) if is_torch_available() else (),
"default": {
"model": {
"pt": ("google/vit-base-patch16-224", "29e7a1e183"),
"tf": ("google/vit-base-patch16-224", "29e7a1e183"),
}
},
"type": "image",
},
"image-segmentation": { "image-segmentation": {
"impl": ImageSegmentationPipeline, "impl": ImageSegmentationPipeline,
"tf": (), "tf": (),
...@@ -500,6 +513,7 @@ def check_task(task: str) -> Tuple[str, Dict, Any]: ...@@ -500,6 +513,7 @@ def check_task(task: str) -> Tuple[str, Dict, Any]:
- `"feature-extraction"` - `"feature-extraction"`
- `"fill-mask"` - `"fill-mask"`
- `"image-classification"` - `"image-classification"`
- `"image-feature-extraction"`
- `"image-segmentation"` - `"image-segmentation"`
- `"image-to-text"` - `"image-to-text"`
- `"image-to-image"` - `"image-to-image"`
...@@ -586,6 +600,7 @@ def pipeline( ...@@ -586,6 +600,7 @@ def pipeline(
- `"feature-extraction"`: will return a [`FeatureExtractionPipeline`]. - `"feature-extraction"`: will return a [`FeatureExtractionPipeline`].
- `"fill-mask"`: will return a [`FillMaskPipeline`]:. - `"fill-mask"`: will return a [`FillMaskPipeline`]:.
- `"image-classification"`: will return a [`ImageClassificationPipeline`]. - `"image-classification"`: will return a [`ImageClassificationPipeline`].
- `"image-feature-extraction"`: will return an [`ImageFeatureExtractionPipeline`].
- `"image-segmentation"`: will return a [`ImageSegmentationPipeline`]. - `"image-segmentation"`: will return a [`ImageSegmentationPipeline`].
- `"image-to-image"`: will return a [`ImageToImagePipeline`]. - `"image-to-image"`: will return a [`ImageToImagePipeline`].
- `"image-to-text"`: will return a [`ImageToTextPipeline`]. - `"image-to-text"`: will return a [`ImageToTextPipeline`].
......
...@@ -14,7 +14,7 @@ from .base import GenericTensor, Pipeline, build_pipeline_init_args ...@@ -14,7 +14,7 @@ from .base import GenericTensor, Pipeline, build_pipeline_init_args
) )
class FeatureExtractionPipeline(Pipeline): class FeatureExtractionPipeline(Pipeline):
""" """
Feature extraction pipeline using no model head. This pipeline extracts the hidden states from the base Feature extraction pipeline uses no model head. This pipeline extracts the hidden states from the base
transformer, which can be used as features in downstream tasks. transformer, which can be used as features in downstream tasks.
Example: Example:
......
from typing import Dict
from ..utils import add_end_docstrings, is_vision_available
from .base import GenericTensor, Pipeline, build_pipeline_init_args
if is_vision_available():
from ..image_utils import load_image
@add_end_docstrings(
    build_pipeline_init_args(has_image_processor=True),
    """
        image_processor_kwargs (`dict`, *optional*):
            Additional dictionary of keyword arguments passed along to the image processor e.g.
            {"size": {"height": 100, "width": 100}}
    """,
)
class ImageFeatureExtractionPipeline(Pipeline):
    """
    Image feature extraction pipeline uses no model head. This pipeline extracts the hidden states from the base
    transformer, which can be used as features in downstream tasks.

    Example:

    ```python
    >>> from transformers import pipeline

    >>> extractor = pipeline(model="google/vit-base-patch16-224", task="image-feature-extraction")
    >>> result = extractor("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", return_tensors=True)
    >>> result.shape  # This is a tensor of shape [1, sequence_length, hidden_dimension] representing the input image.
    torch.Size([1, 197, 768])
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This image feature extraction pipeline can currently be loaded from [`pipeline`] using the task identifier:
    `"image-feature-extraction"`.

    All vision models may be used for this pipeline. See a list of all models, including community-contributed models on
    [huggingface.co/models](https://huggingface.co/models).
    """

    def _sanitize_parameters(self, image_processor_kwargs=None, return_tensors=None, **kwargs):
        """
        Split call-time kwargs into (preprocess, forward, postprocess) parameter dicts.

        `image_processor_kwargs` is forwarded to the image processor, `return_tensors` controls the
        output format, and an optional `timeout` bounds image fetching.
        """
        # Copy the user-supplied dict: we may insert "timeout" below and must not
        # mutate an argument the caller still owns.
        preprocess_params = {} if image_processor_kwargs is None else dict(image_processor_kwargs)
        postprocess_params = {"return_tensors": return_tensors} if return_tensors is not None else {}

        if "timeout" in kwargs:
            preprocess_params["timeout"] = kwargs["timeout"]

        return preprocess_params, {}, postprocess_params

    def preprocess(self, image, timeout=None, **image_processor_kwargs) -> Dict[str, GenericTensor]:
        """Load the image (URL, local path or PIL image) and run it through the image processor."""
        image = load_image(image, timeout=timeout)
        model_inputs = self.image_processor(image, return_tensors=self.framework, **image_processor_kwargs)
        return model_inputs

    def _forward(self, model_inputs):
        """Forward the processed inputs through the bare (headless) model."""
        model_outputs = self.model(**model_inputs)
        return model_outputs

    def postprocess(self, model_outputs, return_tensors=False):
        """Return the first model output tensor, as a framework tensor or as nested Python lists."""
        # [0] is the first available tensor: logits or last_hidden_state.
        if return_tensors:
            return model_outputs[0]
        if self.framework == "pt":
            return model_outputs[0].tolist()
        elif self.framework == "tf":
            return model_outputs[0].numpy().tolist()
        # Fail loudly instead of silently returning None for an unknown framework.
        raise ValueError(f"Unsupported framework: {self.framework}")

    def __call__(self, *args, **kwargs):
        """
        Extract the features of the input(s).

        Args:
            images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
                The pipeline handles three types of images:

                - A string containing a http link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                The pipeline accepts either a single image or a batch of images, which must then be passed as a string.
                Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL
                images.
            timeout (`float`, *optional*, defaults to None):
                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is used and
                the call may block forever.
        Return:
            A nested list of `float`: The features computed by the model.
        """
        return super().__call__(*args, **kwargs)
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Union from typing import List, Union
from ..utils import ( from ..utils import (
......
...@@ -242,7 +242,7 @@ class BeitModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): ...@@ -242,7 +242,7 @@ class BeitModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
) )
pipeline_model_mapping = ( pipeline_model_mapping = (
{ {
"feature-extraction": BeitModel, "image-feature-extraction": BeitModel,
"image-classification": BeitForImageClassification, "image-classification": BeitForImageClassification,
"image-segmentation": BeitForSemanticSegmentation, "image-segmentation": BeitForSemanticSegmentation,
} }
......
...@@ -162,7 +162,7 @@ class BitModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): ...@@ -162,7 +162,7 @@ class BitModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (BitModel, BitForImageClassification, BitBackbone) if is_torch_available() else () all_model_classes = (BitModel, BitForImageClassification, BitBackbone) if is_torch_available() else ()
pipeline_model_mapping = ( pipeline_model_mapping = (
{"feature-extraction": BitModel, "image-classification": BitForImageClassification} {"image-feature-extraction": BitModel, "image-classification": BitForImageClassification}
if is_torch_available() if is_torch_available()
else {} else {}
) )
......
...@@ -429,7 +429,10 @@ class BlipModelTester: ...@@ -429,7 +429,10 @@ class BlipModelTester:
class BlipModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): class BlipModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (BlipModel,) if is_torch_available() else () all_model_classes = (BlipModel,) if is_torch_available() else ()
pipeline_model_mapping = ( pipeline_model_mapping = (
{"feature-extraction": BlipModel, "image-to-text": BlipForConditionalGeneration} {
"feature-extraction": BlipModel,
"image-to-text": BlipForConditionalGeneration,
}
if is_torch_available() if is_torch_available()
else {} else {}
) )
......
...@@ -477,7 +477,9 @@ class CLIPModelTester: ...@@ -477,7 +477,9 @@ class CLIPModelTester:
@require_torch @require_torch
class CLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): class CLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (CLIPModel,) if is_torch_available() else () all_model_classes = (CLIPModel,) if is_torch_available() else ()
pipeline_model_mapping = {"feature-extraction": CLIPModel} if is_torch_available() else {} pipeline_model_mapping = (
{"feature-extraction": CLIPModel, "image-feature-extraction": CLIPVisionModel} if is_torch_available() else {}
)
fx_compatible = True fx_compatible = True
test_head_masking = False test_head_masking = False
test_pruning = False test_pruning = False
......
...@@ -185,7 +185,7 @@ class ConditionalDetrModelTest(ModelTesterMixin, GenerationTesterMixin, Pipeline ...@@ -185,7 +185,7 @@ class ConditionalDetrModelTest(ModelTesterMixin, GenerationTesterMixin, Pipeline
else () else ()
) )
pipeline_model_mapping = ( pipeline_model_mapping = (
{"feature-extraction": ConditionalDetrModel, "object-detection": ConditionalDetrForObjectDetection} {"image-feature-extraction": ConditionalDetrModel, "object-detection": ConditionalDetrForObjectDetection}
if is_torch_available() if is_torch_available()
else {} else {}
) )
......
...@@ -172,7 +172,7 @@ class ConvNextModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase ...@@ -172,7 +172,7 @@ class ConvNextModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
else () else ()
) )
pipeline_model_mapping = ( pipeline_model_mapping = (
{"feature-extraction": ConvNextModel, "image-classification": ConvNextForImageClassification} {"image-feature-extraction": ConvNextModel, "image-classification": ConvNextForImageClassification}
if is_torch_available() if is_torch_available()
else {} else {}
) )
......
...@@ -180,7 +180,7 @@ class ConvNextV2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa ...@@ -180,7 +180,7 @@ class ConvNextV2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
else () else ()
) )
pipeline_model_mapping = ( pipeline_model_mapping = (
{"feature-extraction": ConvNextV2Model, "image-classification": ConvNextV2ForImageClassification} {"image-feature-extraction": ConvNextV2Model, "image-classification": ConvNextV2ForImageClassification}
if is_torch_available() if is_torch_available()
else {} else {}
) )
......
...@@ -151,7 +151,7 @@ class CvtModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): ...@@ -151,7 +151,7 @@ class CvtModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (CvtModel, CvtForImageClassification) if is_torch_available() else () all_model_classes = (CvtModel, CvtForImageClassification) if is_torch_available() else ()
pipeline_model_mapping = ( pipeline_model_mapping = (
{"feature-extraction": CvtModel, "image-classification": CvtForImageClassification} {"image-feature-extraction": CvtModel, "image-classification": CvtForImageClassification}
if is_torch_available() if is_torch_available()
else {} else {}
) )
......
...@@ -178,7 +178,7 @@ class Data2VecVisionModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Te ...@@ -178,7 +178,7 @@ class Data2VecVisionModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Te
) )
pipeline_model_mapping = ( pipeline_model_mapping = (
{ {
"feature-extraction": Data2VecVisionModel, "image-feature-extraction": Data2VecVisionModel,
"image-classification": Data2VecVisionForImageClassification, "image-classification": Data2VecVisionForImageClassification,
"image-segmentation": Data2VecVisionForSemanticSegmentation, "image-segmentation": Data2VecVisionForSemanticSegmentation,
} }
......
...@@ -191,7 +191,7 @@ class DeformableDetrModelTester: ...@@ -191,7 +191,7 @@ class DeformableDetrModelTester:
class DeformableDetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): class DeformableDetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (DeformableDetrModel, DeformableDetrForObjectDetection) if is_torch_available() else () all_model_classes = (DeformableDetrModel, DeformableDetrForObjectDetection) if is_torch_available() else ()
pipeline_model_mapping = ( pipeline_model_mapping = (
{"feature-extraction": DeformableDetrModel, "object-detection": DeformableDetrForObjectDetection} {"image-feature-extraction": DeformableDetrModel, "object-detection": DeformableDetrForObjectDetection}
if is_torch_available() if is_torch_available()
else {} else {}
) )
......
...@@ -206,7 +206,7 @@ class DeiTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): ...@@ -206,7 +206,7 @@ class DeiTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
) )
pipeline_model_mapping = ( pipeline_model_mapping = (
{ {
"feature-extraction": DeiTModel, "image-feature-extraction": DeiTModel,
"image-classification": (DeiTForImageClassification, DeiTForImageClassificationWithTeacher), "image-classification": (DeiTForImageClassification, DeiTForImageClassificationWithTeacher),
} }
if is_torch_available() if is_torch_available()
......
...@@ -217,7 +217,7 @@ class DetaModelTester: ...@@ -217,7 +217,7 @@ class DetaModelTester:
class DetaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): class DetaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (DetaModel, DetaForObjectDetection) if is_torchvision_available() else () all_model_classes = (DetaModel, DetaForObjectDetection) if is_torchvision_available() else ()
pipeline_model_mapping = ( pipeline_model_mapping = (
{"feature-extraction": DetaModel, "object-detection": DetaForObjectDetection} {"image-feature-extraction": DetaModel, "object-detection": DetaForObjectDetection}
if is_torchvision_available() if is_torchvision_available()
else {} else {}
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment