Unverified Commit b5c6fdec authored by Sylvain Gugger, committed by GitHub

PoC for a ProcessorMixin class (#15549)



* PoC for a ProcessorMixin class

* Documentation

* Apply suggestions from code review
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
Co-authored-by: Suraj Patil <surajp815@gmail.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Roll out to other processors

* Add base feature extractor class in init

* Use args and kwargs
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
Co-authored-by: Suraj Patil <surajp815@gmail.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
parent ba3f9a71
@@ -12,10 +12,22 @@ specific language governing permissions and limitations under the License.

# Processors

This library includes processors for several traditional tasks. These processors can be used to process a dataset into
examples that can be fed to a model.

Processors can mean two different things in the Transformers library:

- the objects that pre-process inputs for multi-modal models such as [Wav2Vec2](../model_doc/wav2vec2) (speech and text)
  or [CLIP](../model_doc/clip) (text and vision)
- deprecated objects that were used in older versions of the library to preprocess data for GLUE or SQUAD.

## Multi-modal processors

Any multi-modal model will require an object to encode or decode the data that groups several modalities (among text,
vision and audio). This is handled by objects called processors, which group tokenizers (for the text modality) and
feature extractors (for vision and audio).

Those processors inherit from the following base class that implements the saving and loading functionality:

[[autodoc]] ProcessorMixin

## Deprecated processors

All processors follow the same architecture which is that of the
[`~data.processors.utils.DataProcessor`]. The processor returns a list of
...
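A minimal usage sketch of what the new doc section describes, using the `openai/clip-vit-base-patch32` checkpoint named later in this diff:

```python
from transformers import CLIPProcessor

# The processor groups a tokenizer (text) and a feature extractor (vision) behind one object.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Saving writes the tokenizer files and preprocessor_config.json side by side,
# so the pair can be reloaded with a single from_pretrained call.
processor.save_pretrained("./my-clip-processor")
reloaded = CLIPProcessor.from_pretrained("./my-clip-processor")
```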
@@ -95,7 +95,7 @@ _import_structure = {
    "dependency_versions_table": [],
    "dynamic_module_utils": [],
    "feature_extraction_sequence_utils": ["SequenceFeatureExtractor"],
    "feature_extraction_utils": ["BatchFeature"],
    "feature_extraction_utils": ["BatchFeature", "FeatureExtractionMixin"],
    "file_utils": [
        "CONFIG_NAME",
        "MODEL_CARD_NAME",
@@ -365,6 +365,7 @@ _import_structure = {
        "ZeroShotClassificationPipeline",
        "pipeline",
    ],
    "processing_utils": ["ProcessorMixin"],
    "testing_utils": [],
    "tokenization_utils": ["PreTrainedTokenizer"],
    "tokenization_utils_base": [
@@ -2307,7 +2308,7 @@ if TYPE_CHECKING:
    from .feature_extraction_sequence_utils import SequenceFeatureExtractor

    # Feature Extractor
    from .feature_extraction_utils import BatchFeature
    from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin

    # Files and general utilities
    from .file_utils import (
@@ -2555,6 +2556,7 @@ if TYPE_CHECKING:
        ZeroShotClassificationPipeline,
        pipeline,
    )
    from .processing_utils import ProcessorMixin

    # Tokenization
    from .tokenization_utils import PreTrainedTokenizer
...
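With the two `_import_structure` entries and the matching `TYPE_CHECKING` imports above, both classes are re-exported from the package root. A quick sanity check, assuming a build of this branch is installed:

```python
# FeatureExtractionMixin and ProcessorMixin become public, top-level symbols.
from transformers import BatchFeature, FeatureExtractionMixin, ProcessorMixin
```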
@@ -15,12 +15,11 @@
"""
Image/Text processor class for CLIP
"""
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
from .feature_extraction_clip import CLIPFeatureExtractor
from .tokenization_clip import CLIPTokenizer


class CLIPProcessor:
class CLIPProcessor(ProcessorMixin):
    r"""
    Constructs a CLIP processor which wraps a CLIP feature extractor and a CLIP tokenizer into a single processor.
@@ -33,77 +32,13 @@ class CLIPProcessor:
        tokenizer ([`CLIPTokenizer`]):
            The tokenizer is a required input.
    """
    feature_extractor_class = "CLIPFeatureExtractor"
    tokenizer_class = "CLIPTokenizer"

    def __init__(self, feature_extractor, tokenizer):
        if not isinstance(feature_extractor, CLIPFeatureExtractor):
        super().__init__(feature_extractor, tokenizer)
raise ValueError(
f"`feature_extractor` has to be of type CLIPFeatureExtractor, but is {type(feature_extractor)}"
)
if not isinstance(tokenizer, CLIPTokenizer):
raise ValueError(f"`tokenizer` has to be of type CLIPTokenizer, but is {type(tokenizer)}")
self.feature_extractor = feature_extractor
self.tokenizer = tokenizer
        self.current_processor = self.feature_extractor
def save_pretrained(self, save_directory):
"""
Save a CLIP feature extractor object and CLIP tokenizer object to the directory `save_directory`, so that it
can be re-loaded using the [`~CLIPProcessor.from_pretrained`] class method.
<Tip>
This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and
[`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
above for more information.
</Tip>
Args:
save_directory (`str` or `os.PathLike`):
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
be created if it does not exist).
"""
self.feature_extractor._set_processor_class(self.__class__.__name__)
self.feature_extractor.save_pretrained(save_directory)
self.tokenizer._set_processor_class(self.__class__.__name__)
self.tokenizer.save_pretrained(save_directory)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r"""
Instantiate a [`CLIPProcessor`] from a pretrained CLIP processor.
<Tip>
This class method is simply calling CLIPFeatureExtractor's [`~PreTrainedFeatureExtractor.from_pretrained`] and
CLIPTokenizer's [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the
docstrings of the methods above for more information.
</Tip>
Args:
pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
huggingface.co. Valid model ids can be located at the root-level, like `clip-vit-base-patch32`, or
namespaced under a user or organization name, like `openai/clip-vit-base-patch32`.
- a path to a *directory* containing a feature extractor file saved using the
[`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
- a path or url to a saved feature extractor JSON *file*, e.g.,
`./my_model_directory/preprocessor_config.json`.
**kwargs
Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and
[`PreTrainedTokenizer`]
"""
feature_extractor = CLIPFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
    def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
...
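Taken together, the CLIP hunks above reduce the processor to a thin declaration plus a call to the mixin. A sketch of the resulting class shape, condensed from the diff rather than quoted from the file:

```python
from transformers.processing_utils import ProcessorMixin


class CLIPProcessor(ProcessorMixin):
    # The mixin resolves these names to classes on the top-level transformers module.
    feature_extractor_class = "CLIPFeatureExtractor"
    tokenizer_class = "CLIPTokenizer"

    def __init__(self, feature_extractor, tokenizer):
        # Type checks, attribute assignment, save_pretrained and from_pretrained
        # are all inherited from ProcessorMixin now.
        super().__init__(feature_extractor, tokenizer)
        self.current_processor = self.feature_extractor
```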
@@ -18,13 +18,11 @@ Processor class for LayoutLMv2.
from typing import List, Optional, Union

from ...file_utils import TensorType
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from .feature_extraction_layoutlmv2 import LayoutLMv2FeatureExtractor
from .tokenization_layoutlmv2 import LayoutLMv2Tokenizer
from .tokenization_layoutlmv2_fast import LayoutLMv2TokenizerFast


class LayoutLMv2Processor:
class LayoutLMv2Processor(ProcessorMixin):
    r"""
    Constructs a LayoutLMv2 processor which combines a LayoutLMv2 feature extractor and a LayoutLMv2 tokenizer into a
    single processor.
@@ -43,84 +41,8 @@ class LayoutLMv2Processor:
        tokenizer (`LayoutLMv2Tokenizer` or `LayoutLMv2TokenizerFast`):
            An instance of [`LayoutLMv2Tokenizer`] or [`LayoutLMv2TokenizerFast`]. The tokenizer is a required input.
    """
    feature_extractor_class = "LayoutLMv2FeatureExtractor"
    tokenizer_class = ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast")

    def __init__(self, feature_extractor, tokenizer):
if not isinstance(feature_extractor, LayoutLMv2FeatureExtractor):
raise ValueError(
f"`feature_extractor` has to be of type {LayoutLMv2FeatureExtractor.__class__}, but is {type(feature_extractor)}"
)
if not isinstance(tokenizer, (LayoutLMv2Tokenizer, LayoutLMv2TokenizerFast)):
raise ValueError(
f"`tokenizer` has to be of type {LayoutLMv2Tokenizer.__class__} or {LayoutLMv2TokenizerFast.__class__}, but is {type(tokenizer)}"
)
self.feature_extractor = feature_extractor
self.tokenizer = tokenizer
def save_pretrained(self, save_directory):
"""
Save a LayoutLMv2 feature_extractor object and LayoutLMv2 tokenizer object to the directory `save_directory`,
so that it can be re-loaded using the [`~LayoutLMv2Processor.from_pretrained`] class method.
<Tip>
This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
[`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
above for more information.
</Tip>
Args:
save_directory (`str` or `os.PathLike`):
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
be created if it does not exist).
"""
self.feature_extractor._set_processor_class(self.__class__.__name__)
self.feature_extractor.save_pretrained(save_directory)
self.tokenizer._set_processor_class(self.__class__.__name__)
self.tokenizer.save_pretrained(save_directory)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, use_fast=True, **kwargs):
r"""
Instantiate a [`LayoutLMv2Processor`] from a pretrained LayoutLMv2 processor.
<Tip>
This class method is simply calling LayoutLMv2FeatureExtractor's
[`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and LayoutLMv2TokenizerFast's
[`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the docstrings of the methods
above for more information.
</Tip>
Args:
pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
- a path to a *directory* containing a feature extractor file saved using the
[`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
- a path or url to a saved feature extractor JSON *file*, e.g.,
`./my_model_directory/preprocessor_config.json`.
use_fast (`bool`, *optional*, defaults to `True`):
Whether or not to instantiate a fast tokenizer.
**kwargs
Additional keyword arguments passed along to both [`SequenceFeatureExtractor`] and
[`PreTrainedTokenizer`]
"""
feature_extractor = LayoutLMv2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
if use_fast:
tokenizer = LayoutLMv2TokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
else:
tokenizer = LayoutLMv2Tokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
    def __call__(
        self,
...
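Because `tokenizer_class` is now a tuple of (slow, fast) names, the mixin's `from_pretrained` picks the fast tokenizer by default and falls back to the slow one when `use_fast=False` is passed. A hedged usage sketch; the checkpoint name is illustrative and not taken from the diff:

```python
from transformers import LayoutLMv2Processor

# Loads LayoutLMv2FeatureExtractor + LayoutLMv2TokenizerFast by default.
processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")

# use_fast=False selects the first entry of the tokenizer_class tuple instead.
slow_processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", use_fast=False)
```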
@@ -17,15 +17,12 @@ Processor class for LayoutXLM.
"""
from typing import List, Optional, Union

from transformers.models.layoutlmv2.feature_extraction_layoutlmv2 import LayoutLMv2FeatureExtractor

from ...file_utils import TensorType
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from .tokenization_layoutxlm import LayoutXLMTokenizer
from .tokenization_layoutxlm_fast import LayoutXLMTokenizerFast


class LayoutXLMProcessor:
class LayoutXLMProcessor(ProcessorMixin):
    r"""
    Constructs a LayoutXLM processor which combines a LayoutXLM feature extractor and a LayoutXLM tokenizer into a
    single processor.
@@ -44,84 +41,8 @@ class LayoutXLMProcessor:
        tokenizer (`LayoutXLMTokenizer` or `LayoutXLMTokenizerFast`):
            An instance of [`LayoutXLMTokenizer`] or [`LayoutXLMTokenizerFast`]. The tokenizer is a required input.
    """
    feature_extractor_class = "LayoutLMv2FeatureExtractor"
    tokenizer_class = ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast")

    def __init__(self, feature_extractor, tokenizer):
if not isinstance(feature_extractor, LayoutLMv2FeatureExtractor):
raise ValueError(
f"`feature_extractor` has to be of type {LayoutLMv2FeatureExtractor.__class__}, but is {type(feature_extractor)}"
)
if not isinstance(tokenizer, (LayoutXLMTokenizer, LayoutXLMTokenizerFast)):
raise ValueError(
f"`tokenizer` has to be of type {LayoutXLMTokenizer.__class__} or {LayoutXLMTokenizerFast.__class__}, but is {type(tokenizer)}"
)
self.feature_extractor = feature_extractor
self.tokenizer = tokenizer
def save_pretrained(self, save_directory):
"""
Save a LayoutXLM feature_extractor object and LayoutXLM tokenizer object to the directory `save_directory`, so
that it can be re-loaded using the [`~LayoutXLMProcessor.from_pretrained`] class method.
<Tip>
This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
[`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
above for more information.
</Tip>
Args:
save_directory (`str` or `os.PathLike`):
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
be created if it does not exist).
"""
self.feature_extractor._set_processor_class(self.__class__.__name__)
self.feature_extractor.save_pretrained(save_directory)
self.tokenizer._set_processor_class(self.__class__.__name__)
self.tokenizer.save_pretrained(save_directory)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, use_fast=True, **kwargs):
r"""
Instantiate a [`LayoutXLMProcessor`] from a pretrained LayoutXLM processor.
<Tip>
This class method is simply calling Layoutv2FeatureExtractor's
[`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and LayoutXLMTokenizerFast's
[`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the docstrings of the methods
above for more information.
</Tip>
Args:
pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
- a path to a *directory* containing a feature extractor file saved using the
[`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
- a path or url to a saved feature extractor JSON *file*, e.g.,
`./my_model_directory/preprocessor_config.json`.
use_fast (`bool`, *optional*, defaults to `True`):
Whether or not to instantiate a fast tokenizer.
**kwargs
Additional keyword arguments passed along to both [`SequenceFeatureExtractor`] and
[`PreTrainedTokenizer`]
"""
feature_extractor = LayoutLMv2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
if use_fast:
tokenizer = LayoutXLMTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
else:
tokenizer = LayoutXLMTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
    def __call__(
        self,
...
@@ -17,11 +17,10 @@ Speech processor class for Speech2Text
"""
from contextlib import contextmanager

from .feature_extraction_speech_to_text import Speech2TextFeatureExtractor
from ...processing_utils import ProcessorMixin
from .tokenization_speech_to_text import Speech2TextTokenizer


class Speech2TextProcessor:
class Speech2TextProcessor(ProcessorMixin):
    r"""
    Constructs a Speech2Text processor which wraps a Speech2Text feature extractor and a Speech2Text tokenizer into a
    single processor.
@@ -36,79 +35,13 @@ class Speech2TextProcessor:
        tokenizer (`Speech2TextTokenizer`):
            An instance of [`Speech2TextTokenizer`]. The tokenizer is a required input.
    """
    feature_extractor_class = "Speech2TextFeatureExtractor"
    tokenizer_class = "Speech2TextTokenizer"

    def __init__(self, feature_extractor, tokenizer):
        if not isinstance(feature_extractor, Speech2TextFeatureExtractor):
        super().__init__(feature_extractor, tokenizer)
raise ValueError(
f"`feature_extractor` has to be of type {Speech2TextFeatureExtractor.__class__}, but is {type(feature_extractor)}"
)
if not isinstance(tokenizer, Speech2TextTokenizer):
raise ValueError(
f"`tokenizer` has to be of type {Speech2TextTokenizer.__class__}, but is {type(tokenizer)}"
)
self.feature_extractor = feature_extractor
self.tokenizer = tokenizer
        self.current_processor = self.feature_extractor
def save_pretrained(self, save_directory):
"""
Save a Speech2Text feature extractor object and Speech2Text tokenizer object to the directory `save_directory`,
so that it can be re-loaded using the [`~Speech2TextProcessor.from_pretrained`] class method.
<Tip>
This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and
[`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
above for more information.
</Tip>
Args:
save_directory (`str` or `os.PathLike`):
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
be created if it does not exist).
"""
self.feature_extractor._set_processor_class(self.__class__.__name__)
self.feature_extractor.save_pretrained(save_directory)
self.tokenizer._set_processor_class(self.__class__.__name__)
self.tokenizer.save_pretrained(save_directory)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r"""
Instantiate a [`Speech2TextProcessor`] from a pretrained Speech2Text processor.
<Tip>
This class method is simply calling Speech2TextFeatureExtractor's
[`~PreTrainedFeatureExtractor.from_pretrained`] and Speech2TextTokenizer's
[`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the docstrings of the methods
above for more information.
</Tip>
Args:
pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
- a path to a *directory* containing a feature extractor file saved using the
[`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
- a path or url to a saved feature extractor JSON *file*, e.g.,
`./my_model_directory/preprocessor_config.json`.
**kwargs
Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and
[`PreTrainedTokenizer`]
"""
feature_extractor = Speech2TextFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
tokenizer = Speech2TextTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
    def __call__(self, *args, **kwargs):
        """
        When used in normal mode, this method forwards all its arguments to Speech2TextFeatureExtractor's
...
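The subclasses that keep `self.current_processor` do so for the target-text context manager, which is unchanged by this PR and therefore not visible in the hunk. Roughly, it swaps the object that `__call__` forwards to; the checkpoint and dummy audio below are illustrative, not taken from the diff:

```python
import numpy as np
from transformers import Speech2TextProcessor

processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
speech = np.zeros(16_000, dtype=np.float32)  # one second of silence

inputs = processor(speech, sampling_rate=16_000, return_tensors="pt")            # routed to the feature extractor
with processor.as_target_processor():
    labels = processor("a transcription", return_tensors="pt").input_ids         # routed to the tokenizer
```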
@@ -17,12 +17,10 @@ Speech processor class for Speech2Text2
"""
from contextlib import contextmanager

from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
from ...processing_utils import ProcessorMixin
from ..auto.feature_extraction_auto import AutoFeatureExtractor
from .tokenization_speech_to_text_2 import Speech2Text2Tokenizer


class Speech2Text2Processor:
class Speech2Text2Processor(ProcessorMixin):
    r"""
    Constructs a Speech2Text2 processor which wraps a Speech2Text2 feature extractor and a Speech2Text2 tokenizer into
    a single processor.
@@ -36,77 +34,13 @@ class Speech2Text2Processor:
        tokenizer (`Speech2Text2Tokenizer`):
            An instance of [`Speech2Text2Tokenizer`]. The tokenizer is a required input.
    """
    feature_extractor_class = "AutoFeatureExtractor"
    tokenizer_class = "Speech2Text2Tokenizer"

    def __init__(self, feature_extractor, tokenizer):
        if not isinstance(feature_extractor, SequenceFeatureExtractor):
        super().__init__(feature_extractor, tokenizer)
raise ValueError(
f"`feature_extractor` has to be of type {SequenceFeatureExtractor.__class__}, but is {type(feature_extractor)}"
)
if not isinstance(tokenizer, Speech2Text2Tokenizer):
raise ValueError(
f"`tokenizer` has to be of type {Speech2Text2Tokenizer.__class__}, but is {type(tokenizer)}"
)
self.feature_extractor = feature_extractor
self.tokenizer = tokenizer
        self.current_processor = self.feature_extractor
def save_pretrained(self, save_directory):
"""
Save a Speech2Text2 feature extractor object and Speech2Text2 tokenizer object to the directory
`save_directory`, so that it can be re-loaded using the [`~Speech2Text2Processor.from_pretrained`] class
method.
<Tip>
This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and
[`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
above for more information.
</Tip>
Args:
save_directory (`str` or `os.PathLike`):
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
be created if it does not exist).
"""
self.feature_extractor.save_pretrained(save_directory)
self.tokenizer.save_pretrained(save_directory)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r"""
Instantiate a [`Speech2Text2Processor`] from a pretrained Speech2Text2 processor.
<Tip>
This class method is simply calling AutoFeatureExtractor's [`~PreTrainedFeatureExtractor.from_pretrained`] and
Speech2Text2Tokenizer's [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the
docstrings of the methods above for more information.
</Tip>
Args:
pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
- a path to a *directory* containing a feature extractor file saved using the
[`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
- a path or url to a saved feature extractor JSON *file*, e.g.,
`./my_model_directory/preprocessor_config.json`.
**kwargs
Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and
[`PreTrainedTokenizer`]
"""
feature_extractor = AutoFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
tokenizer = Speech2Text2Tokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
    def __call__(self, *args, **kwargs):
        """
        When used in normal mode, this method forwards all its arguments to AutoFeatureExtractor's
...
@@ -17,15 +17,10 @@ Processor class for TrOCR.
"""
from contextlib import contextmanager

from transformers import AutoFeatureExtractor, AutoTokenizer
from ...processing_utils import ProcessorMixin
from transformers.feature_extraction_utils import FeatureExtractionMixin
from transformers.models.roberta.tokenization_roberta import RobertaTokenizer
from transformers.models.roberta.tokenization_roberta_fast import RobertaTokenizerFast
from transformers.models.xlm_roberta.tokenization_xlm_roberta import XLMRobertaTokenizer
from transformers.models.xlm_roberta.tokenization_xlm_roberta_fast import XLMRobertaTokenizerFast


class TrOCRProcessor:
class TrOCRProcessor(ProcessorMixin):
    r"""
    Constructs a TrOCR processor which wraps a vision feature extractor and a TrOCR tokenizer into a single processor.
@@ -39,78 +34,13 @@ class TrOCRProcessor:
        tokenizer ([`RobertaTokenizer`/`XLMRobertaTokenizer`]):
            An instance of [`RobertaTokenizer`/`XLMRobertaTokenizer`]. The tokenizer is a required input.
    """
    feature_extractor_class = "AutoFeatureExtractor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, feature_extractor, tokenizer):
        if not isinstance(feature_extractor, FeatureExtractionMixin):
        super().__init__(feature_extractor, tokenizer)
raise ValueError(
f"`feature_extractor` has to be of type {FeatureExtractionMixin.__class__}, but is {type(feature_extractor)}"
)
if not isinstance(
tokenizer, (RobertaTokenizer, RobertaTokenizerFast, XLMRobertaTokenizer, XLMRobertaTokenizerFast)
):
raise ValueError(
f"`tokenizer` has to be of type {RobertaTokenizer.__class__} or {RobertaTokenizerFast.__class__} or {XLMRobertaTokenizer.__class__} or {XLMRobertaTokenizerFast.__class__}, but is {type(tokenizer)}"
)
self.feature_extractor = feature_extractor
self.tokenizer = tokenizer
        self.current_processor = self.feature_extractor
def save_pretrained(self, save_directory):
"""
Save a TrOCR feature extractor object and TrOCR tokenizer object to the directory `save_directory`, so that it
can be re-loaded using the [`~TrOCRProcessor.from_pretrained`] class method.
<Tip>
This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and
[`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
above for more information.
</Tip>
Args:
save_directory (`str` or `os.PathLike`):
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
be created if it does not exist).
"""
self.feature_extractor.save_pretrained(save_directory)
self.tokenizer.save_pretrained(save_directory)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r"""
Instantiate a [`TrOCRProcessor`] from a pretrained TrOCR processor.
<Tip>
This class method is simply calling AutoFeatureExtractor's [`~PreTrainedFeatureExtractor.from_pretrained`] and
TrOCRTokenizer's [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the
docstrings of the methods above for more information.
</Tip>
Args:
pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
- a path to a *directory* containing a feature extractor file saved using the
[`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
- a path or url to a saved feature extractor JSON *file*, e.g.,
`./my_model_directory/preprocessor_config.json`.
**kwargs
Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and
[`PreTrainedTokenizer`]
"""
feature_extractor = AutoFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
    def __call__(self, *args, **kwargs):
        """
        When used in normal mode, this method forwards all its arguments to AutoFeatureExtractor's
...
@@ -18,14 +18,12 @@ Processor class for ViLT.
from typing import List, Optional, Union

from transformers import BertTokenizerFast

from ...file_utils import TensorType
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from .feature_extraction_vilt import ViltFeatureExtractor


class ViltProcessor:
class ViltProcessor(ProcessorMixin):
    r"""
    Constructs a ViLT processor which wraps a BERT tokenizer and ViLT feature extractor into a single processor.
@@ -38,75 +36,13 @@ class ViltProcessor:
        tokenizer (`BertTokenizerFast`):
            An instance of [`BertTokenizerFast`]. The tokenizer is a required input.
    """
    feature_extractor_class = "ViltFeatureExtractor"
    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")

    def __init__(self, feature_extractor, tokenizer):
        if not isinstance(feature_extractor, ViltFeatureExtractor):
        super().__init__(feature_extractor, tokenizer)
raise ValueError(
f"`feature_extractor` has to be of type {ViltFeatureExtractor.__class__}, but is {type(feature_extractor)}"
)
if not isinstance(tokenizer, BertTokenizerFast):
raise ValueError(f"`tokenizer` has to be of type {BertTokenizerFast.__class__}, but is {type(tokenizer)}")
self.feature_extractor = feature_extractor
self.tokenizer = tokenizer
        self.current_processor = self.feature_extractor
def save_pretrained(self, save_directory):
"""
Save a ViLT feature_extractor object and BERT tokenizer object to the directory `save_directory`, so that it
can be re-loaded using the [`~ViltProcessor.from_pretrained`] class method.
<Tip>
This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
[`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
above for more information.
</Tip>
Args:
save_directory (`str` or `os.PathLike`):
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
be created if it does not exist).
"""
self.feature_extractor.save_pretrained(save_directory)
self.tokenizer.save_pretrained(save_directory)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r"""
Instantiate a [`ViltProcessor`] from a pretrained ViLT processor.
<Tip>
This class method is simply calling ViltFeatureExtractor's
[`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and BertTokenizerFast's
[`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the docstrings of the methods
above for more information.
</Tip>
Args:
pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
- a path to a *directory* containing a feature extractor file saved using the
[`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
- a path or url to a saved feature extractor JSON *file*, e.g.,
`./my_model_directory/preprocessor_config.json`.
**kwargs
Additional keyword arguments passed along to both [`SequenceFeatureExtractor`] and
[`PreTrainedTokenizer`]
"""
feature_extractor = ViltFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
    def __call__(
        self,
        images,
...
@@ -15,17 +15,12 @@
"""
Processor class for VisionTextDualEncoder
"""
from typing import Union

from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
from transformers.feature_extraction_utils import FeatureExtractionMixin

from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
from ..auto.feature_extraction_auto import AutoFeatureExtractor
from ..auto.tokenization_auto import AutoTokenizer


class VisionTextDualEncoderProcessor:
class VisionTextDualEncoderProcessor(ProcessorMixin):
    r"""
    Constructs a VisionTextDualEncoder processor which wraps a vision feature extractor and a tokenizer into a single
    processor.
@@ -40,82 +35,13 @@ class VisionTextDualEncoderProcessor:
        tokenizer ([`PreTrainedTokenizer`]):
            The tokenizer is a required input.
    """
    feature_extractor_class = "AutoFeatureExtractor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self, feature_extractor: FeatureExtractionMixin, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
    def __init__(self, feature_extractor, tokenizer):
        super().__init__(feature_extractor, tokenizer)
):
if not isinstance(feature_extractor, FeatureExtractionMixin):
raise ValueError(
f"`feature_extractor` has to be of type {FeatureExtractionMixin.__class__}, but is {type(feature_extractor)}"
)
if not isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)):
raise ValueError(
f"`tokenizer` has to be of type `PreTrainedTokenizer` or `PreTrainedTokenizerFast`, but is {type(tokenizer)}"
)
self.feature_extractor = feature_extractor
self.tokenizer = tokenizer
        self.current_processor = self.feature_extractor
def save_pretrained(self, save_directory):
"""
Save a VisionTextDualEncoder feature extractor object and VisionTextDualEncoder tokenizer object to the
directory `save_directory`, so that it can be re-loaded using the
[`~VisionTextDualEncoderProcessor.from_pretrained`] class method.
<Tip>
This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and
[`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
above for more information.
</Tip>
Args:
save_directory (`str` or `os.PathLike`):
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
be created if it does not exist).
"""
self.feature_extractor._set_processor_class(self.__class__.__name__)
self.feature_extractor.save_pretrained(save_directory)
self.tokenizer._set_processor_class(self.__class__.__name__)
self.tokenizer.save_pretrained(save_directory)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r"""
Instantiate a [`VisionTextDualEncoderProcessor`] from a pretrained VisionTextDualEncoder processor.
<Tip>
This class method is simply calling AutoFeatureExtractor's [`~PreTrainedFeatureExtractor.from_pretrained`] and
AutoTokenizer's [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the
docstrings of the methods above for more information.
</Tip>
Args:
pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
- a path to a *directory* containing a feature extractor file saved using the
[`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
- a path or url to a saved feature extractor JSON *file*, e.g.,
`./my_model_directory/preprocessor_config.json`.
**kwargs
Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and
[`PreTrainedTokenizer`]
"""
feature_extractor = AutoFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
    def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
...
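Since the attribute classes here are the Auto* names, this processor can pair any vision feature extractor with any tokenizer. A hedged sketch; both checkpoint names are illustrative and not taken from the diff:

```python
from transformers import AutoFeatureExtractor, AutoTokenizer, VisionTextDualEncoderProcessor

# Any vision feature extractor plus any tokenizer can back this processor.
feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
processor = VisionTextDualEncoderProcessor(feature_extractor, tokenizer)
processor.save_pretrained("./vision-text-processor")
```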
@@ -18,14 +18,12 @@ Speech processor class for Wav2Vec2
import warnings
from contextlib import contextmanager

from ...tokenization_utils import PreTrainedTokenizer
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ..auto.tokenization_auto import AutoTokenizer
from .feature_extraction_wav2vec2 import Wav2Vec2FeatureExtractor
from .tokenization_wav2vec2 import Wav2Vec2CTCTokenizer


class Wav2Vec2Processor:
class Wav2Vec2Processor(ProcessorMixin):
    r"""
    Constructs a Wav2Vec2 processor which wraps a Wav2Vec2 feature extractor and a Wav2Vec2 CTC tokenizer into a single
    processor.
@@ -39,82 +37,17 @@ class Wav2Vec2Processor:
        tokenizer ([`PreTrainedTokenizer`]):
            An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
    """
    feature_extractor_class = "Wav2Vec2FeatureExtractor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, feature_extractor, tokenizer):
        if not isinstance(feature_extractor, Wav2Vec2FeatureExtractor):
        super().__init__(feature_extractor, tokenizer)
raise ValueError(
f"`feature_extractor` has to be of type {Wav2Vec2FeatureExtractor.__class__}, but is {type(feature_extractor)}"
)
if not isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)):
raise ValueError(
f"`tokenizer` has to be of type {PreTrainedTokenizer.__class__}, but is {type(tokenizer)}"
)
self.feature_extractor = feature_extractor
self.tokenizer = tokenizer
        self.current_processor = self.feature_extractor
def save_pretrained(self, save_directory):
"""
Save a Wav2Vec2 feature_extractor object and Wav2Vec2 tokenizer object to the directory `save_directory`, so
that it can be re-loaded using the [`~Wav2Vec2Processor.from_pretrained`] class method.
<Tip>
This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
[`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
above for more information.
</Tip>
Args:
save_directory (`str` or `os.PathLike`):
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
be created if it does not exist).
"""
self.feature_extractor._set_processor_class(self.__class__.__name__)
self.feature_extractor.save_pretrained(save_directory)
self.tokenizer._set_processor_class(self.__class__.__name__)
self.tokenizer.save_pretrained(save_directory)
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r"""
Instantiate a [`Wav2Vec2Processor`] from a pretrained Wav2Vec2 processor.
<Tip>
This class method is simply calling Wav2Vec2FeatureExtractor's
[`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and PreTrainedTokenizer's
[`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the docstrings of the methods
above for more information.
</Tip>
Args:
pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
- a path to a *directory* containing a feature extractor file saved using the
[`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
- a path or url to a saved feature extractor JSON *file*, e.g.,
`./my_model_directory/preprocessor_config.json`.
**kwargs
Additional keyword arguments passed along to both [`SequenceFeatureExtractor`] and
[`PreTrainedTokenizer`]
"""
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
# load generic `AutoTokenizer`
# need fallback here for backward compatibility in case processor is
# loaded from just a tokenizer file that does not have a `tokenizer_class` attribute
# behavior should be deprecated in major future release
        try:
            tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
            return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
        except OSError:
            warnings.warn(
                f"Loading a tokenizer inside {cls.__name__} from a config that does not"
@@ -124,9 +57,11 @@ class Wav2Vec2Processor:
                "file to suppress this warning: ",
                FutureWarning,
            )
            feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
            tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
            return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)

    def __call__(self, *args, **kwargs):
        """
...
@@ -23,16 +23,16 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
import numpy as np

from ...feature_extraction_utils import FeatureExtractionMixin
from ...file_utils import ModelOutput, requires_backends
from ...tokenization_utils import PreTrainedTokenizer
from ...processing_utils import ProcessorMixin
from ..wav2vec2.feature_extraction_wav2vec2 import Wav2Vec2FeatureExtractor
from ..wav2vec2.tokenization_wav2vec2 import Wav2Vec2CTCTokenizer


if TYPE_CHECKING:
    from pyctcdecode import BeamSearchDecoderCTC

    from ...feature_extraction_utils import FeatureExtractionMixin
    from ...tokenization_utils import PreTrainedTokenizerBase


@dataclass
class Wav2Vec2DecoderWithLMOutput(ModelOutput):
@@ -47,7 +47,7 @@ class Wav2Vec2DecoderWithLMOutput(ModelOutput):
    text: Union[List[str], str]


class Wav2Vec2ProcessorWithLM:
class Wav2Vec2ProcessorWithLM(ProcessorMixin):
    r"""
    Constructs a Wav2Vec2 processor which wraps a Wav2Vec2 feature extractor, a Wav2Vec2 CTC tokenizer and a decoder
    with language model support into a single processor for language model boosted speech recognition decoding.
@@ -60,24 +60,18 @@ class Wav2Vec2ProcessorWithLM:
        decoder (`pyctcdecode.BeamSearchDecoderCTC`):
            An instance of [`pyctcdecode.BeamSearchDecoderCTC`]. The decoder is a required input.
    """
    feature_extractor_class = "Wav2Vec2FeatureExtractor"
    tokenizer_class = "Wav2Vec2CTCTokenizer"

    def __init__(
        self,
        feature_extractor: FeatureExtractionMixin,
        feature_extractor: "FeatureExtractionMixin",
        tokenizer: PreTrainedTokenizer,
        tokenizer: "PreTrainedTokenizerBase",
        decoder: "BeamSearchDecoderCTC",
    ):
        from pyctcdecode import BeamSearchDecoderCTC

        if not isinstance(feature_extractor, Wav2Vec2FeatureExtractor):
        super().__init__(feature_extractor, tokenizer)
raise ValueError(
f"`feature_extractor` has to be of type {Wav2Vec2FeatureExtractor.__class__}, but is {type(feature_extractor)}"
)
if not isinstance(tokenizer, Wav2Vec2CTCTokenizer):
# TODO(PVP) - this can be relaxed in the future to allow other kinds of tokenizers
raise ValueError(
f"`tokenizer` has to be of type {Wav2Vec2CTCTokenizer.__class__}, but is {type(tokenizer)}"
)
        if not isinstance(decoder, BeamSearchDecoderCTC):
            raise ValueError(f"`decoder` has to be of type {BeamSearchDecoderCTC.__class__}, but is {type(decoder)}")
@@ -90,37 +84,11 @@ class Wav2Vec2ProcessorWithLM:
                f"Make sure to include {missing_decoder_tokens} in the decoder's alphabet."
            )

        self.feature_extractor = feature_extractor
        self.tokenizer = tokenizer
        self.decoder = decoder
        self.current_processor = self.feature_extractor

    def save_pretrained(self, save_directory):
        """
        super().save_pretrained(save_directory)
Save the Wav2Vec2 feature_extractor, a tokenizer object and a pyctcdecode decoder to the directory
`save_directory`, so that they can be re-loaded using the [`~Wav2Vec2ProcessorWithLM.from_pretrained`] class
method.
<Tip>
This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained,`]
[`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`] and pyctcdecode's
[`pyctcdecode.BeamSearchDecoderCTC.save_to_dir`].
Please refer to the docstrings of the methods above for more information.
</Tip>
Args:
save_directory (`str` or `os.PathLike`):
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
be created if it does not exist).
"""
self.feature_extractor._set_processor_class(self.__class__.__name__)
self.feature_extractor.save_pretrained(save_directory)
self.tokenizer._set_processor_class(self.__class__.__name__)
self.tokenizer.save_pretrained(save_directory)
        self.decoder.save_to_dir(save_directory)

    @classmethod
@@ -157,8 +125,7 @@ class Wav2Vec2ProcessorWithLM:
        requires_backends(cls, "pyctcdecode")
        from pyctcdecode import BeamSearchDecoderCTC

        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
        feature_extractor, tokenizer = super()._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
        tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)

        if os.path.isdir(pretrained_model_name_or_path):
            decoder = BeamSearchDecoderCTC.load_from_dir(pretrained_model_name_or_path)
...
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processing saving/loading class for common processors.
"""
import importlib.util
from pathlib import Path
# Dynamically import the Transformers module to grab the attribute classes of the processor from their names.
spec = importlib.util.spec_from_file_location(
"transformers", Path(__file__).parent / "__init__.py", submodule_search_locations=[Path(__file__).parent]
)
transformers_module = spec.loader.load_module()
AUTO_TO_BASE_CLASS_MAPPING = {
"AutoTokenizer": "PreTrainedTokenizerBase",
"AutoFeatureExtractor": "FeatureExtractionMixin",
}
class ProcessorMixin:
"""
This is a mixin used to provide saving/loading functionality for all processor classes.
"""
attributes = ["feature_extractor", "tokenizer"]
# Names need to be attr_class for attr in attributes
feature_extractor_class = None
tokenizer_class = None
# args have to match the attributes class attribute
def __init__(self, *args, **kwargs):
# Sanitize args and kwargs
for key in kwargs:
if key not in self.attributes:
raise TypeError(f"Unexepcted keyword argument {key}.")
for arg, attribute_name in zip(args, self.attributes):
if attribute_name in kwargs:
raise TypeError(f"Got multiple values for argument {attribute_name}.")
else:
kwargs[attribute_name] = arg
if len(kwargs) != len(self.attributes):
raise ValueError(
f"This processor requires {len(self.attributes)} arguments: {', '.join(self.attributes)}. Got "
f"{len(args)} arguments instead."
)
# Check each arg is of the proper class (this will also catch a user initializing in the wrong order)
for attribute_name, arg in kwargs.items():
class_name = getattr(self, f"{attribute_name}_class")
# Nothing is ever going to be an instance of "AutoXxx", in that case we check the base class.
class_name = AUTO_TO_BASE_CLASS_MAPPING.get(class_name, class_name)
if isinstance(class_name, tuple):
proper_class = tuple(getattr(transformers_module, n) for n in class_name if n is not None)
else:
proper_class = getattr(transformers_module, class_name)
if not isinstance(arg, proper_class):
raise ValueError(
f"Received a {type(arg).__name__} for argument {attribute_name}, but a {class_name} was expected."
)
setattr(self, attribute_name, arg)
def __repr__(self):
attributes_repr = [f"- {name}: {repr(getattr(self, name))}" for name in self.attributes]
attributes_repr = "\n".join(attributes_repr)
return f"{self.__class__.__name__}:\n{attributes_repr}"
def save_pretrained(self, save_directory):
"""
Saves the attributes of this processor (feature extractor, tokenizer...) in the specified directory so that it
can be reloaded using the [`~ProcessorMixin.from_pretrained`] method.
<Tip>
This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
[`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods
above for more information.
</Tip>
Args:
save_directory (`str` or `os.PathLike`):
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
be created if it does not exist).
"""
for attribute_name in self.attributes:
attribute = getattr(self, attribute_name)
# Include the processor class in the attribute config so this processor can then be reloaded with the
# `AutoProcessor` API.
if hasattr(attribute, "_set_processor_class"):
attribute._set_processor_class(self.__class__.__name__)
attribute.save_pretrained(save_directory)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r"""
Instantiate a processor associated with a pretrained model.
<Tip>
This class method is simply calling the feature extractor
[`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and the tokenizer
[`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] methods. Please refer to the docstrings of the
methods above for more information.
</Tip>
Args:
pretrained_model_name_or_path (`str` or `os.PathLike`):
This can be either:
- a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
- a path to a *directory* containing a feature extractor file saved using the
[`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
- a path or url to a saved feature extractor JSON *file*, e.g.,
`./my_model_directory/preprocessor_config.json`.
**kwargs
Additional keyword arguments passed along to both
[`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and
[`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`].
"""
args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
return cls(*args)
@classmethod
def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
args = []
for attribute_name in cls.attributes:
class_name = getattr(cls, f"{attribute_name}_class")
if isinstance(class_name, tuple):
classes = tuple(getattr(transformers_module, n) if n is not None else None for n in class_name)
use_fast = kwargs.get("use_fast", True)
if use_fast and classes[1] is not None:
attribute_class = classes[1]
else:
attribute_class = classes[0]
else:
attribute_class = getattr(transformers_module, class_name)
args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))
return args
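To close, a hypothetical subclass (not part of this diff) showing how the pieces above fit together; the class name and commented-out checkpoint id are illustrative only:

```python
from transformers.processing_utils import ProcessorMixin


class MyProcessor(ProcessorMixin):
    # One entry per attribute; the mixin expects a matching "<attr>_class" name for each.
    attributes = ["feature_extractor", "tokenizer"]
    # "Auto" names are mapped to their base classes (FeatureExtractionMixin,
    # PreTrainedTokenizerBase) for the isinstance check in __init__.
    feature_extractor_class = "AutoFeatureExtractor"
    tokenizer_class = "AutoTokenizer"


# from_pretrained instantiates each attribute class and then calls cls(*args);
# save_pretrained loops over the attributes and saves each one.
# processor = MyProcessor.from_pretrained("some/checkpoint")  # illustrative id
# processor.save_pretrained("./my-processor")
```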