"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "2c0da7803a75f0fc6e6d484e23ca283faa32d785"
Unverified Commit cb38ffcc authored by Patrick von Platen's avatar Patrick von Platen Committed by GitHub
Browse files

[PretrainedFeatureExtractor] + Wav2Vec2FeatureExtractor, Wav2Vec2Processor,...

[PretrainedFeatureExtractor] + Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2Tokenizer (#10324)

* push to show

* small improvement

* small improvement

* Update src/transformers/feature_extraction_utils.py

* Update src/transformers/feature_extraction_utils.py

* implement base

* add common tests

* make all tests pass for wav2vec2

* make padding work & add more tests

* finalize feature extractor utils

* add call method to feature extraction

* finalize feature processor

* finish tokenizer

* finish general processor design

* finish tests

* typo

* remove bogus file

* finish docstring

* add docs

* finish docs

* small fix

* correct docs

* save intermediate

* load changes

* apply changes

* apply changes to doc

* change tests

* apply surajs recommend

* final changes

* Apply suggestions from code review

* fix typo

* fix import

* correct docstring
parent 9dc78257
...@@ -375,6 +375,7 @@ TensorFlow and/or Flax. ...@@ -375,6 +375,7 @@ TensorFlow and/or Flax.
main_classes/processors main_classes/processors
main_classes/tokenizer main_classes/tokenizer
main_classes/trainer main_classes/trainer
main_classes/feature_extractor
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
...@@ -441,3 +442,4 @@ TensorFlow and/or Flax. ...@@ -441,3 +442,4 @@ TensorFlow and/or Flax.
internal/tokenization_utils internal/tokenization_utils
internal/trainer_utils internal/trainer_utils
internal/generation_utils internal/generation_utils
internal/file_utils
..
Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
General Utilities
-----------------------------------------------------------------------------------------------------------------------
This page lists all of Transformers general utility functions that are found in the file ``file_utils.py``.
Most of those are only useful if you are studying the general code in the library.
Enums and namedtuples
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.file_utils.ExplicitEnum
.. autoclass:: transformers.file_utils.PaddingStrategy
.. autoclass:: transformers.file_utils.TensorType
Special Decorators
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: transformers.file_utils.add_start_docstrings
.. autofunction:: transformers.file_utils.add_start_docstrings_to_model_forward
.. autofunction:: transformers.file_utils.add_end_docstrings
.. autofunction:: transformers.file_utils.add_code_sample_docstrings
.. autofunction:: transformers.file_utils.replace_return_docstrings
Special Properties
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.file_utils.cached_property
Other Utilities
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.file_utils._BaseLazyModule
...@@ -38,12 +38,6 @@ SpecialTokensMixin ...@@ -38,12 +38,6 @@ SpecialTokensMixin
Enums and namedtuples Enums and namedtuples
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.tokenization_utils_base.ExplicitEnum
.. autoclass:: transformers.tokenization_utils_base.PaddingStrategy
.. autoclass:: transformers.tokenization_utils_base.TensorType
.. autoclass:: transformers.tokenization_utils_base.TruncationStrategy .. autoclass:: transformers.tokenization_utils_base.TruncationStrategy
.. autoclass:: transformers.tokenization_utils_base.CharSpan .. autoclass:: transformers.tokenization_utils_base.CharSpan
......
..
Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Feature Extractor
-----------------------------------------------------------------------------------------------------------------------
A feature extractor is in charge of preparing read-in audio files for a speech model. This includes feature extraction,
such as processing audio files to, *e.g.*, Log-Mel Spectrogram features, but also padding, normalization, and
conversion to Numpy, PyTorch, and TensorFlow tensors.
PreTrainedFeatureExtractor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.PreTrainedFeatureExtractor
:members: from_pretrained, save_pretrained, pad
BatchFeature
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BatchFeature
:members:
...@@ -34,7 +34,7 @@ Tips: ...@@ -34,7 +34,7 @@ Tips:
- Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. - Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal.
- Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be decoded - Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be decoded
using :class:`~transformers.Wav2Vec2Tokenizer`. using :class:`~transformers.Wav2Vec2CTCTokenizer`.
Wav2Vec2Config Wav2Vec2Config
...@@ -44,13 +44,27 @@ Wav2Vec2Config ...@@ -44,13 +44,27 @@ Wav2Vec2Config
:members: :members:
Wav2Vec2Tokenizer Wav2Vec2CTCTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.Wav2Vec2Tokenizer .. autoclass:: transformers.Wav2Vec2CTCTokenizer
:members: __call__, save_vocabulary :members: __call__, save_vocabulary
Wav2Vec2FeatureExtractor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.Wav2Vec2FeatureExtractor
:members: __call__
Wav2Vec2Processor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.Wav2Vec2Processor
:members: __call__, from_pretrained, save_pretrained, batch_decode, decode, as_target_processor
Wav2Vec2Model Wav2Vec2Model
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
......
...@@ -39,7 +39,8 @@ from transformers import ( ...@@ -39,7 +39,8 @@ from transformers import (
default_data_collator, default_data_collator,
set_seed, set_seed,
) )
from transformers.tokenization_utils_base import PaddingStrategy, PreTrainedTokenizerBase from transformers.file_utils import PaddingStrategy
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from transformers.trainer_utils import get_last_checkpoint, is_main_process from transformers.trainer_utils import get_last_checkpoint, is_main_process
...@@ -133,7 +134,7 @@ class DataCollatorForMultipleChoice: ...@@ -133,7 +134,7 @@ class DataCollatorForMultipleChoice:
Args: Args:
tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`): tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
The tokenizer used for encoding the data. The tokenizer used for encoding the data.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`): padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding index) Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
among: among:
......
...@@ -88,6 +88,7 @@ _import_structure = { ...@@ -88,6 +88,7 @@ _import_structure = {
"TF_WEIGHTS_NAME", "TF_WEIGHTS_NAME",
"TRANSFORMERS_CACHE", "TRANSFORMERS_CACHE",
"WEIGHTS_NAME", "WEIGHTS_NAME",
"TensorType",
"add_end_docstrings", "add_end_docstrings",
"add_start_docstrings", "add_start_docstrings",
"cached_path", "cached_path",
...@@ -125,7 +126,14 @@ _import_structure = { ...@@ -125,7 +126,14 @@ _import_structure = {
], ],
"models": [], "models": [],
# Models # Models
"models.wav2vec2": ["WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Wav2Vec2Config", "Wav2Vec2Tokenizer"], "models.wav2vec2": [
"WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP",
"Wav2Vec2Config",
"Wav2Vec2CTCTokenizer",
"Wav2Vec2Tokenizer",
"Wav2Vec2FeatureExtractor",
"Wav2Vec2Processor",
],
"models.convbert": ["CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvBertConfig", "ConvBertTokenizer"], "models.convbert": ["CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvBertConfig", "ConvBertTokenizer"],
"models.albert": ["ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "AlbertConfig"], "models.albert": ["ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "AlbertConfig"],
"models.auto": [ "models.auto": [
...@@ -234,9 +242,9 @@ _import_structure = { ...@@ -234,9 +242,9 @@ _import_structure = {
"CharSpan", "CharSpan",
"PreTrainedTokenizerBase", "PreTrainedTokenizerBase",
"SpecialTokensMixin", "SpecialTokensMixin",
"TensorType",
"TokenSpan", "TokenSpan",
], ],
"feature_extraction_utils": ["PreTrainedFeatureExtractor", "BatchFeature"],
"trainer_callback": [ "trainer_callback": [
"DefaultFlowCallback", "DefaultFlowCallback",
"EarlyStoppingCallback", "EarlyStoppingCallback",
...@@ -1217,6 +1225,9 @@ if TYPE_CHECKING: ...@@ -1217,6 +1225,9 @@ if TYPE_CHECKING:
xnli_tasks_num_labels, xnli_tasks_num_labels,
) )
# Feature Extractor
from .feature_extraction_utils import BatchFeature, PreTrainedFeatureExtractor
# Files and general utilities # Files and general utilities
from .file_utils import ( from .file_utils import (
CONFIG_NAME, CONFIG_NAME,
...@@ -1228,6 +1239,7 @@ if TYPE_CHECKING: ...@@ -1228,6 +1239,7 @@ if TYPE_CHECKING:
TF_WEIGHTS_NAME, TF_WEIGHTS_NAME,
TRANSFORMERS_CACHE, TRANSFORMERS_CACHE,
WEIGHTS_NAME, WEIGHTS_NAME,
TensorType,
add_end_docstrings, add_end_docstrings,
add_start_docstrings, add_start_docstrings,
cached_path, cached_path,
...@@ -1343,7 +1355,14 @@ if TYPE_CHECKING: ...@@ -1343,7 +1355,14 @@ if TYPE_CHECKING:
TransfoXLCorpus, TransfoXLCorpus,
TransfoXLTokenizer, TransfoXLTokenizer,
) )
from .models.wav2vec2 import WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP, Wav2Vec2Config, Wav2Vec2Tokenizer from .models.wav2vec2 import (
WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP,
Wav2Vec2Config,
Wav2Vec2CTCTokenizer,
Wav2Vec2FeatureExtractor,
Wav2Vec2Processor,
Wav2Vec2Tokenizer,
)
from .models.xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig, XLMTokenizer from .models.xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig, XLMTokenizer
from .models.xlm_prophetnet import XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMProphetNetConfig from .models.xlm_prophetnet import XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMProphetNetConfig
from .models.xlm_roberta import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMRobertaConfig from .models.xlm_roberta import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMRobertaConfig
...@@ -1381,7 +1400,6 @@ if TYPE_CHECKING: ...@@ -1381,7 +1400,6 @@ if TYPE_CHECKING:
CharSpan, CharSpan,
PreTrainedTokenizerBase, PreTrainedTokenizerBase,
SpecialTokensMixin, SpecialTokensMixin,
TensorType,
TokenSpan, TokenSpan,
) )
......
...@@ -20,8 +20,9 @@ from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union ...@@ -20,8 +20,9 @@ from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
import torch import torch
from torch.nn.utils.rnn import pad_sequence from torch.nn.utils.rnn import pad_sequence
from ..file_utils import PaddingStrategy
from ..modeling_utils import PreTrainedModel from ..modeling_utils import PreTrainedModel
from ..tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTrainedTokenizerBase from ..tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase
InputDataClass = NewType("InputDataClass", Any) InputDataClass = NewType("InputDataClass", Any)
...@@ -89,7 +90,7 @@ class DataCollatorWithPadding: ...@@ -89,7 +90,7 @@ class DataCollatorWithPadding:
Args: Args:
tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`): tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
The tokenizer used for encoding the data. The tokenizer used for encoding the data.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`): padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding index) Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
among: among:
...@@ -138,7 +139,7 @@ class DataCollatorForTokenClassification: ...@@ -138,7 +139,7 @@ class DataCollatorForTokenClassification:
Args: Args:
tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`): tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
The tokenizer used for encoding the data. The tokenizer used for encoding the data.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`): padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding index) Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
among: among:
...@@ -238,7 +239,7 @@ class DataCollatorForSeq2Seq: ...@@ -238,7 +239,7 @@ class DataCollatorForSeq2Seq:
prepare the `decoder_input_ids` prepare the `decoder_input_ids`
This is useful when using `label_smoothing` to avoid calculating loss twice. This is useful when using `label_smoothing` to avoid calculating loss twice.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`): padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding index) Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
among: among:
......
This diff is collapsed.
...@@ -27,9 +27,10 @@ import shutil ...@@ -27,9 +27,10 @@ import shutil
import sys import sys
import tarfile import tarfile
import tempfile import tempfile
from collections import OrderedDict from collections import OrderedDict, UserDict
from contextlib import contextmanager from contextlib import contextmanager
from dataclasses import fields from dataclasses import fields
from enum import Enum
from functools import partial, wraps from functools import partial, wraps
from hashlib import sha256 from hashlib import sha256
from pathlib import Path from pathlib import Path
...@@ -211,6 +212,7 @@ TF2_WEIGHTS_NAME = "tf_model.h5" ...@@ -211,6 +212,7 @@ TF2_WEIGHTS_NAME = "tf_model.h5"
TF_WEIGHTS_NAME = "model.ckpt" TF_WEIGHTS_NAME = "model.ckpt"
FLAX_WEIGHTS_NAME = "flax_model.msgpack" FLAX_WEIGHTS_NAME = "flax_model.msgpack"
CONFIG_NAME = "config.json" CONFIG_NAME = "config.json"
FEATURE_EXTRACTOR_NAME = "preprocessor_config.json"
MODEL_CARD_NAME = "modelcard.json" MODEL_CARD_NAME = "modelcard.json"
SENTENCEPIECE_UNDERLINE = "▁" SENTENCEPIECE_UNDERLINE = "▁"
...@@ -1400,6 +1402,52 @@ def is_tensor(x): ...@@ -1400,6 +1402,52 @@ def is_tensor(x):
return isinstance(x, np.ndarray) return isinstance(x, np.ndarray)
def _is_numpy(x):
    """Return True if `x` is a numpy array."""
    return isinstance(x, np.ndarray)
def _is_torch(x):
    """
    Return True if `x` is a PyTorch tensor.

    torch is imported lazily so this module stays importable without torch;
    callers are expected to guard with ``is_torch_available()`` first (see
    ``to_py_obj``).
    """
    import torch

    return isinstance(x, torch.Tensor)
def _is_torch_device(x):
    """
    Return True if `x` is a ``torch.device``.

    torch is imported lazily; callers should guard with ``is_torch_available()``.
    """
    import torch

    return isinstance(x, torch.device)
def _is_tensorflow(x):
    """
    Return True if `x` is a TensorFlow tensor.

    tensorflow is imported lazily; callers should guard with ``is_tf_available()``
    (see ``to_py_obj``).
    """
    import tensorflow as tf

    return isinstance(x, tf.Tensor)
def _is_jax(x):
    """
    Return True if `x` is a JAX ndarray.

    jax is imported lazily; callers should guard with the corresponding
    availability check before calling.
    """
    import jax.numpy as jnp  # noqa: F811

    return isinstance(x, jnp.ndarray)
def to_py_obj(obj):
    """
    Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a python list.

    Containers (dict/UserDict, list, tuple) are converted recursively; anything
    that is not a recognized tensor/array type is returned unchanged.
    """
    if isinstance(obj, (dict, UserDict)):
        return {key: to_py_obj(value) for key, value in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [to_py_obj(item) for item in obj]
    # Framework checks are guarded by availability so missing backends never get imported.
    if is_tf_available() and _is_tensorflow(obj):
        return obj.numpy().tolist()
    if is_torch_available() and _is_torch(obj):
        return obj.detach().cpu().tolist()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return obj
class ModelOutput(OrderedDict): class ModelOutput(OrderedDict):
""" """
Base class for all model outputs as dataclass. Has a ``__getitem__`` that allows indexing by integer or slice (like Base class for all model outputs as dataclass. Has a ``__getitem__`` that allows indexing by integer or slice (like
...@@ -1489,6 +1537,42 @@ class ModelOutput(OrderedDict): ...@@ -1489,6 +1537,42 @@ class ModelOutput(OrderedDict):
return tuple(self[k] for k in self.keys()) return tuple(self[k] for k in self.keys())
class ExplicitEnum(Enum):
    """
    Enum with more explicit error message for missing values.

    Plain :class:`enum.Enum` raises ``ValueError: 'x' is not a valid Cls`` without
    listing the accepted values; this subclass includes them in the message.
    """

    @classmethod
    def _missing_(cls, value):
        # Invoked by the Enum machinery when `value` matches no member; the
        # f-string produces the same message as the previous %-formatting.
        raise ValueError(
            f"{value!r} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}"
        )
class PaddingStrategy(ExplicitEnum):
    """
    Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for tab-completion
    in an IDE.
    """

    # Pad every sequence to the longest one in the batch.
    LONGEST = "longest"
    # Pad to a fixed maximum length (presumably the `max_length` argument — confirm at call sites).
    MAX_LENGTH = "max_length"
    # Disable padding entirely.
    DO_NOT_PAD = "do_not_pad"
class TensorType(ExplicitEnum):
    """
    Possible values for the ``return_tensors`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
    tab-completion in an IDE.
    """

    # Return PyTorch tensors.
    PYTORCH = "pt"
    # Return TensorFlow tensors.
    TENSORFLOW = "tf"
    # Return numpy arrays.
    NUMPY = "np"
    # Return JAX arrays.
    JAX = "jax"
class _BaseLazyModule(ModuleType): class _BaseLazyModule(ModuleType):
""" """
Module class that surfaces all objects but only performs associated imports when the objects are requested. Module class that surfaces all objects but only performs associated imports when the objects are requested.
......
...@@ -52,7 +52,7 @@ from ..roberta.tokenization_roberta import RobertaTokenizer ...@@ -52,7 +52,7 @@ from ..roberta.tokenization_roberta import RobertaTokenizer
from ..squeezebert.tokenization_squeezebert import SqueezeBertTokenizer from ..squeezebert.tokenization_squeezebert import SqueezeBertTokenizer
from ..tapas.tokenization_tapas import TapasTokenizer from ..tapas.tokenization_tapas import TapasTokenizer
from ..transfo_xl.tokenization_transfo_xl import TransfoXLTokenizer from ..transfo_xl.tokenization_transfo_xl import TransfoXLTokenizer
from ..wav2vec2.tokenization_wav2vec2 import Wav2Vec2Tokenizer from ..wav2vec2.tokenization_wav2vec2 import Wav2Vec2CTCTokenizer
from ..xlm.tokenization_xlm import XLMTokenizer from ..xlm.tokenization_xlm import XLMTokenizer
from .configuration_auto import ( from .configuration_auto import (
AlbertConfig, AlbertConfig,
...@@ -244,7 +244,7 @@ TOKENIZER_MAPPING = OrderedDict( ...@@ -244,7 +244,7 @@ TOKENIZER_MAPPING = OrderedDict(
(TapasConfig, (TapasTokenizer, None)), (TapasConfig, (TapasTokenizer, None)),
(LEDConfig, (LEDTokenizer, LEDTokenizerFast)), (LEDConfig, (LEDTokenizer, LEDTokenizerFast)),
(ConvBertConfig, (ConvBertTokenizer, ConvBertTokenizerFast)), (ConvBertConfig, (ConvBertTokenizer, ConvBertTokenizerFast)),
(Wav2Vec2Config, (Wav2Vec2Tokenizer, None)), (Wav2Vec2Config, (Wav2Vec2CTCTokenizer, None)),
] ]
) )
......
...@@ -18,8 +18,8 @@ ...@@ -18,8 +18,8 @@
import collections import collections
from typing import List, Optional, Union from typing import List, Optional, Union
from ...file_utils import add_end_docstrings, add_start_docstrings from ...file_utils import TensorType, add_end_docstrings, add_start_docstrings
from ...tokenization_utils_base import BatchEncoding, TensorType from ...tokenization_utils_base import BatchEncoding
from ...utils import logging from ...utils import logging
from ..bert.tokenization_bert import BertTokenizer from ..bert.tokenization_bert import BertTokenizer
...@@ -147,7 +147,7 @@ CUSTOM_DPR_READER_DOCSTRING = r""" ...@@ -147,7 +147,7 @@ CUSTOM_DPR_READER_DOCSTRING = r"""
The passages titles to be encoded. This can be a string or a list of strings if there are several passages. The passages titles to be encoded. This can be a string or a list of strings if there are several passages.
texts (:obj:`str` or :obj:`List[str]`): texts (:obj:`str` or :obj:`List[str]`):
The passages texts to be encoded. This can be a string or a list of strings if there are several passages. The passages texts to be encoded. This can be a string or a list of strings if there are several passages.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`): padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
Activates and controls padding. Accepts the following values: Activates and controls padding. Accepts the following values:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
...@@ -177,7 +177,7 @@ CUSTOM_DPR_READER_DOCSTRING = r""" ...@@ -177,7 +177,7 @@ CUSTOM_DPR_READER_DOCSTRING = r"""
If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
length is required by one of the truncation/padding parameters. If the model has no specific maximum length is required by one of the truncation/padding parameters. If the model has no specific maximum
input length (like XLNet) truncation/padding to a maximum length will be deactivated. input length (like XLNet) truncation/padding to a maximum length will be deactivated.
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`): return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
If set, will return tensors instead of list of python integers. Acceptable values are: If set, will return tensors instead of list of python integers. Acceptable values are:
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
......
...@@ -18,8 +18,8 @@ ...@@ -18,8 +18,8 @@
import collections import collections
from typing import List, Optional, Union from typing import List, Optional, Union
from ...file_utils import add_end_docstrings, add_start_docstrings from ...file_utils import TensorType, add_end_docstrings, add_start_docstrings
from ...tokenization_utils_base import BatchEncoding, TensorType from ...tokenization_utils_base import BatchEncoding
from ...utils import logging from ...utils import logging
from ..bert.tokenization_bert_fast import BertTokenizerFast from ..bert.tokenization_bert_fast import BertTokenizerFast
from .tokenization_dpr import DPRContextEncoderTokenizer, DPRQuestionEncoderTokenizer, DPRReaderTokenizer from .tokenization_dpr import DPRContextEncoderTokenizer, DPRQuestionEncoderTokenizer, DPRReaderTokenizer
...@@ -148,7 +148,7 @@ CUSTOM_DPR_READER_DOCSTRING = r""" ...@@ -148,7 +148,7 @@ CUSTOM_DPR_READER_DOCSTRING = r"""
The passages titles to be encoded. This can be a string or a list of strings if there are several passages. The passages titles to be encoded. This can be a string or a list of strings if there are several passages.
texts (:obj:`str` or :obj:`List[str]`): texts (:obj:`str` or :obj:`List[str]`):
The passages texts to be encoded. This can be a string or a list of strings if there are several passages. The passages texts to be encoded. This can be a string or a list of strings if there are several passages.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`): padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
Activates and controls padding. Accepts the following values: Activates and controls padding. Accepts the following values:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
...@@ -178,7 +178,7 @@ CUSTOM_DPR_READER_DOCSTRING = r""" ...@@ -178,7 +178,7 @@ CUSTOM_DPR_READER_DOCSTRING = r"""
If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
length is required by one of the truncation/padding parameters. If the model has no specific maximum length is required by one of the truncation/padding parameters. If the model has no specific maximum
input length (like XLNet) truncation/padding to a maximum length will be deactivated. input length (like XLNet) truncation/padding to a maximum length will be deactivated.
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`): return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
If set, will return tensors instead of list of python integers. Acceptable values are: If set, will return tensors instead of list of python integers. Acceptable values are:
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
......
...@@ -573,7 +573,7 @@ class RagRetriever: ...@@ -573,7 +573,7 @@ class RagRetriever:
The prefix used by the generator's tokenizer. The prefix used by the generator's tokenizer.
n_docs (:obj:`int`, `optional`): n_docs (:obj:`int`, `optional`):
The number of docs retrieved per query. The number of docs retrieved per query.
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`, defaults to "pt"): return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`, defaults to "pt"):
If set, will return tensors instead of list of python integers. Acceptable values are: If set, will return tensors instead of list of python integers. Acceptable values are:
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
......
...@@ -28,16 +28,13 @@ from typing import Callable, Dict, Generator, List, Optional, Text, Tuple, Union ...@@ -28,16 +28,13 @@ from typing import Callable, Dict, Generator, List, Optional, Text, Tuple, Union
import numpy as np import numpy as np
from ...file_utils import add_end_docstrings, is_pandas_available from ...file_utils import ExplicitEnum, PaddingStrategy, TensorType, add_end_docstrings, is_pandas_available
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...tokenization_utils_base import ( from ...tokenization_utils_base import (
ENCODE_KWARGS_DOCSTRING, ENCODE_KWARGS_DOCSTRING,
BatchEncoding, BatchEncoding,
EncodedInput, EncodedInput,
ExplicitEnum,
PaddingStrategy,
PreTokenizedInput, PreTokenizedInput,
TensorType,
TextInput, TextInput,
) )
from ...utils import logging from ...utils import logging
...@@ -151,7 +148,7 @@ def whitespace_tokenize(text): ...@@ -151,7 +148,7 @@ def whitespace_tokenize(text):
TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r""" TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`): add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to encode the sequences with the special tokens relative to their model. Whether or not to encode the sequences with the special tokens relative to their model.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`): padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
Activates and controls padding. Accepts the following values: Activates and controls padding. Accepts the following values:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
...@@ -180,7 +177,7 @@ TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r""" ...@@ -180,7 +177,7 @@ TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
pad_to_multiple_of (:obj:`int`, `optional`): pad_to_multiple_of (:obj:`int`, `optional`):
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`): return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
If set, will return tensors instead of list of python integers. Acceptable values are: If set, will return tensors instead of list of python integers. Acceptable values are:
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
......
...@@ -22,7 +22,9 @@ from ...file_utils import _BaseLazyModule, is_tokenizers_available, is_torch_ava ...@@ -22,7 +22,9 @@ from ...file_utils import _BaseLazyModule, is_tokenizers_available, is_torch_ava
_import_structure = { _import_structure = {
"configuration_wav2vec2": ["WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Wav2Vec2Config"], "configuration_wav2vec2": ["WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Wav2Vec2Config"],
"tokenization_wav2vec2": ["Wav2Vec2Tokenizer"], "tokenization_wav2vec2": ["Wav2Vec2CTCTokenizer", "Wav2Vec2Tokenizer"],
"feature_extraction_wav2vec2": ["Wav2Vec2FeatureExtractor"],
"processing_wav2vec2": ["Wav2Vec2Processor"],
} }
if is_torch_available(): if is_torch_available():
...@@ -37,7 +39,9 @@ if is_torch_available(): ...@@ -37,7 +39,9 @@ if is_torch_available():
if TYPE_CHECKING: if TYPE_CHECKING:
from .configuration_wav2vec2 import WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP, Wav2Vec2Config from .configuration_wav2vec2 import WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP, Wav2Vec2Config
from .tokenization_wav2vec2 import Wav2Vec2Tokenizer from .feature_extraction_wav2vec2 import Wav2Vec2FeatureExtractor
from .processing_wav2vec2 import Wav2Vec2Processor
from .tokenization_wav2vec2 import Wav2Vec2CTCTokenizer, Wav2Vec2Tokenizer
if is_torch_available(): if is_torch_available():
from .modeling_wav2vec2 import ( from .modeling_wav2vec2 import (
......
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Feature extractor class for Wav2Vec2
"""
from typing import List, Optional, Union
import numpy as np
from ...feature_extraction_utils import BatchFeature, PreTrainedFeatureExtractor
from ...file_utils import PaddingStrategy, TensorType
from ...utils import logging
logger = logging.get_logger(__name__)
class Wav2Vec2FeatureExtractor(PreTrainedFeatureExtractor):
    r"""
    Constructs a Wav2Vec2 feature extractor.

    This feature extractor inherits from
    :class:`~transformers.feature_extraction_utils.PreTrainedFeatureExtractor` which contains most of the main
    methods. Users should refer to this superclass for more information regarding those methods.

    Args:
        feature_size (:obj:`int`, defaults to 1):
            The feature dimension of the extracted features.
        sampling_rate (:obj:`int`, defaults to 16000):
            The sampling rate at which the audio files should be digitalized expressed in Hertz per second (Hz).
        padding_value (:obj:`float`, defaults to 0.0):
            The value that is used to fill the padding values.
        do_normalize (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
            improve the performance for some models, *e.g.*, `wav2vec2-lv60
            <https://huggingface.co/models?search=lv60>`__.
        return_attention_mask (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not :meth:`~transformers.Wav2Vec2FeatureExtractor.__call__` should return
            :obj:`attention_mask`.

            .. note::

                Wav2Vec2 models that have set ``config.feat_extract_norm == "group"``, such as `wav2vec2-base
                <https://huggingface.co/facebook/wav2vec2-base-960h>`__, have **not** been trained using
                :obj:`attention_mask`. For such models, :obj:`input_values` should simply be padded with 0 and no
                :obj:`attention_mask` should be passed.

                For Wav2Vec2 models that have set ``config.feat_extract_norm == "layer"``, such as `wav2vec2-lv60
                <https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self>`__, :obj:`attention_mask` should be
                passed for batched inference.
    """

    model_input_names = ["input_values", "attention_mask"]

    def __init__(
        self,
        feature_size=1,
        sampling_rate=16000,
        padding_value=0.0,
        return_attention_mask=False,
        do_normalize=True,
        **kwargs
    ):
        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
        self.return_attention_mask = return_attention_mask
        self.do_normalize = do_normalize

    @staticmethod
    def zero_mean_unit_var_norm(input_values: List[np.ndarray]) -> List[np.ndarray]:
        """
        Every array in the list is normalized to have zero mean and unit variance. The small epsilon added to the
        variance guards against division by zero for constant (e.g. all-zero) inputs.
        """
        return [(x - np.mean(x)) / np.sqrt(np.var(x) + 1e-5) for x in input_values]

    def __call__(
        self,
        raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
        padding: Union[bool, str, PaddingStrategy] = False,
        max_length: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        sampling_rate: Optional[int] = None,
        **kwargs
    ) -> BatchFeature:
        """
        Main method to featurize and prepare for the model one or several sequence(s).

        Args:
            raw_speech (:obj:`np.ndarray`, :obj:`List[float]`, :obj:`List[np.ndarray]`, :obj:`List[List[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values.
            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:

                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                  single sequence if provided).
                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided.
                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                  different lengths).
            max_length (:obj:`int`, `optional`):
                Maximum length of the returned list and optionally padding length (see above).
            pad_to_multiple_of (:obj:`int`, `optional`):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                >= 7.5 (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
            sampling_rate (:obj:`int`, `optional`):
                The sampling rate at which the ``raw_speech`` input was sampled. It is strongly recommended to pass
                ``sampling_rate`` at the forward call to prevent silent errors.
        """
        if sampling_rate is not None:
            # Guard against silently featurizing audio recorded at the wrong rate.
            if sampling_rate != self.sampling_rate:
                raise ValueError(
                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
                    f" {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled with"
                    f" {self.sampling_rate} and not {sampling_rate}."
                )
        else:
            logger.warning(
                "It is strongly recommended to pass the ``sampling_rate`` argument to this function. "
                "Failing to do so can result in silent errors that might be hard to debug."
            )

        # A batched input is a list/tuple whose first element is itself a sequence (array or list).
        is_batched = bool(
            isinstance(raw_speech, (list, tuple))
            and (isinstance(raw_speech[0], np.ndarray) or isinstance(raw_speech[0], (tuple, list)))
        )

        # make sure input is in list format
        if is_batched and not isinstance(raw_speech[0], np.ndarray):
            raw_speech = [np.asarray(speech) for speech in raw_speech]
        elif not is_batched and not isinstance(raw_speech, np.ndarray):
            raw_speech = np.asarray(raw_speech)

        # always return batch
        if not is_batched:
            raw_speech = [raw_speech]

        # zero-mean and unit-variance normalization
        if self.do_normalize:
            raw_speech = self.zero_mean_unit_var_norm(raw_speech)

        # convert into correct format for padding
        encoded_inputs = BatchFeature({"input_values": raw_speech})

        padded_inputs = self.pad(
            encoded_inputs,
            padding=padding,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=self.return_attention_mask,
            return_tensors=return_tensors,
        )

        return padded_inputs
...@@ -616,9 +616,9 @@ WAV_2_VEC_2_INPUTS_DOCSTRING = r""" ...@@ -616,9 +616,9 @@ WAV_2_VEC_2_INPUTS_DOCSTRING = r"""
input_values (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`): input_values (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the :class:`~transformers.Wav2Vec2Tokenizer` should soundfile`). To prepare the array into `input_values`, the :class:`~transformers.Wav2Vec2Processor` should
be used for padding and conversion into a tensor of type `torch.FloatTensor`. See be used for padding and conversion into a tensor of type `torch.FloatTensor`. See
:meth:`transformers.Wav2Vec2Tokenizer.__call__` for details. :meth:`transformers.Wav2Vec2Processor.__call__` for details.
attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Mask to avoid performing convolution and attention on padding token indices. Mask values selected in ``[0, Mask to avoid performing convolution and attention on padding token indices. Mask values selected in ``[0,
1]``: 1]``:
...@@ -629,8 +629,8 @@ WAV_2_VEC_2_INPUTS_DOCSTRING = r""" ...@@ -629,8 +629,8 @@ WAV_2_VEC_2_INPUTS_DOCSTRING = r"""
`What are attention masks? <../glossary.html#attention-mask>`__ `What are attention masks? <../glossary.html#attention-mask>`__
.. warning:: .. warning::
:obj:`attention_mask` should only be passed if the corresponding tokenizer has :obj:`attention_mask` should only be passed if the corresponding processor has
``config.return_attention_mask == True``. For all models whose tokenizer has ``config.return_attention_mask == True``. For all models whose processor has
``config.return_attention_mask == False``, such as `wav2vec2-base ``config.return_attention_mask == False``, such as `wav2vec2-base
<https://huggingface.co/facebook/wav2vec2-base-960h>`__, :obj:`attention_mask` should **not** be passed <https://huggingface.co/facebook/wav2vec2-base-960h>`__, :obj:`attention_mask` should **not** be passed
to avoid degraded performance when doing batched inference. For such models :obj:`input_values` should to avoid degraded performance when doing batched inference. For such models :obj:`input_values` should
...@@ -682,11 +682,11 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel): ...@@ -682,11 +682,11 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel):
Example:: Example::
>>> from transformers import Wav2Vec2Tokenizer, Wav2Vec2Model >>> from transformers import Wav2Vec2Processor, Wav2Vec2Model
>>> from datasets import load_dataset >>> from datasets import load_dataset
>>> import soundfile as sf >>> import soundfile as sf
>>> tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h") >>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
>>> model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h") >>> model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
>>> def map_to_array(batch): >>> def map_to_array(batch):
...@@ -697,7 +697,7 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel): ...@@ -697,7 +697,7 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel):
>>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation") >>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.map(map_to_array) >>> ds = ds.map(map_to_array)
>>> input_values = tokenizer(ds["speech"][0], return_tensors="pt").input_values # Batch size 1 >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1
>>> hidden_states = model(input_values).last_hidden_state >>> hidden_states = model(input_values).last_hidden_state
""" """
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
...@@ -780,11 +780,11 @@ class Wav2Vec2ForMaskedLM(Wav2Vec2PreTrainedModel): ...@@ -780,11 +780,11 @@ class Wav2Vec2ForMaskedLM(Wav2Vec2PreTrainedModel):
Example:: Example::
>>> from transformers import Wav2Vec2Tokenizer, Wav2Vec2Model >>> from transformers import Wav2Vec2Processor, Wav2Vec2Model
>>> from datasets import load_dataset >>> from datasets import load_dataset
>>> import soundfile as sf >>> import soundfile as sf
>>> tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h") >>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
>>> model = Wav2Vec2ForMaskedLM.from_pretrained("facebook/wav2vec2-base-960h") >>> model = Wav2Vec2ForMaskedLM.from_pretrained("facebook/wav2vec2-base-960h")
>>> def map_to_array(batch): >>> def map_to_array(batch):
...@@ -795,11 +795,11 @@ class Wav2Vec2ForMaskedLM(Wav2Vec2PreTrainedModel): ...@@ -795,11 +795,11 @@ class Wav2Vec2ForMaskedLM(Wav2Vec2PreTrainedModel):
>>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation") >>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.map(map_to_array) >>> ds = ds.map(map_to_array)
>>> input_values = tokenizer(ds["speech"][0], return_tensors="pt").input_values # Batch size 1 >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1
>>> logits = model(input_values).logits >>> logits = model(input_values).logits
>>> predicted_ids = torch.argmax(logits, dim=-1) >>> predicted_ids = torch.argmax(logits, dim=-1)
>>> transcription = tokenizer.decode(predicted_ids[0]) >>> transcription = processor.decode(predicted_ids[0])
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
...@@ -856,11 +856,11 @@ class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel): ...@@ -856,11 +856,11 @@ class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel):
Example:: Example::
>>> import torch >>> import torch
>>> from transformers import Wav2Vec2Tokenizer, Wav2Vec2ForCTC >>> from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
>>> from datasets import load_dataset >>> from datasets import load_dataset
>>> import soundfile as sf >>> import soundfile as sf
>>> tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h") >>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
>>> model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h") >>> model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
>>> def map_to_array(batch): >>> def map_to_array(batch):
...@@ -871,11 +871,11 @@ class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel): ...@@ -871,11 +871,11 @@ class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel):
>>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation") >>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.map(map_to_array) >>> ds = ds.map(map_to_array)
>>> input_values = tokenizer(ds["speech"][0], return_tensors="pt").input_values # Batch size 1 >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1
>>> logits = model(input_values).logits >>> logits = model(input_values).logits
>>> predicted_ids = torch.argmax(logits, dim=-1) >>> predicted_ids = torch.argmax(logits, dim=-1)
>>> transcription = tokenizer.decode(predicted_ids[0]) >>> transcription = processor.decode(predicted_ids[0])
""" """
return_dict = return_dict if return_dict is not None else self.config.use_return_dict return_dict = return_dict if return_dict is not None else self.config.use_return_dict
......
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Speech processor class for Wav2Vec2
"""
from contextlib import contextmanager
from .feature_extraction_wav2vec2 import Wav2Vec2FeatureExtractor
from .tokenization_wav2vec2 import Wav2Vec2CTCTokenizer
class Wav2Vec2Processor:
    r"""
    Constructs a Wav2Vec2 processor which wraps a Wav2Vec2 feature extractor and a Wav2Vec2 CTC tokenizer into a single
    processor.

    :class:`~transformers.Wav2Vec2Processor` offers all the functionalities of
    :class:`~transformers.Wav2Vec2FeatureExtractor` and :class:`~transformers.Wav2Vec2CTCTokenizer`. See the docstring
    of :meth:`~transformers.Wav2Vec2Processor.__call__` and :meth:`~transformers.Wav2Vec2Processor.decode` for more
    information.

    Args:
        feature_extractor (:obj:`Wav2Vec2FeatureExtractor`):
            An instance of :class:`~transformers.Wav2Vec2FeatureExtractor`. The feature extractor is a required input.
        tokenizer (:obj:`Wav2Vec2CTCTokenizer`):
            An instance of :class:`~transformers.Wav2Vec2CTCTokenizer`. The tokenizer is a required input.
    """

    def __init__(self, feature_extractor, tokenizer):
        # Use ``__name__`` in the error messages: ``SomeClass.__class__`` evaluates to the
        # metaclass (``type``), which would make the message read "<class 'type'>".
        if not isinstance(feature_extractor, Wav2Vec2FeatureExtractor):
            raise ValueError(
                f"`feature_extractor` has to be of type {Wav2Vec2FeatureExtractor.__name__}, but is {type(feature_extractor)}"
            )
        if not isinstance(tokenizer, Wav2Vec2CTCTokenizer):
            raise ValueError(f"`tokenizer` has to be of type {Wav2Vec2CTCTokenizer.__name__}, but is {type(tokenizer)}")

        self.feature_extractor = feature_extractor
        self.tokenizer = tokenizer
        # ``current_processor`` decides where ``__call__`` forwards its arguments; it is switched
        # to the tokenizer inside :meth:`as_target_processor`.
        self.current_processor = self.feature_extractor

    def save_pretrained(self, save_directory):
        """
        Save a Wav2Vec2 feature_extractor object and Wav2Vec2 tokenizer object to the directory ``save_directory``, so
        that it can be re-loaded using the :func:`~transformers.Wav2Vec2Processor.from_pretrained` class method.

        .. note::

            This class method is simply calling :meth:`~transformers.PreTrainedFeatureExtractor.save_pretrained` and
            :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.save_pretrained`. Please refer to the
            docstrings of the methods above for more information.

        Args:
            save_directory (:obj:`str` or :obj:`os.PathLike`):
                Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
                be created if it does not exist).
        """
        self.feature_extractor.save_pretrained(save_directory)
        self.tokenizer.save_pretrained(save_directory)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        r"""
        Instantiate a :class:`~transformers.Wav2Vec2Processor` from a pretrained Wav2Vec2 processor.

        .. note::

            This class method is simply calling Wav2Vec2FeatureExtractor's
            :meth:`~transformers.PreTrainedFeatureExtractor.from_pretrained` and Wav2Vec2CTCTokenizer's
            :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained`. Please refer to the
            docstrings of the methods above for more information.

        Args:
            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
                This can be either:

                - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
                  huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
                  namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing a feature extractor file saved using the
                  :meth:`~transformers.PreTrainedFeatureExtractor.save_pretrained` method, e.g.,
                  ``./my_model_directory/``.
                - a path or url to a saved feature extractor JSON `file`, e.g.,
                  ``./my_model_directory/feature_extraction_config.json``.
            **kwargs
                Additional keyword arguments passed along to both :class:`~transformers.PreTrainedFeatureExtractor` and
                :class:`~transformers.PreTrainedTokenizer`
        """
        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
        tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)

        return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)

    def __call__(self, *args, **kwargs):
        """
        When used in normal mode, this method forwards all its arguments to Wav2Vec2FeatureExtractor's
        :meth:`~transformers.Wav2Vec2FeatureExtractor.__call__` and returns its output. If used in the context
        :meth:`~transformers.Wav2Vec2Processor.as_target_processor` this method forwards all its arguments to
        Wav2Vec2CTCTokenizer's :meth:`~transformers.Wav2Vec2CTCTokenizer.__call__`. Please refer to the docstring of
        the above two methods for more information.
        """
        return self.current_processor(*args, **kwargs)

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Wav2Vec2CTCTokenizer's
        :meth:`~transformers.PreTrainedTokenizer.batch_decode`. Please refer to the docstring of this method for more
        information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Wav2Vec2CTCTokenizer's
        :meth:`~transformers.PreTrainedTokenizer.decode`. Please refer to the docstring of this method for more
        information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @contextmanager
    def as_target_processor(self):
        """
        Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning
        Wav2Vec2.
        """
        self.current_processor = self.tokenizer
        yield
        self.current_processor = self.feature_extractor
...@@ -16,14 +16,16 @@ ...@@ -16,14 +16,16 @@
import json import json
import os import os
import sys
import warnings
from itertools import groupby from itertools import groupby
from typing import Dict, List, Optional, Tuple, Union from typing import Dict, List, Optional, Tuple, Union
import numpy as np import numpy as np
from ...file_utils import add_end_docstrings from ...file_utils import PaddingStrategy, TensorType, add_end_docstrings
from ...tokenization_utils import PreTrainedTokenizer from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, TensorType from ...tokenization_utils_base import BatchEncoding
from ...utils import logging from ...utils import logging
...@@ -37,7 +39,7 @@ VOCAB_FILES_NAMES = { ...@@ -37,7 +39,7 @@ VOCAB_FILES_NAMES = {
WAV2VEC2_KWARGS_DOCSTRING = r""" WAV2VEC2_KWARGS_DOCSTRING = r"""
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`): padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
Activates and controls padding. Accepts the following values: Activates and controls padding. Accepts the following values:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
...@@ -55,7 +57,7 @@ WAV2VEC2_KWARGS_DOCSTRING = r""" ...@@ -55,7 +57,7 @@ WAV2VEC2_KWARGS_DOCSTRING = r"""
pad_to_multiple_of (:obj:`int`, `optional`): pad_to_multiple_of (:obj:`int`, `optional`):
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`): return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
If set, will return tensors instead of list of python integers. Acceptable values are: If set, will return tensors instead of list of python integers. Acceptable values are:
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
...@@ -66,6 +68,207 @@ WAV2VEC2_KWARGS_DOCSTRING = r""" ...@@ -66,6 +68,207 @@ WAV2VEC2_KWARGS_DOCSTRING = r"""
""" """
class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
    """
    Constructs a Wav2Vec2CTC tokenizer.

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains some of the main methods.
    Users should refer to the superclass for more information regarding such methods.

    Args:
        vocab_file (:obj:`str`):
            File containing the vocabulary.
        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
            The beginning of sentence token.
        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
            The end of sentence token.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        word_delimiter_token (:obj:`str`, `optional`, defaults to :obj:`"|"`):
            The token used for defining the end of a word.
        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to accept lowercase input and lowercase the output when decoding.

        **kwargs
            Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer`
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = {
        "vocab_file": {
            "facebook/wav2vec2-base-960h": "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/vocab.json"
        },
        "tokenizer_config_file": {
            "facebook/wav2vec2-base-960h": "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/tokenizer.json",
        },
    }

    # Wav2Vec2 has no max input length
    max_model_input_sizes = {"facebook/wav2vec2-base-960h": sys.maxsize}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        word_delimiter_token="|",
        do_lower_case=False,
        **kwargs
    ):
        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            do_lower_case=do_lower_case,
            word_delimiter_token=word_delimiter_token,
            **kwargs,
        )
        self._word_delimiter_token = word_delimiter_token

        self.do_lower_case = do_lower_case

        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        # Reverse mapping (id -> token) used for decoding.
        self.decoder = {v: k for k, v in self.encoder.items()}

    @property
    def word_delimiter_token(self) -> str:
        """
        :obj:`str`: Word delimiter token. Log an error if used while not having been set.
        """
        if self._word_delimiter_token is None and self.verbose:
            logger.error("Using word_delimiter_token, but it is not set yet.")
            return None
        return str(self._word_delimiter_token)

    @property
    def word_delimiter_token_id(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: Id of the word_delimiter_token in the vocabulary. Returns :obj:`None` if the token has
        not been set.
        """
        if self._word_delimiter_token is None:
            return None
        return self.convert_tokens_to_ids(self.word_delimiter_token)

    @word_delimiter_token.setter
    def word_delimiter_token(self, value):
        self._word_delimiter_token = value

    @word_delimiter_token_id.setter
    def word_delimiter_token_id(self, value):
        self._word_delimiter_token = self.convert_tokens_to_ids(value)

    @property
    def vocab_size(self) -> int:
        return len(self.decoder)

    def get_vocab(self) -> Dict:
        return dict(self.encoder, **self.added_tokens_encoder)

    def _tokenize(self, text, **kwargs):
        """
        Converts a string in a sequence of tokens (string), using the tokenizer.
        """
        if self.do_lower_case:
            # The pretrained vocabularies are uppercase; lowercase input is accepted by
            # upper-casing here and lower-casing again in ``convert_tokens_to_string``.
            text = text.upper()

        return list(text.replace(" ", self.word_delimiter_token))

    def _convert_token_to_id(self, token: str) -> int:
        """Converts a token (str) in an index (integer) using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index: int) -> str:
        """Converts an index (integer) in a token (str) using the vocab."""
        result = self.decoder.get(index, self.unk_token)
        return result

    def convert_tokens_to_string(
        self, tokens: List[str], group_tokens: bool = True, spaces_between_special_tokens: bool = False
    ) -> str:
        """
        Converts a connectionist-temporal-classification (CTC) output tokens into a single string.
        """
        # group same tokens into non-repeating tokens in CTC style decoding
        if group_tokens:
            tokens = [token_group[0] for token_group in groupby(tokens)]

        # filter self.pad_token which is used as CTC-blank token
        filtered_tokens = list(filter(lambda token: token != self.pad_token, tokens))

        if spaces_between_special_tokens:
            join_token = " "
        else:
            join_token = ""

        # replace delimiter token
        string = join_token.join(
            [" " if token == self.word_delimiter_token else token for token in filtered_tokens]
        ).strip()

        if self.do_lower_case:
            string = string.lower()
        return string

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
        if is_split_into_words:
            text = " " + text
        return (text, kwargs)

    def _decode(
        self,
        token_ids: List[int],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = True,
        group_tokens: bool = True,
        spaces_between_special_tokens: bool = False,
    ) -> str:
        """
        special _decode function is needed for Wav2Vec2CTCTokenizer because added tokens should be treated exactly the
        same as tokens of the base vocabulary and therefore the function `convert_tokens_to_string` has to be called on
        the whole token list and not individually on added tokens
        """
        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)

        result = []
        for token in filtered_tokens:
            # compare against the special *tokens* (strings), not their ids (ints) —
            # ``token`` is already a string at this point
            if skip_special_tokens and token in self.all_special_tokens:
                continue
            result.append(token)

        text = self.convert_tokens_to_string(
            result, group_tokens=group_tokens, spaces_between_special_tokens=spaces_between_special_tokens
        )

        if clean_up_tokenization_spaces:
            clean_text = self.clean_up_tokenization(text)
            return clean_text
        else:
            return text

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, ensure_ascii=False))

        return (vocab_file,)
class Wav2Vec2Tokenizer(PreTrainedTokenizer): class Wav2Vec2Tokenizer(PreTrainedTokenizer):
""" """
Constructs a Wav2Vec2 tokenizer. Constructs a Wav2Vec2 tokenizer.
...@@ -146,6 +349,12 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer): ...@@ -146,6 +349,12 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer):
word_delimiter_token=word_delimiter_token, word_delimiter_token=word_delimiter_token,
**kwargs, **kwargs,
) )
warnings.warn(
"The class `Wav2Vec2Tokenizer` is deprecated and will be removed in version 5 of Transformers. Please use `Wav2Vec2Processor` or `Wav2Vec2CTCTokenizer` instead.",
FutureWarning,
)
self._word_delimiter_token = word_delimiter_token self._word_delimiter_token = word_delimiter_token
self.do_lower_case = do_lower_case self.do_lower_case = do_lower_case
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment