"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "22a69f1d7d520d5fbccbdb163d05db56bf79724c"
Unverified Commit 1417978c authored by Anton Lozhkov, committed by GitHub

[SequenceFeatureExtractor] Rewrite padding logic from pure python to numpy (#13650)

* Test np padding

* Pass feature extraction tests

* Update type hints

* Fix flaky integration tests

* Try a more stable waveform

* Add to_numpy jax support

* int32 attention masks

* Refactor normalization tests
parent 8d533e6a
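The heart of the change is replacing pure-python list concatenation with vectorized numpy padding. A minimal sketch of the before/after behavior (names like `seq` and `difference` are illustrative, not the extractor's actual variables):

```python
import numpy as np

seq = np.ones((3, 24), dtype=np.float32)  # (num_frames, feature_size)
difference = 2                            # frames of padding needed

# before: pure python - build a list of padding vectors and concatenate
padded_list = seq.tolist() + [[0.0] * 24 for _ in range(difference)]

# after: one vectorized call; ((0, difference), (0, 0)) pads only the time axis
padded = np.pad(seq, ((0, difference), (0, 0)), "constant", constant_values=0.0)

assert np.array_equal(padded, np.asarray(padded_list, dtype=np.float32))
```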
@@ -27,7 +27,7 @@ from .file_utils import (
     _is_torch,
     is_tf_available,
     is_torch_available,
-    to_py_obj,
+    to_numpy,
 )
 from .utils import logging
@@ -151,7 +151,7 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
                 processed_features["attention_mask"] = []
             return processed_features
-        # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
+        # If we have PyTorch/TF tensors or lists as inputs, we cast them as Numpy arrays
         # and rebuild them afterwards if no return_tensors is specified
         # Note that we lose the specific device the tensor may be on for PyTorch
@@ -163,28 +163,31 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
                 index += 1
             if index < len(required_input):
                 first_element = required_input[index][0]
-        # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
-        if not isinstance(first_element, (float, int, list, tuple)):
+        if return_tensors is None:
             if is_tf_available() and _is_tensorflow(first_element):
-                return_tensors = "tf" if return_tensors is None else return_tensors
+                return_tensors = "tf"
             elif is_torch_available() and _is_torch(first_element):
-                return_tensors = "pt" if return_tensors is None else return_tensors
-            elif isinstance(first_element, np.ndarray):
-                return_tensors = "np" if return_tensors is None else return_tensors
+                return_tensors = "pt"
+            elif isinstance(first_element, (int, float, list, tuple, np.ndarray)):
+                return_tensors = "np"
             else:
                 raise ValueError(
                     f"type of {first_element} unknown: {type(first_element)}. "
                     f"Should be one of a python, numpy, pytorch or tensorflow object."
                 )
-            for key, value in processed_features.items():
-                processed_features[key] = to_py_obj(value)
+        for key, value in processed_features.items():
+            if isinstance(value[0], (int, float)):
+                processed_features[key] = to_numpy(value)
+            else:
+                processed_features[key] = [to_numpy(v) for v in value]
         # Convert padding_strategy in PaddingStrategy
         padding_strategy = self._get_padding_strategies(padding=padding, max_length=max_length)
         required_input = processed_features[self.model_input_names[0]]
-        if required_input and not isinstance(required_input[0], (list, tuple)):
+        if required_input and not isinstance(required_input[0], np.ndarray):
             # truncation
             processed_features = self._truncate(
                 processed_features,
@@ -203,9 +206,8 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
             return BatchFeature(processed_features, tensor_type=return_tensors)
         batch_size = len(required_input)
-        assert all(
-            len(v) == batch_size for v in processed_features.values()
-        ), "Some items in the output dictionary have a different batch size than others."
+        if not all(len(v) == batch_size for v in processed_features.values()):
+            raise ValueError("Some items in the output dictionary have a different batch size than others.")
         truncated_inputs = []
         for i in range(batch_size):
@@ -244,7 +246,7 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
     def _pad(
         self,
-        processed_features: Union[Dict[str, List[float]], BatchFeature],
+        processed_features: Union[Dict[str, np.ndarray], BatchFeature],
         max_length: Optional[int] = None,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         pad_to_multiple_of: Optional[int] = None,
@@ -254,7 +256,7 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
         Pad inputs (on left/right and up to predefined length or max length in the batch)
         Args:
-            processed_features: Dictionary of input values (`List[float]`) / input vectors (`List[List[float]]`) or batch of inputs values (`List[List[int]]`) / input vectors (`List[List[List[int]]]`)
+            processed_features: Dictionary of input values (`np.ndarray[float]`) / input vectors (`List[np.ndarray[float]]`) or batch of inputs values (`List[np.ndarray[int]]`) / input vectors (`List[np.ndarray[int]]`)
             max_length: maximum length of the returned list and optionally padding length (see below)
             padding_strategy: PaddingStrategy to use for padding.
@@ -278,42 +280,47 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
         if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
             max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
-        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
+        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) < max_length
         if needs_to_be_padded:
             difference = max_length - len(required_input)
-            padding_vector = self.feature_size * [self.padding_value] if self.feature_size > 1 else self.padding_value
             if self.padding_side == "right":
                 if return_attention_mask:
-                    processed_features["attention_mask"] = [1] * len(required_input) + [0] * difference
-                processed_features[self.model_input_names[0]] = required_input + [
-                    padding_vector for _ in range(difference)
-                ]
+                    attention_mask = np.zeros(max_length, dtype=np.int32)
+                    attention_mask[: len(required_input)] = 1
+                    processed_features["attention_mask"] = attention_mask
+                padding_shape = ((0, difference), (0, 0)) if self.feature_size > 1 else (0, difference)
+                processed_features[self.model_input_names[0]] = np.pad(
+                    required_input, padding_shape, "constant", constant_values=self.padding_value
+                )
             elif self.padding_side == "left":
                 if return_attention_mask:
-                    processed_features["attention_mask"] = [0] * difference + [1] * len(required_input)
-                processed_features[self.model_input_names[0]] = [
-                    padding_vector for _ in range(difference)
-                ] + required_input
+                    attention_mask = np.zeros(max_length, dtype=np.int32)
+                    attention_mask[-len(required_input) :] = 1
+                    processed_features["attention_mask"] = attention_mask
+                padding_shape = ((difference, 0), (0, 0)) if self.feature_size > 1 else (difference, 0)
+                processed_features[self.model_input_names[0]] = np.pad(
+                    required_input, padding_shape, "constant", constant_values=self.padding_value
+                )
             else:
                 raise ValueError("Invalid padding strategy:" + str(self.padding_side))
         elif return_attention_mask and "attention_mask" not in processed_features:
-            processed_features["attention_mask"] = [1] * len(required_input)
+            processed_features["attention_mask"] = np.ones(len(required_input), dtype=np.int32)
         return processed_features
     def _truncate(
         self,
-        processed_features: Union[Dict[str, List[float]], BatchFeature],
+        processed_features: Union[Dict[str, np.ndarray], BatchFeature],
         max_length: Optional[int] = None,
         pad_to_multiple_of: Optional[int] = None,
         truncation: Optional[bool] = None,
     ):
         """
-        Pad inputs (on left/right and up to predefined length or max length in the batch)
+        Truncate inputs to predefined length or max length in the batch
         Args:
-            processed_features: Dictionary of input values (`List[float]`) / input vectors (`List[List[float]]`) or batch of inputs values (`List[List[int]]`) / input vectors (`List[List[List[int]]]`)
+            processed_features: Dictionary of input values (`np.ndarray[float]`) / input vectors (`List[np.ndarray[float]]`) or batch of inputs values (`List[np.ndarray[int]]`) / input vectors (`List[np.ndarray[int]]`)
             max_length: maximum length of the returned list and optionally padding length (see below)
             pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                 This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
......
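Two details in the hunk above are worth spelling out: attention masks are now materialized as `int32` numpy arrays via slice assignment, and `pad_to_multiple_of` rounds `max_length` up before any padding happens. A small self-contained sketch (all values made up for illustration):

```python
import numpy as np

# pad_to_multiple_of rounds max_length up to the next multiple before padding
max_length, pad_to_multiple_of = 100, 16
if max_length % pad_to_multiple_of != 0:
    max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
assert max_length == 112

# right padding: an int32 zero mask with the real frames marked at the front
input_length = 80
attention_mask = np.zeros(max_length, dtype=np.int32)
attention_mask[:input_length] = 1

# left padding marks the tail instead
left_mask = np.zeros(max_length, dtype=np.int32)
left_mask[-input_length:] = 1
assert attention_mask.sum() == left_mask.sum() == input_length
```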
@@ -1855,12 +1855,32 @@ def to_py_obj(obj):
         return obj.numpy().tolist()
     elif is_torch_available() and _is_torch(obj):
         return obj.detach().cpu().tolist()
+    elif is_flax_available() and _is_jax(obj):
+        return np.asarray(obj).tolist()
     elif isinstance(obj, np.ndarray):
         return obj.tolist()
     else:
         return obj
+
+def to_numpy(obj):
+    """
+    Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a Numpy array.
+    """
+    if isinstance(obj, (dict, UserDict)):
+        return {k: to_numpy(v) for k, v in obj.items()}
+    elif isinstance(obj, (list, tuple)):
+        return np.array(obj)
+    elif is_tf_available() and _is_tensorflow(obj):
+        return obj.numpy()
+    elif is_torch_available() and _is_torch(obj):
+        return obj.detach().cpu().numpy()
+    elif is_flax_available() and _is_jax(obj):
+        return np.asarray(obj)
+    else:
+        return obj
+
 class ModelOutput(OrderedDict):
     """
     Base class for all model outputs as dataclass. Has a ``__getitem__`` that allows indexing by integer or slice (like
......
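Assuming the new `to_numpy` is importable from `transformers.file_utils` (the hunk above adds it there), a quick usage sketch: it recurses into dicts, turns lists/tuples into arrays, moves TF/PyTorch/JAX tensors to numpy, and passes anything else through unchanged.

```python
import numpy as np
from transformers.file_utils import to_numpy  # location per the hunk above

batch = {"input_features": [[0.1, 0.2], [0.3, 0.4]], "attention_mask": [1, 1]}
np_batch = to_numpy(batch)  # recurses into the dict
assert isinstance(np_batch["input_features"], np.ndarray)
assert np_batch["input_features"].shape == (2, 2)
assert np_batch["attention_mask"].tolist() == [1, 1]
```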
@@ -99,15 +99,12 @@ class Speech2TextFeatureExtractor(SequenceFeatureExtractor):
         normalize_vars: Optional[bool] = True,
         padding_value: float = 0.0,
     ) -> np.ndarray:
-        # make sure we normalie float32 arrays
-        mean = x[:input_length].mean(axis=0)
-        square_sums = (x[:input_length] ** 2).sum(axis=0)
+        # make sure we normalize float32 arrays
         if normalize_means:
+            mean = x[:input_length].mean(axis=0)
             x = np.subtract(x, mean)
         if normalize_vars:
-            var = square_sums / x[:input_length].shape[0] - mean ** 2
-            std = np.sqrt(np.maximum(var, 1e-10))
+            std = x[:input_length].std(axis=0)
             x = np.divide(x, std)
         if input_length < x.shape[0]:
@@ -206,10 +203,10 @@ class Speech2TextFeatureExtractor(SequenceFeatureExtractor):
             and (isinstance(raw_speech[0], np.ndarray) or isinstance(raw_speech[0], (tuple, list)))
         )
-        if is_batched and not isinstance(raw_speech[0], np.ndarray):
-            raw_speech = [np.asarray(speech) for speech in raw_speech]
+        if is_batched:
+            raw_speech = [np.asarray(speech, dtype=np.float32) for speech in raw_speech]
         elif not is_batched and not isinstance(raw_speech, np.ndarray):
-            raw_speech = np.asarray(raw_speech)
+            raw_speech = np.asarray(raw_speech, dtype=np.float32)
         elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
             raw_speech = raw_speech.astype(np.float32)
......
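The normalization rewrite above drops the sum-of-squares variance formula (and its `1e-10` floor) in favor of statistics computed directly on the un-padded slice. A minimal standalone sketch of the resulting utterance-level mean-variance normalization, with a hypothetical helper name, assuming `input_length` marks the end of the real frames and their per-dimension std is nonzero:

```python
import numpy as np

def utterance_cmvn(x: np.ndarray, input_length: int) -> np.ndarray:
    # statistics come from the real frames only...
    mean = x[:input_length].mean(axis=0)
    std = x[:input_length].std(axis=0)  # no 1e-10 floor anymore; assumes std > 0
    # ...but are applied to the whole array, padded frames included
    return (x - mean) / std
```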
@@ -110,6 +110,10 @@ class Speech2TextFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
     def setUp(self):
         self.feat_extract_tester = Speech2TextFeatureExtractionTester(self)
+    def _check_zero_mean_unit_variance(self, input_vector):
+        self.assertTrue(np.all(np.mean(input_vector, axis=0) < 1e-3))
+        self.assertTrue(np.all(np.abs(np.var(input_vector, axis=0) - 1) < 1e-3))
     def test_call(self):
         # Tests that all call wrap to encode_plus and batch_encode_plus
         feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
@@ -137,17 +141,9 @@ class Speech2TextFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
         feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
         speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
-        # TODO(Patrick, Suraj, Anton) - It's surprising that "non-padded/non-numpified" padding
-        # results in quite inaccurate variance computation after (see 5e-1 tolerance)
-        # Issue is filed and PR is underway: https://github.com/huggingface/transformers/issues/13539
-        # paddings = ["longest", "max_length", "do_not_pad"]
-        # max_lengths = [None, 16, None]
-        # var_tolerances = [1e-3, 1e-3, 5e-1]
-        paddings = ["longest", "max_length"]
-        max_lengths = [None, 16]
-        var_tolerances = [1e-3, 1e-3]
-        for max_length, padding, var_tol in zip(max_lengths, paddings, var_tolerances):
+        paddings = ["longest", "max_length", "do_not_pad"]
+        max_lengths = [None, 16, None]
+        for max_length, padding in zip(max_lengths, paddings):
             inputs = feature_extractor(
                 speech_inputs, padding=padding, max_length=max_length, return_attention_mask=True
             )
@@ -155,28 +151,17 @@ class Speech2TextFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
             attention_mask = inputs.attention_mask
             fbank_feat_lengths = [np.sum(x) for x in attention_mask]
-            def _check_zero_mean_unit_variance(input_vector, var_tol=1e-3):
-                self.assertTrue(np.all(np.mean(input_vector, axis=0) < 1e-3))
-                self.assertTrue(np.all(np.abs(np.var(input_vector, axis=0) - 1) < var_tol))
-            _check_zero_mean_unit_variance(input_features[0][: fbank_feat_lengths[0]], var_tol)
-            _check_zero_mean_unit_variance(input_features[1][: fbank_feat_lengths[1]], var_tol)
-            _check_zero_mean_unit_variance(input_features[2][: fbank_feat_lengths[2]], var_tol)
+            self._check_zero_mean_unit_variance(input_features[0][: fbank_feat_lengths[0]])
+            self._check_zero_mean_unit_variance(input_features[1][: fbank_feat_lengths[1]])
+            self._check_zero_mean_unit_variance(input_features[2][: fbank_feat_lengths[2]])
     def test_cepstral_mean_and_variance_normalization_np(self):
         feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
         speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
-        # TODO(Patrick, Suraj, Anton) - It's surprising that "non-padded/non-numpified" padding
-        # results in quite inaccurate variance computation after (see 5e-1 tolerance)
-        # Issue is filed and PR is underway: https://github.com/huggingface/transformers/issues/13539
-        # paddings = ["longest", "max_length", "do_not_pad"]
-        # max_lengths = [None, 16, None]
-        # var_tolerances = [1e-3, 1e-3, 5e-1]
-        paddings = ["longest", "max_length"]
-        max_lengths = [None, 16]
-        var_tolerances = [1e-3, 1e-3]
-        for max_length, padding, var_tol in zip(max_lengths, paddings, var_tolerances):
+        paddings = ["longest", "max_length", "do_not_pad"]
+        max_lengths = [None, 16, None]
+        for max_length, padding in zip(max_lengths, paddings):
             inputs = feature_extractor(
                 speech_inputs, max_length=max_length, padding=padding, return_tensors="np", return_attention_mask=True
             )
@@ -184,15 +169,11 @@ class Speech2TextFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
             attention_mask = inputs.attention_mask
             fbank_feat_lengths = [np.sum(x) for x in attention_mask]
-            def _check_zero_mean_unit_variance(input_vector, var_tol=1e-3):
-                self.assertTrue(np.all(np.mean(input_vector, axis=0) < 1e-3))
-                self.assertTrue(np.all(np.abs(np.var(input_vector, axis=0) - 1) < var_tol))
-            _check_zero_mean_unit_variance(input_features[0][: fbank_feat_lengths[0]], var_tol)
+            self._check_zero_mean_unit_variance(input_features[0][: fbank_feat_lengths[0]])
             self.assertTrue(input_features[0][fbank_feat_lengths[0] :].sum() < 1e-6)
-            _check_zero_mean_unit_variance(input_features[1][: fbank_feat_lengths[1]], var_tol)
+            self._check_zero_mean_unit_variance(input_features[1][: fbank_feat_lengths[1]])
             self.assertTrue(input_features[0][fbank_feat_lengths[1] :].sum() < 1e-6)
-            _check_zero_mean_unit_variance(input_features[2][: fbank_feat_lengths[2]], var_tol)
+            self._check_zero_mean_unit_variance(input_features[2][: fbank_feat_lengths[2]])
     def test_cepstral_mean_and_variance_normalization_trunc_max_length(self):
         feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
@@ -209,13 +190,9 @@ class Speech2TextFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
         attention_mask = inputs.attention_mask
         fbank_feat_lengths = np.sum(attention_mask == 1, axis=1)
-        def _check_zero_mean_unit_variance(input_vector):
-            self.assertTrue(np.all(np.mean(input_vector, axis=0) < 1e-3))
-            self.assertTrue(np.all(np.abs(np.var(input_vector, axis=0) - 1) < 1e-3))
-        _check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]])
-        _check_zero_mean_unit_variance(input_features[1])
-        _check_zero_mean_unit_variance(input_features[2])
+        self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]])
+        self._check_zero_mean_unit_variance(input_features[1])
+        self._check_zero_mean_unit_variance(input_features[2])
     def test_cepstral_mean_and_variance_normalization_trunc_longest(self):
         feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
@@ -232,13 +209,9 @@ class Speech2TextFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
         attention_mask = inputs.attention_mask
         fbank_feat_lengths = np.sum(attention_mask == 1, axis=1)
-        def _check_zero_mean_unit_variance(input_vector):
-            self.assertTrue(np.all(np.mean(input_vector, axis=0) < 1e-3))
-            self.assertTrue(np.all(np.abs(np.var(input_vector, axis=0) - 1) < 1e-3))
-        _check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]])
-        _check_zero_mean_unit_variance(input_features[1, : fbank_feat_lengths[1]])
-        _check_zero_mean_unit_variance(input_features[2])
+        self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]])
+        self._check_zero_mean_unit_variance(input_features[1, : fbank_feat_lengths[1]])
+        self._check_zero_mean_unit_variance(input_features[2])
         # make sure that if max_length < longest -> then pad to max_length
         self.assertEqual(input_features.shape, (3, 4, 24))
@@ -256,9 +229,9 @@ class Speech2TextFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
         attention_mask = inputs.attention_mask
         fbank_feat_lengths = np.sum(attention_mask == 1, axis=1)
-        _check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]])
-        _check_zero_mean_unit_variance(input_features[1, : fbank_feat_lengths[1]])
-        _check_zero_mean_unit_variance(input_features[2])
+        self._check_zero_mean_unit_variance(input_features[0, : fbank_feat_lengths[0]])
+        self._check_zero_mean_unit_variance(input_features[1, : fbank_feat_lengths[1]])
+        self._check_zero_mean_unit_variance(input_features[2])
         # make sure that if max_length < longest -> then pad to max_length
         self.assertEqual(input_features.shape, (3, 6, 24))
@@ -102,6 +102,10 @@ class Wav2Vec2FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
     def setUp(self):
         self.feat_extract_tester = Wav2Vec2FeatureExtractionTester(self)
+    def _check_zero_mean_unit_variance(self, input_vector):
+        self.assertTrue(np.all(np.mean(input_vector, axis=0) < 1e-3))
+        self.assertTrue(np.all(np.abs(np.var(input_vector, axis=0) - 1) < 1e-3))
     def test_call(self):
         # Tests that all call wrap to encode_plus and batch_encode_plus
         feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
@@ -130,15 +134,11 @@ class Wav2Vec2FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
             processed = feat_extract(speech_inputs, padding=padding, max_length=max_length, return_tensors="np")
             input_values = processed.input_values
-            def _check_zero_mean_unit_variance(input_vector):
-                self.assertTrue(np.abs(np.mean(input_vector)) < 1e-3)
-                self.assertTrue(np.abs(np.var(input_vector) - 1) < 1e-3)
-            _check_zero_mean_unit_variance(input_values[0][:800])
+            self._check_zero_mean_unit_variance(input_values[0][:800])
             self.assertTrue(input_values[0][800:].sum() < 1e-6)
-            _check_zero_mean_unit_variance(input_values[1][:1000])
+            self._check_zero_mean_unit_variance(input_values[1][:1000])
             self.assertTrue(input_values[0][1000:].sum() < 1e-6)
-            _check_zero_mean_unit_variance(input_values[2][:1200])
+            self._check_zero_mean_unit_variance(input_values[2][:1200])
     def test_zero_mean_unit_variance_normalization(self):
         feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
@@ -152,13 +152,9 @@ class Wav2Vec2FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
             processed = feat_extract(speech_inputs, max_length=max_length, padding=padding)
             input_values = processed.input_values
-            def _check_zero_mean_unit_variance(input_vector):
-                self.assertTrue(np.abs(np.mean(input_vector)) < 1e-3)
-                self.assertTrue(np.abs(np.var(input_vector) - 1) < 1e-3)
-            _check_zero_mean_unit_variance(input_values[0][:800])
-            _check_zero_mean_unit_variance(input_values[1][:1000])
-            _check_zero_mean_unit_variance(input_values[2][:1200])
+            self._check_zero_mean_unit_variance(input_values[0][:800])
+            self._check_zero_mean_unit_variance(input_values[1][:1000])
+            self._check_zero_mean_unit_variance(input_values[2][:1200])
     def test_zero_mean_unit_variance_normalization_trunc_np_max_length(self):
         feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
@@ -168,13 +164,9 @@ class Wav2Vec2FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
         )
         input_values = processed.input_values
-        def _check_zero_mean_unit_variance(input_vector):
-            self.assertTrue(np.abs(np.mean(input_vector)) < 1e-3)
-            self.assertTrue(np.abs(np.var(input_vector) - 1) < 1e-3)
-        _check_zero_mean_unit_variance(input_values[0, :800])
-        _check_zero_mean_unit_variance(input_values[1])
-        _check_zero_mean_unit_variance(input_values[2])
+        self._check_zero_mean_unit_variance(input_values[0, :800])
+        self._check_zero_mean_unit_variance(input_values[1])
+        self._check_zero_mean_unit_variance(input_values[2])
     def test_zero_mean_unit_variance_normalization_trunc_np_longest(self):
         feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
@@ -184,13 +176,9 @@ class Wav2Vec2FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
         )
         input_values = processed.input_values
-        def _check_zero_mean_unit_variance(input_vector):
-            self.assertTrue(np.abs(np.mean(input_vector)) < 1e-3)
-            self.assertTrue(np.abs(np.var(input_vector) - 1) < 1e-3)
-        _check_zero_mean_unit_variance(input_values[0, :800])
-        _check_zero_mean_unit_variance(input_values[1, :1000])
-        _check_zero_mean_unit_variance(input_values[2])
+        self._check_zero_mean_unit_variance(input_values[0, :800])
+        self._check_zero_mean_unit_variance(input_values[1, :1000])
+        self._check_zero_mean_unit_variance(input_values[2])
         # make sure that if max_length < longest -> then pad to max_length
         self.assertTrue(input_values.shape == (3, 1000))
@@ -201,9 +189,9 @@ class Wav2Vec2FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
         )
         input_values = processed.input_values
-        _check_zero_mean_unit_variance(input_values[0, :800])
-        _check_zero_mean_unit_variance(input_values[1, :1000])
-        _check_zero_mean_unit_variance(input_values[2])
+        self._check_zero_mean_unit_variance(input_values[0, :800])
+        self._check_zero_mean_unit_variance(input_values[1, :1000])
+        self._check_zero_mean_unit_variance(input_values[2])
         # make sure that if max_length > longest -> then pad to longest
         self.assertTrue(input_values.shape == (3, 1200))
......
@@ -724,7 +724,7 @@ class Speech2TextModelIntegrationTests(unittest.TestCase):
             return batch
         ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
-        ds = ds.select(range(num_samples)).map(map_to_array)
+        ds = ds.sort("id").select(range(num_samples)).map(map_to_array)
        return ds["speech"][:num_samples]
@@ -740,7 +740,9 @@ class Speech2TextModelIntegrationTests(unittest.TestCase):
         generated_ids = model.generate(input_features)
         generated_transcript = processor.batch_decode(generated_ids, skip_special_tokens=True)
-        EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"]
+        EXPECTED_TRANSCRIPTIONS = [
+            "mister quilter is the apostle of the middle classes and we are glad to welcome his gospel"
+        ]
         self.assertListEqual(generated_transcript, EXPECTED_TRANSCRIPTIONS)
     def test_generation_librispeech_batched(self):
@@ -759,10 +761,10 @@ class Speech2TextModelIntegrationTests(unittest.TestCase):
         generated_transcripts = processor.batch_decode(generated_ids, skip_special_tokens=True)
         EXPECTED_TRANSCRIPTIONS = [
-            "a man said to the universe sir i exist",
-            "sweat covered brion's body trickling into the titleing cloth that was the only garment he wore",
-            "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about",
-            "his instant of panic was followed by a small sharp blow high on his chest",
+            "mister quilter is the apostle of the middle classes and we are glad to welcome his gospel",
+            "nor is mister cultar's manner less interesting than his matter",
+            "he tells us that at this festive season of the year with christmas and roast beef looming before us similes drawn from eating and its results occur most readily to the mind",
+            "he has grave doubts whether sir frederick leyton's work is really greek after all and can discover in it but little of rocky ithaca",
         ]
         self.assertListEqual(generated_transcripts, EXPECTED_TRANSCRIPTIONS)
@@ -42,9 +42,9 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
             tokenizer="facebook/s2t-small-mustc-en-fr-st",
             framework="pt",
         )
-        waveform = np.zeros((34000,))
+        waveform = np.tile(np.arange(1000, dtype=np.float32), 34)
         output = speech_recognizer(waveform)
-        self.assertEqual(output, {"text": "C'est ce que j'ai fait à ce moment-là."})
+        self.assertEqual(output, {"text": "(Applaudissements)"})
     @require_torch
     def test_torch_small_no_tokenizer_files(self):
@@ -68,14 +68,14 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
             tokenizer="facebook/wav2vec2-base-960h",
             framework="pt",
         )
-        waveform = np.zeros((34000,))
+        waveform = np.tile(np.arange(1000, dtype=np.float32), 34)
         output = speech_recognizer(waveform)
         self.assertEqual(output, {"text": ""})
         from datasets import load_dataset
-        ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
-        filename = ds[0]["file"]
+        ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation").sort("id")
+        filename = ds[40]["file"]
         output = speech_recognizer(filename)
         self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})
@@ -92,8 +92,8 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
         from datasets import load_dataset
-        ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
-        filename = ds[0]["file"]
+        ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation").sort("id")
+        filename = ds[40]["file"]
         output = speech_recognizer(filename)
         self.assertEqual(output, {"text": 'Ein Mann sagte zum Universum : " Sir, ich existiert! "'})
@@ -110,16 +110,16 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
         asr = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)
-        waveform = np.zeros((34000,))
+        waveform = np.tile(np.arange(1000, dtype=np.float32), 34)
         output = asr(waveform)
         self.assertEqual(output, {"text": ""})
-        ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
-        filename = ds[0]["file"]
+        ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation").sort("id")
+        filename = ds[40]["file"]
         output = asr(filename)
         self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"})
-        filename = ds[0]["file"]
+        filename = ds[40]["file"]
         with open(filename, "rb") as f:
             data = f.read()
         output = asr(data)
@@ -139,17 +139,17 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
         asr = AutomaticSpeechRecognitionPipeline(model=model, tokenizer=tokenizer, feature_extractor=feature_extractor)
-        waveform = np.zeros((34000,))
+        waveform = np.tile(np.arange(1000, dtype=np.float32), 34)
         output = asr(waveform)
-        self.assertEqual(output, {"text": "E questo è il motivo per cui non ci siamo mai incontrati."})
+        self.assertEqual(output, {"text": "(Applausi)"})
-        ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
-        filename = ds[0]["file"]
+        ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation").sort("id")
+        filename = ds[40]["file"]
         output = asr(filename)
         self.assertEqual(output, {"text": "Un uomo disse all'universo: \"Signore, io esisto."})
-        filename = ds[0]["file"]
+        filename = ds[40]["file"]
         with open(filename, "rb") as f:
             data = f.read()
         output = asr(data)
......
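A note on the test waveforms above: per the commit bullet "Try a more stable waveform", the all-zero dummy input is replaced by a deterministic ramp, presumably because a constant signal has zero variance and makes zero-mean-unit-variance normalization degenerate. A quick sketch of the construction:

```python
import numpy as np

# 34 repeats of a 0..999 ramp: deterministic, non-constant, 34000 samples
waveform = np.tile(np.arange(1000, dtype=np.float32), 34)
assert waveform.shape == (34000,)
assert waveform.std() > 0  # np.zeros((34000,)) has std 0, degenerate to normalize
```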
@@ -372,7 +372,7 @@ class SequenceFeatureExtractionTestMixin(FeatureExtractionSavingTestMixin):
         input_np = feat_extract.pad(processed_features, padding="longest", return_tensors="np")[input_name]
         input_pt = feat_extract.pad(processed_features, padding="longest", return_tensors="pt")[input_name]
-        self.assertTrue(abs(input_np.astype(np.float32).sum() - input_pt.numpy().sum()) < 1e-2)
+        self.assertTrue(abs(input_np.astype(np.float32).sum() - input_pt.numpy().astype(np.float32).sum()) < 1e-2)
     @require_tf
     def test_padding_accepts_tensors_tf(self):
@@ -385,7 +385,7 @@ class SequenceFeatureExtractionTestMixin(FeatureExtractionSavingTestMixin):
         input_np = feat_extract.pad(processed_features, padding="longest", return_tensors="np")[input_name]
         input_tf = feat_extract.pad(processed_features, padding="longest", return_tensors="tf")[input_name]
-        self.assertTrue(abs(input_np.astype(np.float32).sum() - input_tf.numpy().sum()) < 1e-2)
+        self.assertTrue(abs(input_np.astype(np.float32).sum() - input_tf.numpy().astype(np.float32).sum()) < 1e-2)
     def test_attention_mask(self):
         feat_dict = self.feat_extract_dict
......
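The two mixin tweaks above cast the framework tensors to float32 before summing, likely so both sides of the comparison accumulate rounding error in the same precision. A tiny demonstration of why summing in mixed dtypes can exceed a tight tolerance (values are random, for illustration only):

```python
import numpy as np

x64 = np.random.rand(3, 1000)          # float64 sums accumulate differently
x32 = x64.astype(np.float32)
print(abs(x32.sum() - x64.sum()))      # small but nonzero dtype-dependent gap
```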