Unverified Commit 4973d2a0 authored by NielsRogge, committed by GitHub

Add Audio Spectrogram Transformer (#19981)



* First draft

* Make conversion script work

* Add id2label mapping, run code quality

* Fix copies

* Add first draft of feature extractor

* Update conversion script to use feature extractor

* Make more tests pass

* Add docs

* Update input_features to input_values and pad by default to max length

* Fix doc tests

* Add feature extractor tests

* Add proper padding/truncation to feature extractor

* Add support for conversion of all audioset checkpoints

* Improve docs and extend conversion script

* Fix README

* Rename spectogram to spectrogram

* Fix copies

* Add integration test

* Remove dummy conv

* Update to ast

* Update organization

* Fix init

* Rename model to AST

* Add require_torchaudio decorator

* Move import of ASTFeatureExtractor under an is_speech_available check

* Fix rebase

* Add pipeline config

* Update name of classifier head

* Rename time_dimension and frequency_dimension for clarity

* Remove print statement

* Fix pipeline test

* Fix pipeline test

* Fix index table

* Fix init

* Fix conversion script

* Rename to ForAudioClassification

* Fix index table
Co-authored-by: Niels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
parent 1e3f17b5
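For orientation, here is a minimal usage sketch of the API this commit introduces. The checkpoint name and the 527-class AudioSet head come from the integration test further down; the dummy waveform and the exact call pattern are illustrative assumptions, not part of the diff.

import numpy as np
import torch

from transformers import ASTFeatureExtractor, ASTForAudioClassification

# Checkpoint referenced in the integration test below.
ckpt = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = ASTFeatureExtractor.from_pretrained(ckpt)
model = ASTForAudioClassification.from_pretrained(ckpt)

# Illustrative one-second waveform at 16 kHz. The extractor returns
# `input_values` (renamed from `input_features` in this PR) and pads or
# truncates to `max_length` frames by default.
waveform = np.random.randn(16000).astype(np.float32)
inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits  # shape (1, 527): the AudioSet classes

print(model.config.id2label[int(logits.argmax(-1))])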
...@@ -350,6 +350,30 @@ def load_tf_weights_in_albert(*args, **kwargs):
requires_backends(load_tf_weights_in_albert, ["torch"])
AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None
class ASTForAudioClassification(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class ASTModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class ASTPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING = None
...
...@@ -3,6 +3,13 @@
from ..utils import DummyObject, requires_backends
class ASTFeatureExtractor(metaclass=DummyObject):
_backends = ["speech"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["speech"])
class MCTCTFeatureExtractor(metaclass=DummyObject):
_backends = ["speech"]
...
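The classes in these hunks are placeholder "dummy objects": importing them always succeeds, but using them without the required backend raises an informative ImportError. Below is a self-contained sketch of that pattern, using simplified stand-ins for the library's DummyObject and requires_backends utilities.

import importlib.util

def requires_backends(obj, backends):
    # Simplified stand-in: raise if any required package is not importable.
    name = obj.__name__ if isinstance(obj, type) else type(obj).__name__
    missing = [b for b in backends if importlib.util.find_spec(b) is None]
    if missing:
        raise ImportError(f"{name} requires the following backends: {missing}")

class DummyObject(type):
    # Metaclass hook: any attribute access on the class itself (for example
    # `ASTModel.from_pretrained`) triggers the backend check first.
    def __getattr__(cls, key):
        requires_backends(cls, cls._backends)

class ASTModel(metaclass=DummyObject):
    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])

# Without torch installed, both of the following raise ImportError:
#   ASTModel()
#   ASTModel.from_pretrained("some/checkpoint")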
# coding=utf-8
# Copyright 2022 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import random
import unittest
import numpy as np
from transformers import ASTFeatureExtractor
from transformers.testing_utils import require_torch, require_torchaudio
from transformers.utils.import_utils import is_torch_available
from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
global_rng = random.Random()
if is_torch_available():
import torch
def floats_list(shape, scale=1.0, rng=None, name=None):
"""Creates a random float32 tensor"""
if rng is None:
rng = global_rng
values = []
for batch_idx in range(shape[0]):
values.append([])
for _ in range(shape[1]):
values[-1].append(rng.random() * scale)
return values
class ASTFeatureExtractionTester(unittest.TestCase):
def __init__(
self,
parent,
batch_size=7,
min_seq_length=400,
max_seq_length=2000,
feature_size=1,
padding_value=0.0,
sampling_rate=16000,
return_attention_mask=True,
do_normalize=True,
):
self.parent = parent
self.batch_size = batch_size
self.min_seq_length = min_seq_length
self.max_seq_length = max_seq_length
self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1)
self.feature_size = feature_size
self.padding_value = padding_value
self.sampling_rate = sampling_rate
self.return_attention_mask = return_attention_mask
self.do_normalize = do_normalize
def prepare_feat_extract_dict(self):
return {
"feature_size": self.feature_size,
"padding_value": self.padding_value,
"sampling_rate": self.sampling_rate,
"return_attention_mask": self.return_attention_mask,
"do_normalize": self.do_normalize,
}
def prepare_inputs_for_common(self, equal_length=False, numpify=False):
def _flatten(list_of_lists):
return list(itertools.chain(*list_of_lists))
if equal_length:
speech_inputs = floats_list((self.batch_size, self.max_seq_length))
else:
# make sure that inputs increase in size
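# with the defaults (batch_size=7, min_seq_length=400, max_seq_length=2000),
# this yields inputs of lengths 400, 666, 932, ..., 1996 (step seq_length_diff = 266)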
speech_inputs = [
_flatten(floats_list((x, self.feature_size)))
for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
]
if numpify:
speech_inputs = [np.asarray(x) for x in speech_inputs]
return speech_inputs
@require_torch
@require_torchaudio
class ASTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
feature_extraction_class = ASTFeatureExtractor
def setUp(self):
self.feat_extract_tester = ASTFeatureExtractionTester(self)
def test_call(self):
# Tests that all calls wrap to encode_plus and batch_encode_plus
feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
# create three inputs of length 800, 1000, and 1200
speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
# Test not batched input
encoded_sequences_1 = feat_extract(speech_inputs[0], return_tensors="np").input_values
encoded_sequences_2 = feat_extract(np_speech_inputs[0], return_tensors="np").input_values
self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))
# Test batched
encoded_sequences_1 = feat_extract(speech_inputs, padding=True, return_tensors="np").input_values
encoded_sequences_2 = feat_extract(np_speech_inputs, padding=True, return_tensors="np").input_values
for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
@require_torch
def test_double_precision_pad(self):
import torch
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
np_speech_inputs = np.random.rand(100).astype(np.float64)
py_speech_inputs = np_speech_inputs.tolist()
for inputs in [py_speech_inputs, np_speech_inputs]:
np_processed = feature_extractor.pad([{"input_values": inputs}], return_tensors="np")
self.assertTrue(np_processed.input_values.dtype == np.float32)
pt_processed = feature_extractor.pad([{"input_values": inputs}], return_tensors="pt")
self.assertTrue(pt_processed.input_values.dtype == torch.float32)
def _load_datasamples(self, num_samples):
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
# automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
return [x["array"] for x in speech_samples]
@require_torch
def test_integration(self):
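# the expected values below are the first 30 mel bins of the first frame of
# the normalized log-mel spectrogram that the feature extractor produces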
# fmt: off
EXPECTED_INPUT_VALUES = torch.tensor(
[-0.9894, -1.2776, -0.9066, -1.2776, -0.9349, -1.2609, -1.0386, -1.2776,
-1.1561, -1.2776, -1.2052, -1.2723, -1.2190, -1.2132, -1.2776, -1.1133,
-1.1953, -1.1343, -1.1584, -1.2203, -1.1770, -1.2474, -1.2381, -1.1936,
-0.9270, -0.8317, -0.8049, -0.7706, -0.7565, -0.7869]
)
# fmt: on
input_speech = self._load_datasamples(1)
feature_extractor = ASTFeatureExtractor()
input_values = feature_extractor(input_speech, return_tensors="pt").input_values
self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-4))
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch Audio Spectrogram Transformer (AST) model. """
import inspect
import unittest
from huggingface_hub import hf_hub_download
from transformers import ASTConfig
from transformers.testing_utils import require_torch, require_torchaudio, slow, torch_device
from transformers.utils import cached_property, is_torch_available, is_torchaudio_available
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
if is_torch_available():
import torch
from torch import nn
from transformers import ASTForAudioClassification, ASTModel
from transformers.models.audio_spectrogram_transformer.modeling_audio_spectrogram_transformer import (
AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
)
if is_torchaudio_available():
import torchaudio
from transformers import ASTFeatureExtractor
class ASTModelTester:
def __init__(
self,
parent,
batch_size=13,
patch_size=2,
max_length=24,
num_mel_bins=16,
is_training=True,
use_labels=True,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
type_sequence_label_size=10,
initializer_range=0.02,
scope=None,
frequency_stride=2,
time_stride=2,
):
self.parent = parent
self.batch_size = batch_size
self.patch_size = patch_size
self.max_length = max_length
self.num_mel_bins = num_mel_bins
self.is_training = is_training
self.use_labels = use_labels
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.scope = scope
self.frequency_stride = frequency_stride
self.time_stride = time_stride
# in AST, the seq length equals the number of patches + 2 (we add 2 for the [CLS] and distillation tokens)
frequency_out_dimension = (self.num_mel_bins - self.patch_size) // self.frequency_stride + 1
time_out_dimension = (self.max_length - self.patch_size) // self.time_stride + 1
num_patches = frequency_out_dimension * time_out_dimension
self.seq_length = num_patches + 2
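# e.g. with the defaults above: frequency_out_dimension = (16 - 2) // 2 + 1 = 8,
# time_out_dimension = (24 - 2) // 2 + 1 = 12, so num_patches = 96 and seq_length = 98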
def prepare_config_and_inputs(self):
input_values = floats_tensor([self.batch_size, self.max_length, self.num_mel_bins])
labels = None
if self.use_labels:
labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
config = self.get_config()
return config, input_values, labels
def get_config(self):
return ASTConfig(
patch_size=self.patch_size,
max_length=self.max_length,
num_mel_bins=self.num_mel_bins,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
is_decoder=False,
initializer_range=self.initializer_range,
frequency_stride=self.frequency_stride,
time_stride=self.time_stride,
)
def create_and_check_model(self, config, input_values, labels):
model = ASTModel(config=config)
model.to(torch_device)
model.eval()
result = model(input_values)
self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(
config,
input_values,
labels,
) = config_and_inputs
inputs_dict = {"input_values": input_values}
return config, inputs_dict
@require_torch
class ASTModelTest(ModelTesterMixin, unittest.TestCase):
"""
Here we also overwrite some of the tests of test_modeling_common.py, as AST does not use input_ids, inputs_embeds,
attention_mask and seq_length.
"""
all_model_classes = (
(
ASTModel,
ASTForAudioClassification,
)
if is_torch_available()
else ()
)
fx_compatible = False
test_pruning = False
test_resize_embeddings = False
test_head_masking = False
def setUp(self):
self.model_tester = ASTModelTester(self)
self.config_tester = ConfigTester(self, config_class=ASTConfig, has_text_modality=False, hidden_size=37)
def test_config(self):
self.config_tester.run_common_tests()
@unittest.skip(reason="AST does not use inputs_embeds")
def test_inputs_embeds(self):
pass
def test_model_common_attributes(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
x = model.get_output_embeddings()
self.assertTrue(x is None or isinstance(x, nn.Linear))
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = ["input_values"]
self.assertListEqual(arg_names[:1], expected_arg_names)
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
@slow
def test_model_from_pretrained(self):
for model_name in AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
model = ASTModel.from_pretrained(model_name)
self.assertIsNotNone(model)
# We will verify our results on some audio from AudioSet
def prepare_audio():
filepath = hf_hub_download(
repo_id="nielsr/audio-spectogram-transformer-checkpoint", filename="sample_audio.flac", repo_type="dataset"
)
audio, sampling_rate = torchaudio.load(filepath)
return audio, sampling_rate
@require_torch
@require_torchaudio
class ASTModelIntegrationTest(unittest.TestCase):
@cached_property
def default_feature_extractor(self):
return (
ASTFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
if is_torchaudio_available()
else None
)
@slow
def test_inference_audio_classification(self):
feature_extractor = self.default_feature_extractor
model = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593").to(torch_device)
audio, sampling_rate = prepare_audio()
audio = audio.squeeze().numpy()
inputs = feature_extractor(audio, sampling_rate=sampling_rate, return_tensors="pt").to(torch_device)
# forward pass
with torch.no_grad():
outputs = model(**inputs)
# verify the logits
expected_shape = torch.Size((1, 527))
self.assertEqual(outputs.logits.shape, expected_shape)
expected_slice = torch.tensor([-0.8760, -7.0042, -8.6602]).to(torch_device)
self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4))
...@@ -155,6 +155,12 @@ def get_tiny_feature_extractor_from_checkpoint(checkpoint, tiny_config, feature_
if hasattr(tiny_config, "image_size") and feature_extractor:
feature_extractor = feature_extractor.__class__(size=tiny_config.image_size, crop_size=tiny_config.image_size)
# Audio Spectrogram Transformer specific.
if feature_extractor.__class__.__name__ == "ASTFeatureExtractor":
feature_extractor = feature_extractor.__class__(
max_length=tiny_config.max_length, num_mel_bins=tiny_config.num_mel_bins
)
# Speech2TextModel specific.
if hasattr(tiny_config, "input_feat_per_channel") and feature_extractor:
feature_extractor = feature_extractor.__class__(
...
...@@ -91,6 +91,7 @@ if is_torch_available():
from test_module.custom_modeling import CustomModel, NoSuperInitModel
from transformers import (
BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING,
MODEL_FOR_AUDIO_XVECTOR_MAPPING,
MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING,
MODEL_FOR_CAUSAL_LM_MAPPING,
...@@ -223,6 +224,7 @@ class ModelTesterMixin:
*get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING),
*get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING),
*get_values(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING),
*get_values(MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING),
]:
inputs_dict["labels"] = torch.zeros(
self.model_tester.batch_size, dtype=torch.long, device=torch_device
...
...@@ -18,6 +18,7 @@ src/transformers/generation/utils.py
src/transformers/models/albert/configuration_albert.py
src/transformers/models/albert/modeling_albert.py
src/transformers/models/albert/modeling_tf_albert.py
src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py
src/transformers/models/bart/configuration_bart.py
src/transformers/models/bart/modeling_bart.py
src/transformers/models/beit/configuration_beit.py
...