# test_feature_extraction_pop2piano.py

# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import tempfile
import unittest

import numpy as np
from datasets import load_dataset

from transformers.testing_utils import (
    check_json_file_has_correct_format,
    require_essentia,
    require_librosa,
    require_scipy,
    require_tf,
    require_torch,
)
from transformers.utils.import_utils import (
    is_essentia_available,
    is_librosa_available,
    is_scipy_available,
    is_torch_available,
)

from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin


# True only when every optional backend checked below reports itself as available.
# The checks run in the listed order and stop at the first one that fails.
requirements_available = all(
    is_available()
    for is_available in (
        is_torch_available,
        is_essentia_available,
        is_scipy_available,
        is_librosa_available,
    )
)

# Only import the heavy modules when all backends are present, so that this test
# module can still be imported (and its tests skipped) when they are not.
if requirements_available:
    import torch

    from transformers import Pop2PianoFeatureExtractor


class Pop2PianoFeatureExtractionTester(unittest.TestCase):
    """Container for the keyword arguments used to build a feature extractor in tests.

    The constructor stores the given values as attributes and
    `prepare_feat_extract_dict` returns them as a dictionary.
    """

    def __init__(
        self,
        parent,
        n_bars=2,
        sample_rate=22050,
        use_mel=True,
        padding_value=0,
        vocab_size_special=4,
        vocab_size_note=128,
        vocab_size_velocity=2,
        vocab_size_time=100,
    ):
        """Store the configuration values.

        Args:
            parent: Object that created this tester; stored as-is on `self.parent`.
            n_bars: Value reported under the `"n_bars"` key.
            sample_rate: Value reported under the `"sample_rate"` key.
            use_mel: Value reported under the `"use_mel"` key.
            padding_value: Value reported under the `"padding_value"` key.
            vocab_size_special: Value reported under the `"vocab_size_special"` key.
            vocab_size_note: Value reported under the `"vocab_size_note"` key.
            vocab_size_velocity: Value reported under the `"vocab_size_velocity"` key.
            vocab_size_time: Value reported under the `"vocab_size_time"` key.
        """
        # Initialise the unittest.TestCase state. Without this call the inherited
        # helpers (assert* methods, repr(), ...) raise AttributeError because the
        # attributes they rely on are never created.
        super().__init__()
        self.parent = parent
        self.n_bars = n_bars
        self.sample_rate = sample_rate
        self.use_mel = use_mel
        self.padding_value = padding_value
        self.vocab_size_special = vocab_size_special
        self.vocab_size_note = vocab_size_note
        self.vocab_size_velocity = vocab_size_velocity
        self.vocab_size_time = vocab_size_time

    def prepare_feat_extract_dict(self):
        """Return the stored configuration as a dict of constructor keyword arguments."""
        return {
            "n_bars": self.n_bars,
            "sample_rate": self.sample_rate,
            "use_mel": self.use_mel,
            "padding_value": self.padding_value,
            "vocab_size_special": self.vocab_size_special,
            "vocab_size_note": self.vocab_size_note,
            "vocab_size_velocity": self.vocab_size_velocity,
            "vocab_size_time": self.vocab_size_time,
        }


@require_torch
@require_essentia
@require_librosa
@require_scipy
class Pop2PianoFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
    """Tests for `Pop2PianoFeatureExtractor`.

    Covers save/load round trips, the shapes of the returned features for single and
    batched inputs, the attention masks and the supported `return_tensors` formats.
    """

    feature_extraction_class = Pop2PianoFeatureExtractor if requirements_available else None

    def setUp(self):
        self.feat_extract_tester = Pop2PianoFeatureExtractionTester(self)

    def _build_feature_extractor(self):
        """Instantiate the feature extractor from the tester's configuration dict."""
        return self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())

    def _extract_batch(self, **call_kwargs):
        """Run the feature extractor on a fixed batch of three waveforms.

        The waveforms have different lengths and are passed with different sampling
        rates. `return_attention_mask=True` is always set; any extra keyword
        arguments (e.g. `return_tensors`) are forwarded to the extractor call.
        """
        feature_extractor = self._build_feature_extractor()
        speech_input1 = np.zeros([1_000_000], dtype=np.float32)
        speech_input2 = np.ones([2_000_000], dtype=np.float32)
        speech_input3 = np.random.randint(low=0, high=10, size=500_000).astype(np.float32)

        return feature_extractor(
            [speech_input1, speech_input2, speech_input3],
            sampling_rate=[44_100, 16_000, 48_000],
            return_attention_mask=True,
            **call_kwargs,
        )

    def _assert_same_config(self, feat_extract_first, feat_extract_second):
        """Assert that two feature extractors carry the same configuration."""
        # `use_mel` is a plain boolean flag in the tester's configuration, so a
        # direct equality check is enough and reports both values on failure.
        self.assertEqual(feat_extract_first.use_mel, feat_extract_second.use_mel)
        self.assertEqual(feat_extract_first.to_dict(), feat_extract_second.to_dict())

    def test_feat_extract_from_and_save_pretrained(self):
        """A feature extractor saved with `save_pretrained` reloads with the same config."""
        # NOTE(review): `feat_extract_dict` is not defined in this file; presumably it
        # is provided by the test mixin -- confirm.
        feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)

        with tempfile.TemporaryDirectory() as tmpdirname:
            saved_file = feat_extract_first.save_pretrained(tmpdirname)[0]
            check_json_file_has_correct_format(saved_file)
            feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname)

        self._assert_same_config(feat_extract_first, feat_extract_second)

    def test_feat_extract_to_json_file(self):
        """A feature extractor written with `to_json_file` reloads with the same config."""
        # NOTE(review): `feat_extract_dict` is not defined in this file; presumably it
        # is provided by the test mixin -- confirm.
        feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)

        with tempfile.TemporaryDirectory() as tmpdirname:
            json_file_path = os.path.join(tmpdirname, "feat_extract.json")
            feat_extract_first.to_json_file(json_file_path)
            feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path)

        self._assert_same_config(feat_extract_first, feat_extract_second)

    def test_call(self):
        """Check the rank of the outputs produced for a single un-batched waveform."""
        feature_extractor = self._build_feature_extractor()
        speech_input = np.zeros([1000000], dtype=np.float32)

        input_features = feature_extractor(speech_input, sampling_rate=16_000, return_tensors="np")
        self.assertEqual(input_features.input_features.ndim, 3)
        self.assertEqual(input_features.input_features.shape[-1], 512)

        self.assertEqual(input_features.beatsteps.ndim, 2)
        self.assertEqual(input_features.extrapolated_beatstep.ndim, 2)

    def test_integration(self):
        """Compare extracted features of one dataset sample against hard-coded reference values."""
        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        speech_samples = ds.sort("id").select([0])["audio"]
        # Exactly one sample was selected above, so index it directly.
        sample = speech_samples[0]
        input_speech = sample["array"]
        sampling_rate = sample["sampling_rate"]
        feature_extractor = Pop2PianoFeatureExtractor.from_pretrained("sweetcocoa/pop2piano")
        input_features = feature_extractor(
            input_speech, sampling_rate=sampling_rate, return_tensors="pt"
        ).input_features

        EXPECTED_INPUT_FEATURES = torch.tensor(
            [[-7.1493, -6.8701, -4.3214], [-5.9473, -5.7548, -3.8438], [-6.1324, -5.9018, -4.3778]]
        )
        # Only the leading 3x3 slice of the first item is compared.
        self.assertTrue(torch.allclose(input_features[0, :3, :3], EXPECTED_INPUT_FEATURES, atol=1e-4))

    def test_attention_mask(self):
        """Check that attention masks are returned for a batch of two waveforms."""
        feature_extractor = self._build_feature_extractor()
        speech_input1 = np.zeros([1_000_000], dtype=np.float32)
        speech_input2 = np.random.randint(low=0, high=10, size=500_000).astype(np.float32)
        input_features = feature_extractor(
            [speech_input1, speech_input2],
            sampling_rate=[44_100, 16_000],
            return_tensors="np",
            return_attention_mask=True,
        )

        self.assertTrue(hasattr(input_features, "attention_mask"))

        # check shapes
        self.assertEqual(input_features["attention_mask"].ndim, 2)
        self.assertEqual(input_features["attention_mask_beatsteps"].shape[0], 2)
        self.assertEqual(input_features["attention_mask_extrapolated_beatstep"].shape[0], 2)

        # every mask must span the range [0, 1]: its largest value is 1 and its smallest is 0
        for mask_name in (
            "attention_mask",
            "attention_mask_beatsteps",
            "attention_mask_extrapolated_beatstep",
        ):
            with self.subTest(mask=mask_name):
                self.assertEqual(np.max(input_features[mask_name]), 1)
                self.assertEqual(np.min(input_features[mask_name]), 0)

    def test_batch_feature(self):
        """Check output shapes for a batch of three waveforms without `return_tensors`."""
        input_features = self._extract_batch()

        self.assertEqual(len(input_features["input_features"].shape), 3)
        # check shape
        self.assertEqual(input_features["beatsteps"].shape[0], 3)
        self.assertEqual(input_features["extrapolated_beatstep"].shape[0], 3)

    def test_batch_feature_np(self):
        """Check that `return_tensors="np"` yields a rank-3 NumPy array."""
        input_features = self._extract_batch(return_tensors="np")

        # check np array or not
        self.assertIsInstance(input_features["input_features"], np.ndarray)

        # check shape
        self.assertEqual(len(input_features["input_features"].shape), 3)

    def test_batch_feature_pt(self):
        """Check that `return_tensors="pt"` yields a rank-3 PyTorch tensor."""
        input_features = self._extract_batch(return_tensors="pt")

        # check pt tensor or not
        self.assertIsInstance(input_features["input_features"], torch.Tensor)

        # check shape
        self.assertEqual(len(input_features["input_features"].shape), 3)

    @require_tf
    def test_batch_feature_tf(self):
        """Check that `return_tensors="tf"` yields a rank-3 TensorFlow tensor."""
        import tensorflow as tf

        input_features = self._extract_batch(return_tensors="tf")

        # check tf tensor or not
        self.assertTrue(tf.is_tensor(input_features["input_features"]))

        # check shape
        self.assertEqual(len(input_features["input_features"].shape), 3)

    # The tests below are disabled with `unittest.skip`; the reason is given in each
    # decorator. NOTE(review): they presumably override tests of the same name
    # inherited from the mixin -- confirm.

    @unittest.skip(
        "Pop2PianoFeatureExtractor does not supports padding externally (while processing audios in batches padding is automatically applied to max_length)"
    )
    def test_padding_accepts_tensors_pt(self):
        pass

    @unittest.skip(
        "Pop2PianoFeatureExtractor does not supports padding externally (while processing audios in batches padding is automatically applied to max_length)"
    )
    def test_padding_accepts_tensors_tf(self):
        pass

    @unittest.skip(
        "Pop2PianoFeatureExtractor does not supports padding externally (while processing audios in batches padding is automatically applied to max_length)"
    )
    def test_padding_from_list(self):
        pass

    @unittest.skip(
        "Pop2PianoFeatureExtractor does not supports padding externally (while processing audios in batches padding is automatically applied to max_length)"
    )
    def test_padding_from_array(self):
        pass

    @unittest.skip("Pop2PianoFeatureExtractor does not support truncation")
    def test_attention_mask_with_truncation(self):
        pass

    @unittest.skip("Pop2PianoFeatureExtractor does not supports truncation")
    def test_truncation_from_array(self):
        pass

    @unittest.skip("Pop2PianoFeatureExtractor does not supports truncation")
    def test_truncation_from_list(self):
        pass