# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import numpy as np  # noqa: E402

from ...configuration_utils import ConfigMixin, register_to_config
from ...schedulers.scheduling_utils import SchedulerMixin


# librosa is treated as an optional dependency: the outcome of the import is recorded here and checked in
# `Mel.__init__`, so a missing or broken install surfaces when a `Mel` is constructed rather than at import time.
try:
    import librosa  # noqa: E402

    _librosa_can_be_imported = True
    _import_error = ""
# NOTE(review): catches `Exception` rather than `ImportError` -- presumably because importing librosa can fail
# for reasons other than a missing package (e.g. a missing native library); confirm before narrowing.
except Exception as e:
    _librosa_can_be_imported = False
    _import_error = (
        f"Cannot import librosa because {e}. Make sure to correctly install librosa to be able to use it."
    )


from PIL import Image  # noqa: E402


class Mel(ConfigMixin, SchedulerMixin):
    """
    Converts audio into grayscale mel spectrogram images and back, one fixed-size slice of audio at a time.

    Parameters:
        x_res (`int`):
            x resolution of spectrogram (time).
        y_res (`int`):
            y resolution of spectrogram (frequency bins).
        sample_rate (`int`):
            Sample rate of audio.
        n_fft (`int`):
            Length of the FFT window, in samples.
        hop_length (`int`):
            Hop length (a higher number is recommended if `y_res` < 256).
        top_db (`int`):
            Decibel range of the spectrogram. It is passed to `librosa.power_to_db` as `top_db` and is the range
            that gets mapped onto the 0-255 pixel values.
        n_iter (`int`):
            Number of iterations for Griffin-Lim Mel inversion.
    """

    config_name = "mel_config.json"

    @register_to_config
    def __init__(
        self,
        x_res: int = 256,
        y_res: int = 256,
        sample_rate: int = 22050,
        n_fft: int = 2048,
        hop_length: int = 512,
        top_db: int = 80,
        n_iter: int = 32,
    ):
        self.hop_length = hop_length
        self.sr = sample_rate
        self.n_fft = n_fft
        self.top_db = top_db
        self.n_iter = n_iter
        # Must run after `hop_length` is set: the slice size is derived from it.
        self.set_resolution(x_res, y_res)
        # No audio is loaded until `load_audio` is called.
        self.audio = None

        # The module-level import guard only records the failure; it is reported here.
        if not _librosa_can_be_imported:
            raise ValueError(_import_error)

    def set_resolution(self, x_res: int, y_res: int):
        """Set resolution.

        Also updates the derived attributes `n_mels` (equal to `y_res`) and `slice_size` (the number of audio
        samples per slice, `x_res * hop_length - 1`).

        Args:
            x_res (`int`):
                x resolution of spectrogram (time).
            y_res (`int`):
                y resolution of spectrogram (frequency bins).
        """
        self.x_res = x_res
        self.y_res = y_res
        self.n_mels = self.y_res
        self.slice_size = self.x_res * self.hop_length - 1

    def load_audio(self, audio_file: str = None, raw_audio: np.ndarray = None):
        """Load audio.

        The loaded audio is stored in `self.audio`. Audio shorter than `x_res * hop_length` samples is padded with
        trailing zeros up to that length.

        Args:
            audio_file (`str`):
                An audio file that must be on disk due to [Librosa](https://librosa.org/) limitation. Takes
                precedence over `raw_audio` when both are given.
            raw_audio (`np.ndarray`):
                The raw audio as a NumPy array.

        Raises:
            `ValueError`: If neither `audio_file` nor `raw_audio` is provided.
        """
        if audio_file is not None:
            self.audio, _ = librosa.load(audio_file, mono=True, sr=self.sr)
        elif raw_audio is not None:
            self.audio = raw_audio
        else:
            # Fail with a clear message instead of a `TypeError` from `len(None)` below.
            raise ValueError("Either `audio_file` or `raw_audio` must be provided.")

        # Pad with silence if necessary.
        missing_samples = self.x_res * self.hop_length - len(self.audio)
        if missing_samples > 0:
            self.audio = np.concatenate([self.audio, np.zeros((missing_samples,))])

    def get_number_of_slices(self) -> int:
        """Get number of slices in audio.

        Returns:
            `int`:
                Number of spectrograms audio can be sliced into.
        """
        return len(self.audio) // self.slice_size

    def get_audio_slice(self, slice: int = 0) -> np.ndarray:
        """Get slice of audio.

        Args:
            slice (`int`):
                Slice number of audio (out of `get_number_of_slices()`).

        Returns:
            `np.ndarray`:
                The audio slice as a NumPy array.
        """
        return self.audio[self.slice_size * slice : self.slice_size * (slice + 1)]

    def get_sample_rate(self) -> int:
        """Get sample rate.

        Returns:
            `int`:
                Sample rate of audio.
        """
        return self.sr

    def audio_slice_to_image(self, slice: int) -> Image.Image:
        """Convert slice of audio to spectrogram.

        Args:
            slice (`int`):
                Slice number of audio to convert (out of `get_number_of_slices()`).

        Returns:
            `PIL Image`:
                A grayscale image of `x_res x y_res`.
        """
        S = librosa.feature.melspectrogram(
            y=self.get_audio_slice(slice), sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_mels=self.n_mels
        )
        # Decibels relative to the loudest value in the slice, floored at `top_db` below it.
        log_S = librosa.power_to_db(S, ref=np.max, top_db=self.top_db)
        # Shift by `top_db` and rescale to 0..255; adding 0.5 before the uint8 cast rounds to nearest.
        bytedata = (((log_S + self.top_db) * 255 / self.top_db).clip(0, 255) + 0.5).astype(np.uint8)
        image = Image.fromarray(bytedata)
        return image

    def image_to_audio(self, image: Image.Image) -> np.ndarray:
        """Converts spectrogram to audio.

        Args:
            image (`PIL Image`):
                A grayscale image of `x_res x y_res`.

        Returns:
            audio (`np.ndarray`):
                The audio as a NumPy array.
        """
        # One byte per pixel is assumed here: the raw buffer is reshaped to exactly (height, width).
        bytedata = np.frombuffer(image.tobytes(), dtype="uint8").reshape((image.height, image.width))
        # Inverse of the pixel scaling in `audio_slice_to_image`: 0..255 back to -top_db..0 dB.
        log_S = bytedata.astype("float") * self.top_db / 255 - self.top_db
        S = librosa.db_to_power(log_S)
        audio = librosa.feature.inverse.mel_to_audio(
            S, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_iter=self.n_iter
        )
        return audio