speech.py 8.71 KB
Newer Older
SWHL's avatar
SWHL committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the speech segment class."""
import numpy as np

from .audio import AudioSegment


class SpeechSegment(AudioSegment):
    """Speech Segment with Text

    An AudioSegment subclass that additionally carries the transcript
    string and, optionally, its tokenized form (tokens and token ids).

    Args:
        AudioSegment (AudioSegment): Audio Segment
    """

    def __init__(self,
                 samples,
                 sample_rate,
                 transcript,
                 tokens=None,
                 token_ids=None):
        """Speech segment abstraction, a subclass of AudioSegment,
            with an additional transcript.

        Args:
            samples (ndarray.float32): Audio samples [num_samples x num_channels].
            sample_rate (int): Audio sample rate.
            transcript (str): Transcript text for the speech.
            tokens (List[str], optional): Transcript tokens for the speech.
            token_ids (List[int], optional): Transcript token ids for the speech.
        """
        AudioSegment.__init__(self, samples, sample_rate)
        self._transcript = transcript
        # `tokens` and `token_ids` must be initialized together:
        # `has_token` only reports True when both are present and non-empty.
        self._tokens = tokens
        self._token_ids = token_ids

    def __eq__(self, other):
        """Return whether two objects are equal.

        Equality requires matching audio (per AudioSegment.__eq__) and
        matching transcripts; tokens/token ids are compared only when
        BOTH sides carry them.

        Returns:
            bool: True, when equal to other
        """
        if not AudioSegment.__eq__(self, other):
            return False
        if self._transcript != other._transcript:
            return False
        if self.has_token and other.has_token:
            if self._tokens != other._tokens:
                return False
            if self._token_ids != other._token_ids:
                return False
        return True

    def __ne__(self, other):
        """Return whether two objects are unequal."""
        return not self.__eq__(other)

    @classmethod
    def from_file(cls,
                  filepath,
                  transcript,
                  tokens=None,
                  token_ids=None,
                  infos=None):
        """Create speech segment from audio file and corresponding transcript.

        Args:
            filepath (str|file): Filepath or file object to audio file.
            transcript (str): Transcript text for the speech.
            tokens (List[str], optional): text tokens. Defaults to None.
            token_ids (List[int], optional): text token ids. Defaults to None.
            infos (TarLocalData, optional): tar2obj and tar2infos. Defaults to None.

        Returns:
            SpeechSegment: Speech segment instance.
        """
        audio = AudioSegment.from_file(filepath, infos)
        return cls(audio.samples, audio.sample_rate, transcript, tokens,
                   token_ids)

    @classmethod
    def from_bytes(cls, bytes, transcript, tokens=None, token_ids=None):
        """Create speech segment from a byte string and corresponding
        transcript.

        Args:
            bytes (bytes): Byte string containing audio samples.
                (Parameter name shadows the builtin; kept for
                backward compatibility with keyword callers.)
            transcript (str): Transcript text for the speech.
            tokens (List[str], optional): text tokens. Defaults to None.
            token_ids (List[int], optional): text token ids. Defaults to None.

        Returns:
            SpeechSegment: Speech segment instance.
        """
        audio = AudioSegment.from_bytes(bytes)
        return cls(audio.samples, audio.sample_rate, transcript, tokens,
                   token_ids)

    @classmethod
    def from_pcm(cls, samples, sample_rate, transcript,
                 tokens=None, token_ids=None):
        """Create speech segment from pcm on online mode
        Args:
            samples (numpy.ndarray): Audio samples [num_samples x num_channels].
            sample_rate (int): Audio sample rate.
            transcript (str): Transcript text for the speech.
            tokens (List[str], optional): text tokens. Defaults to None.
            token_ids (List[int], optional): text token ids. Defaults to None.
        Returns:
            SpeechSegment: Speech segment instance.
        """
        audio = AudioSegment.from_pcm(samples, sample_rate)
        return cls(audio.samples, audio.sample_rate, transcript, tokens,
                   token_ids)

    @classmethod
    def concatenate(cls, *segments):
        """Concatenate an arbitrary number of speech segments together, both
        audio and transcript will be concatenated.

        :param *segments: Input speech segments to be concatenated.
        :type *segments: tuple of SpeechSegment
        :return: Speech segment instance.
        :rtype: SpeechSegment
        :raises ValueError: If the number of segments is zero, or if the
                            sample_rate of any two segments does not match.
        :raises TypeError: If any segment is not SpeechSegment instance.
        """
        if len(segments) == 0:
            raise ValueError("No speech segments are given to concatenate.")
        sample_rate = segments[0]._sample_rate
        transcripts = ""
        tokens = []
        token_ids = []
        for seg in segments:
            # Type-check first so non-segment inputs raise the documented
            # TypeError rather than an AttributeError on `_sample_rate`.
            if not isinstance(seg, cls):
                raise TypeError("Only speech segments of the same type "
                                "instance can be concatenated.")
            if sample_rate != seg._sample_rate:
                raise ValueError("Can't concatenate segments with "
                                 "different sample rates")
            transcripts += seg._transcript
            # Bug fix: was `self.has_token` — `self` does not exist in a
            # classmethod and raised NameError for token-bearing segments.
            if seg.has_token:
                tokens += seg._tokens
                token_ids += seg._token_ids
        samples = np.concatenate([seg.samples for seg in segments])
        return cls(samples, sample_rate, transcripts, tokens, token_ids)

    @classmethod
    def slice_from_file(cls,
                        filepath,
                        transcript,
                        tokens=None,
                        token_ids=None,
                        start=None,
                        end=None):
        """Loads a small section of an speech without having to load
        the entire file into the memory which can be incredibly wasteful.

        :param filepath: Filepath or file object to audio file.
        :type filepath: str|file
        :param start: Start time in seconds. If start is negative, it wraps
                      around from the end. If not provided, this function
                      reads from the very beginning.
        :type start: float
        :param end: End time in seconds. If end is negative, it wraps around
                    from the end. If not provided, the default behavior is
                    to read to the end of the file.
        :type end: float
        :param transcript: Transcript text for the speech. if not provided,
                           the defaults is an empty string.
        :type transcript: str
        :return: SpeechSegment instance of the specified slice of the input
                 speech file.
        :rtype: SpeechSegment
        """
        audio = AudioSegment.slice_from_file(filepath, start, end)
        return cls(audio.samples, audio.sample_rate, transcript, tokens,
                   token_ids)

    @classmethod
    def make_silence(cls, duration, sample_rate):
        """Creates a silent speech segment of the given duration and
        sample rate, transcript will be an empty string.

        Args:
            duration (float): Length of silence in seconds.
            sample_rate (float): Sample rate.

        Returns:
            SpeechSegment: Silence of the given duration.
        """
        audio = AudioSegment.make_silence(duration, sample_rate)
        return cls(audio.samples, audio.sample_rate, "")

    @property
    def has_token(self):
        """bool: True only when BOTH tokens and token ids are present
        and non-empty (truthy)."""
        if self._tokens and self._token_ids:
            return True
        return False

    @property
    def transcript(self):
        """Return the transcript text.

        Returns:
            str: Transcript text for the speech.
        """

        return self._transcript

    @property
    def tokens(self):
        """Return the transcript text tokens.

        Returns:
            List[str]: text tokens.
        """
        return self._tokens

    @property
    def token_ids(self):
        """Return the transcript text token ids.

        Returns:
            List[int]: text token ids.
        """
        return self._token_ids