Commit 3e39fd09 (unverified), authored by Sanchit Gandhi, committed by GitHub
Browse files

[Audio Processor] Only pass sr to feat extractor (#20022)

* [Audio Processor] Only pass sr to feat extractor

* move out of if/else

* copy to other processors
parent fb1c8db7
...@@ -58,6 +58,7 @@ class MCTCTProcessor(ProcessorMixin): ...@@ -58,6 +58,7 @@ class MCTCTProcessor(ProcessorMixin):
audio = kwargs.pop("raw_speech") audio = kwargs.pop("raw_speech")
else: else:
audio = kwargs.pop("audio", None) audio = kwargs.pop("audio", None)
sampling_rate = kwargs.pop("sampling_rate", None)
text = kwargs.pop("text", None) text = kwargs.pop("text", None)
if len(args) > 0: if len(args) > 0:
audio = args[0] audio = args[0]
...@@ -67,7 +68,7 @@ class MCTCTProcessor(ProcessorMixin): ...@@ -67,7 +68,7 @@ class MCTCTProcessor(ProcessorMixin):
raise ValueError("You need to specify either an `audio` or `text` input to process.") raise ValueError("You need to specify either an `audio` or `text` input to process.")
if audio is not None: if audio is not None:
inputs = self.feature_extractor(audio, *args, **kwargs) inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
if text is not None: if text is not None:
encodings = self.tokenizer(text, **kwargs) encodings = self.tokenizer(text, **kwargs)
......
...@@ -61,6 +61,7 @@ class Speech2TextProcessor(ProcessorMixin): ...@@ -61,6 +61,7 @@ class Speech2TextProcessor(ProcessorMixin):
audio = kwargs.pop("raw_speech") audio = kwargs.pop("raw_speech")
else: else:
audio = kwargs.pop("audio", None) audio = kwargs.pop("audio", None)
sampling_rate = kwargs.pop("sampling_rate", None)
text = kwargs.pop("text", None) text = kwargs.pop("text", None)
if len(args) > 0: if len(args) > 0:
audio = args[0] audio = args[0]
...@@ -70,7 +71,7 @@ class Speech2TextProcessor(ProcessorMixin): ...@@ -70,7 +71,7 @@ class Speech2TextProcessor(ProcessorMixin):
raise ValueError("You need to specify either an `audio` or `text` input to process.") raise ValueError("You need to specify either an `audio` or `text` input to process.")
if audio is not None: if audio is not None:
inputs = self.feature_extractor(audio, *args, **kwargs) inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
if text is not None: if text is not None:
encodings = self.tokenizer(text, **kwargs) encodings = self.tokenizer(text, **kwargs)
......
...@@ -60,6 +60,7 @@ class Speech2Text2Processor(ProcessorMixin): ...@@ -60,6 +60,7 @@ class Speech2Text2Processor(ProcessorMixin):
audio = kwargs.pop("raw_speech") audio = kwargs.pop("raw_speech")
else: else:
audio = kwargs.pop("audio", None) audio = kwargs.pop("audio", None)
sampling_rate = kwargs.pop("sampling_rate", None)
text = kwargs.pop("text", None) text = kwargs.pop("text", None)
if len(args) > 0: if len(args) > 0:
audio = args[0] audio = args[0]
...@@ -69,7 +70,7 @@ class Speech2Text2Processor(ProcessorMixin): ...@@ -69,7 +70,7 @@ class Speech2Text2Processor(ProcessorMixin):
raise ValueError("You need to specify either an `audio` or `text` input to process.") raise ValueError("You need to specify either an `audio` or `text` input to process.")
if audio is not None: if audio is not None:
inputs = self.feature_extractor(audio, *args, **kwargs) inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
if text is not None: if text is not None:
encodings = self.tokenizer(text, **kwargs) encodings = self.tokenizer(text, **kwargs)
......
...@@ -80,6 +80,7 @@ class Wav2Vec2Processor(ProcessorMixin): ...@@ -80,6 +80,7 @@ class Wav2Vec2Processor(ProcessorMixin):
audio = kwargs.pop("raw_speech") audio = kwargs.pop("raw_speech")
else: else:
audio = kwargs.pop("audio", None) audio = kwargs.pop("audio", None)
sampling_rate = kwargs.pop("sampling_rate", None)
text = kwargs.pop("text", None) text = kwargs.pop("text", None)
if len(args) > 0: if len(args) > 0:
audio = args[0] audio = args[0]
...@@ -89,7 +90,7 @@ class Wav2Vec2Processor(ProcessorMixin): ...@@ -89,7 +90,7 @@ class Wav2Vec2Processor(ProcessorMixin):
raise ValueError("You need to specify either an `audio` or `text` input to process.") raise ValueError("You need to specify either an `audio` or `text` input to process.")
if audio is not None: if audio is not None:
inputs = self.feature_extractor(audio, *args, **kwargs) inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
if text is not None: if text is not None:
encodings = self.tokenizer(text, **kwargs) encodings = self.tokenizer(text, **kwargs)
......
...@@ -228,6 +228,7 @@ class Wav2Vec2ProcessorWithLM(ProcessorMixin): ...@@ -228,6 +228,7 @@ class Wav2Vec2ProcessorWithLM(ProcessorMixin):
audio = kwargs.pop("raw_speech") audio = kwargs.pop("raw_speech")
else: else:
audio = kwargs.pop("audio", None) audio = kwargs.pop("audio", None)
sampling_rate = kwargs.pop("sampling_rate", None)
text = kwargs.pop("text", None) text = kwargs.pop("text", None)
if len(args) > 0: if len(args) > 0:
audio = args[0] audio = args[0]
...@@ -237,7 +238,7 @@ class Wav2Vec2ProcessorWithLM(ProcessorMixin): ...@@ -237,7 +238,7 @@ class Wav2Vec2ProcessorWithLM(ProcessorMixin):
raise ValueError("You need to specify either an `audio` or `text` input to process.") raise ValueError("You need to specify either an `audio` or `text` input to process.")
if audio is not None: if audio is not None:
inputs = self.feature_extractor(audio, *args, **kwargs) inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
if text is not None: if text is not None:
encodings = self.tokenizer(text, **kwargs) encodings = self.tokenizer(text, **kwargs)
......
...@@ -85,6 +85,7 @@ class WhisperProcessor(ProcessorMixin): ...@@ -85,6 +85,7 @@ class WhisperProcessor(ProcessorMixin):
return self.current_processor(*args, **kwargs) return self.current_processor(*args, **kwargs)
audio = kwargs.pop("audio", None) audio = kwargs.pop("audio", None)
sampling_rate = kwargs.pop("sampling_rate", None)
text = kwargs.pop("text", None) text = kwargs.pop("text", None)
if len(args) > 0: if len(args) > 0:
audio = args[0] audio = args[0]
...@@ -94,7 +95,7 @@ class WhisperProcessor(ProcessorMixin): ...@@ -94,7 +95,7 @@ class WhisperProcessor(ProcessorMixin):
raise ValueError("You need to specify either an `audio` or `text` input to process.") raise ValueError("You need to specify either an `audio` or `text` input to process.")
if audio is not None: if audio is not None:
inputs = self.feature_extractor(audio, *args, **kwargs) inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
if text is not None: if text is not None:
encodings = self.tokenizer(text, **kwargs) encodings = self.tokenizer(text, **kwargs)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment