Unverified Commit f720aec0 authored by Vincent QB, committed by GitHub

lint. (#266)

parent 962c6b0f
@@ -19,18 +19,16 @@ from speech to silence or vice versa.
 from collections import deque

-import librosa
 import numpy as np
 import torch
-from six.moves import queue

+import librosa
 import pyaudio
 import torchaudio
+from six.moves import queue


 def compute_spectral_flatness(frame, epsilon=0.01):
-    n = frame.nonzero().size(0)
     # epsilon protects against log(0)
     geometric_mean = torch.exp((frame + epsilon).log().mean(-1)) - epsilon
     arithmetic_mean = frame.mean(-1)
@@ -240,8 +238,6 @@ def get_microphone_chunks(
 ):
     vad = VoiceActivityDetection()

-    speech_frames = []
-    chunks = []
     cumulated = []
     precumulated = deque(maxlen=precumulate)
@@ -250,7 +246,6 @@ def get_microphone_chunks(
     audio_generator = stream.generator()
     chunk_length = stream._chunk
     waveform = torch.zeros(max_to_visualize * chunk_length)
-    speechform = torch.zeros(max_to_visualize * chunk_length)

     for chunk in audio_generator:
         # Is speech?
...
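The first file in this commit defines compute_spectral_flatness, which scores how noise-like a magnitude-spectrum frame is as the ratio of its geometric mean to its arithmetic mean, with epsilon guarding against log(0). The hunk above is truncated before the function returns, so the snippet below is only a minimal sketch that assumes the function returns that ratio; the toy frames are illustrative and not from the repository.

import torch


def compute_spectral_flatness(frame, epsilon=0.01):
    # Ratio of geometric to arithmetic mean over the last dimension;
    # epsilon protects against log(0) on zero-valued bins.
    geometric_mean = torch.exp((frame + epsilon).log().mean(-1)) - epsilon
    arithmetic_mean = frame.mean(-1)
    return geometric_mean / arithmetic_mean  # assumed return value; not shown in the hunk


# Flat (noise-like) spectra score higher than peaky (tonal) ones.
noise_frame = torch.rand(1024)
tonal_frame = torch.zeros(1024)
tonal_frame[10] = 1.0
print(compute_spectral_flatness(noise_frame))  # noticeably larger
print(compute_spectral_flatness(tonal_frame))  # close to zero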
@@ -11,44 +11,93 @@ class ASRTest(unittest.TestCase):
     logger.setLevel(logging.INFO)

     arguments_dict = {
-        'path': '/scratch/jamarshon/downloads/model.pt',
-        'input_file': '/scratch/jamarshon/audio/examples/interactive_asr/data/sample.wav',
-        'data': '/scratch/jamarshon/downloads',
-        'user_dir': '/scratch/jamarshon/fairseq-py/examples/speech_recognition',
-        'no_progress_bar': False, 'log_interval': 1000, 'log_format': None,
-        'tensorboard_logdir': '', 'tbmf_wrapper': False, 'seed': 1, 'cpu': True,
-        'fp16': False, 'memory_efficient_fp16': False, 'fp16_init_scale': 128,
-        'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0,
-        'min_loss_scale': 0.0001, 'threshold_loss_scale': None,
-        'criterion': 'cross_entropy', 'tokenizer': None, 'bpe': None, 'optimizer':
-        'nag', 'lr_scheduler': 'fixed', 'task': 'speech_recognition', 'num_workers': 0,
-        'skip_invalid_size_inputs_valid_test': False, 'max_tokens': 10000000,
-        'max_sentences': None, 'required_batch_size_multiple': 8, 'dataset_impl': None,
-        'gen_subset': 'test', 'num_shards': 1, 'shard_id': 0,
-        'remove_bpe': None, 'quiet': False, 'model_overrides': '{}',
-        'results_path': None, 'beam': 40, 'nbest': 1, 'max_len_a': 0,
-        'max_len_b': 200, 'min_len': 1, 'match_source_len': False,
-        'no_early_stop': False, 'unnormalized': False, 'no_beamable_mm': False,
-        'lenpen': 1, 'unkpen': 0, 'replace_unk': None, 'sacrebleu': False,
-        'score_reference': False, 'prefix_size': 0, 'no_repeat_ngram_size': 0,
-        'sampling': False, 'sampling_topk': -1, 'sampling_topp': -1.0,
-        'temperature': 1.0, 'diverse_beam_groups': -1, 'diverse_beam_strength': 0.5,
-        'print_alignment': False, 'ctc': False,
-        'rnnt': False, 'kspmodel': None, 'wfstlm': None, 'rnnt_decoding_type': 'greedy',
-        'lm_weight': 0.2, 'rnnt_len_penalty': -0.5, 'momentum': 0.99, 'weight_decay': 0.0,
-        'force_anneal': None, 'lr_shrink': 0.1, 'warmup_updates': 0}
+        "path": "/scratch/jamarshon/downloads/model.pt",
+        "input_file": "/scratch/jamarshon/audio/examples/interactive_asr/data/sample.wav",
+        "data": "/scratch/jamarshon/downloads",
+        "user_dir": "/scratch/jamarshon/fairseq-py/examples/speech_recognition",
+        "no_progress_bar": False,
+        "log_interval": 1000,
+        "log_format": None,
+        "tensorboard_logdir": "",
+        "tbmf_wrapper": False,
+        "seed": 1,
+        "cpu": True,
+        "fp16": False,
+        "memory_efficient_fp16": False,
+        "fp16_init_scale": 128,
+        "fp16_scale_window": None,
+        "fp16_scale_tolerance": 0.0,
+        "min_loss_scale": 0.0001,
+        "threshold_loss_scale": None,
+        "criterion": "cross_entropy",
+        "tokenizer": None,
+        "bpe": None,
+        "optimizer": "nag",
+        "lr_scheduler": "fixed",
+        "task": "speech_recognition",
+        "num_workers": 0,
+        "skip_invalid_size_inputs_valid_test": False,
+        "max_tokens": 10000000,
+        "max_sentences": None,
+        "required_batch_size_multiple": 8,
+        "dataset_impl": None,
+        "gen_subset": "test",
+        "num_shards": 1,
+        "shard_id": 0,
+        "remove_bpe": None,
+        "quiet": False,
+        "model_overrides": "{}",
+        "results_path": None,
+        "beam": 40,
+        "nbest": 1,
+        "max_len_a": 0,
+        "max_len_b": 200,
+        "min_len": 1,
+        "match_source_len": False,
+        "no_early_stop": False,
+        "unnormalized": False,
+        "no_beamable_mm": False,
+        "lenpen": 1,
+        "unkpen": 0,
+        "replace_unk": None,
+        "sacrebleu": False,
+        "score_reference": False,
+        "prefix_size": 0,
+        "no_repeat_ngram_size": 0,
+        "sampling": False,
+        "sampling_topk": -1,
+        "sampling_topp": -1.0,
+        "temperature": 1.0,
+        "diverse_beam_groups": -1,
+        "diverse_beam_strength": 0.5,
+        "print_alignment": False,
+        "ctc": False,
+        "rnnt": False,
+        "kspmodel": None,
+        "wfstlm": None,
+        "rnnt_decoding_type": "greedy",
+        "lm_weight": 0.2,
+        "rnnt_len_penalty": -0.5,
+        "momentum": 0.99,
+        "weight_decay": 0.0,
+        "force_anneal": None,
+        "lr_shrink": 0.1,
+        "warmup_updates": 0,
+    }

-    arguments_dict['path'] = os.environ.get('ASR_MODEL_PATH', None)
-    arguments_dict['input_file'] = os.environ.get('ASR_INPUT_FILE', None)
-    arguments_dict['data'] = os.environ.get('ASR_DATA_PATH', None)
-    arguments_dict['user_dir'] = os.environ.get('ASR_USER_DIR', None)
+    arguments_dict["path"] = os.environ.get("ASR_MODEL_PATH", None)
+    arguments_dict["input_file"] = os.environ.get("ASR_INPUT_FILE", None)
+    arguments_dict["data"] = os.environ.get("ASR_DATA_PATH", None)
+    arguments_dict["user_dir"] = os.environ.get("ASR_USER_DIR", None)

     args = argparse.Namespace(**arguments_dict)

     def test_transcribe_file(self):
         task, generator, models, sp, tgt_dict = setup_asr(self.args, self.logger)
-        _, transcription = transcribe_file(self.args, task, generator, models, sp, tgt_dict)
+        _, transcription = transcribe_file(
+            self.args, task, generator, models, sp, tgt_dict
+        )

-        expected_transcription = [['THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG']]
+        expected_transcription = [["THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG"]]
         self.assertEqual(transcription, expected_transcription, msg=str(transcription))
...
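The test above builds a fairseq-style argument namespace from a plain dict and then overwrites the machine-specific entries from environment variables, so the hard-coded /scratch paths are only fallbacks (and become None when the variable is unset). A minimal sketch of that pattern, using a made-up subset of keys rather than the full argument set:

import argparse
import os

# Defaults, including a machine-specific path that is expected to be overridden.
arguments_dict = {"beam": 40, "nbest": 1, "path": "/scratch/example/model.pt"}

# The environment variable wins when set; otherwise the entry becomes None,
# mirroring the os.environ.get(..., None) calls in the test.
arguments_dict["path"] = os.environ.get("ASR_MODEL_PATH", None)

# Namespace gives attribute access, as if argparse had parsed command-line flags.
args = argparse.Namespace(**arguments_dict)
print(args.beam, args.path)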
@@ -53,7 +53,6 @@ def read_audio(fp, downsample=True):
 def load_txts(dir):
     """Create a dictionary with all the text of the audio transcriptions."""
     utterences = dict()
-    txts = []
     dir = os.path.expanduser(dir)
     for target in sorted(os.listdir(dir)):
         d = os.path.join(dir, target)
...
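The last hunk only drops the unused txts list from load_txts; the diff context ends right after the directory walk starts. For orientation, a hypothetical completion of such a loader is sketched below. The real function's file layout and key format are not visible in this diff, so everything past the shown lines is an assumption.

import os


def load_txts(dir):
    """Create a dictionary with all the text of the audio transcriptions."""
    utterences = dict()
    dir = os.path.expanduser(dir)
    for target in sorted(os.listdir(dir)):
        d = os.path.join(dir, target)
        if not os.path.isdir(d):
            continue
        # Hypothetical continuation: collect every .txt transcription under the
        # directory, keyed by the file name without its extension.
        for root, _, fnames in sorted(os.walk(d)):
            for fname in sorted(fnames):
                if fname.endswith(".txt"):
                    with open(os.path.join(root, fname)) as f:
                        utterences[os.path.splitext(fname)[0]] = f.readline().strip()
    return utterences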