Commit e95c8f5a authored by David Pollack, committed by Soumith Chintala
Browse files

fixes for OSX, behavior still inconsistent

parent 301e2e98
...@@ -46,35 +46,35 @@ class Test_SoxEffectsChain(unittest.TestCase): ...@@ -46,35 +46,35 @@ class Test_SoxEffectsChain(unittest.TestCase):
def test_ulaw_and_siginfo(self): def test_ulaw_and_siginfo(self):
si_out = torchaudio.sox_signalinfo_t() si_out = torchaudio.sox_signalinfo_t()
ei_out = torchaudio.sox_encodinginfo_t() ei_out = torchaudio.sox_encodinginfo_t()
si_out.rate = 16000
si_out.channels = 1
si_out.precision = 8 si_out.precision = 8
ei_out.encoding = torchaudio.get_sox_encoding_t(9) ei_out.encoding = torchaudio.get_sox_encoding_t(9)
ei_out.bits_per_sample = 8 ei_out.bits_per_sample = 8
si_in, ei_in = torchaudio.info(self.test_filepath) si_in, ei_in = torchaudio.info(self.test_filepath)
si_out.rate = 44100
si_out.channels = 2
E = torchaudio.sox_effects.SoxEffectsChain(out_siginfo=si_out, out_encinfo=ei_out) E = torchaudio.sox_effects.SoxEffectsChain(out_siginfo=si_out, out_encinfo=ei_out)
E.set_input_file(self.test_filepath) E.set_input_file(self.test_filepath)
x, sr = E.sox_build_flow_effects() x, sr = E.sox_build_flow_effects()
# Note: the sample rate is reported as "changed", but no downsampling occurred # Note: the output was encoded into ulaw because the
# also the number of channels has not changed. Run rate and channels effects
# to make those changes. However, the output was encoded into ulaw because the
# number of unique values in the output is less than 256. # number of unique values in the output is less than 256.
self.assertLess(x.unique().size(0), 2**8) self.assertLess(x.unique().size(0), 2**8)
self.assertEqual(x.size(0), si_in.channels)
self.assertEqual(sr, si_out.rate)
self.assertEqual(x.numel(), si_in.length) self.assertEqual(x.numel(), si_in.length)
def test_band_chorus(self): def test_band_chorus(self):
si_in, ei_in = torchaudio.info(self.test_filepath) si_in, ei_in = torchaudio.info(self.test_filepath)
ei_in.encoding = torchaudio.get_sox_encoding_t(1)
E = torchaudio.sox_effects.SoxEffectsChain(out_encinfo=ei_in, out_siginfo=si_in) E = torchaudio.sox_effects.SoxEffectsChain(out_encinfo=ei_in, out_siginfo=si_in)
E.set_input_file(self.test_filepath) E.set_input_file(self.test_filepath)
E.append_effect_to_chain("band", ["-n", "10k", "3.5k"]) E.append_effect_to_chain("band", ["-n", "10k", "3.5k"])
E.append_effect_to_chain("chorus", [.5, .7, 55, 0.4, .25, 2, '-s']) E.append_effect_to_chain("chorus", [.5, .7, 55, 0.4, .25, 2, '-s'])
E.append_effect_to_chain("rate", [si_in.rate])
E.append_effect_to_chain("channels", [si_in.channels])
x, sr = E.sox_build_flow_effects() x, sr = E.sox_build_flow_effects()
#print(x.size(), sr) #print(x.size(), sr)
def test_synth(self): def test_synth(self):
si_in, ei_in = torchaudio.info(self.test_filepath) si_in, ei_in = torchaudio.info(self.test_filepath)
ei_in.encoding = torchaudio.get_sox_encoding_t(1)
E = torchaudio.sox_effects.SoxEffectsChain(out_encinfo=ei_in, out_siginfo=si_in) E = torchaudio.sox_effects.SoxEffectsChain(out_encinfo=ei_in, out_siginfo=si_in)
E.set_input_file(self.test_filepath) E.set_input_file(self.test_filepath)
E.append_effect_to_chain("synth", ["1", "pinknoise", "mix"]) E.append_effect_to_chain("synth", ["1", "pinknoise", "mix"])
......
...@@ -259,17 +259,18 @@ int build_flow_effects(const std::string& file_name, ...@@ -259,17 +259,18 @@ int build_flow_effects(const std::string& file_name,
// create interm_signal for effects, intermediate steps change this in-place // create interm_signal for effects, intermediate steps change this in-place
sox_signalinfo_t interm_signal = input->signal; sox_signalinfo_t interm_signal = input->signal;
// create buffer and buffer_size for output in memwrite
char* buffer;
size_t buffer_size;
#ifdef __APPLE__ #ifdef __APPLE__
// According to Mozilla Deepspeech sox_open_memstream_write doesn't work // According to Mozilla Deepspeech sox_open_memstream_write doesn't work
// with OSX // with OSX
char* tmp_name = tmpnam(NULL); char tmp_name[] = "/tmp/fileXXXXXX";
assert(tmp_name); int tmp_fd = mkstemp(tmp_name);
sox_format_t* output = sox_open_write(tmp_name, &target_signal, close(tmp_fd);
&target_encoding, file_type, nullptr, nullptr); sox_format_t* output = sox_open_write(tmp_name, target_signal,
target_encoding, file_type, nullptr, nullptr);
#else #else
// create buffer and buffer_size for output in memwrite
char* buffer;
size_t buffer_size;
// in-memory descriptor (this may not work for OSX) // in-memory descriptor (this may not work for OSX)
sox_format_t* output = sox_open_memstream_write(&buffer, sox_format_t* output = sox_open_memstream_write(&buffer,
&buffer_size, &buffer_size,
...@@ -303,10 +304,13 @@ int build_flow_effects(const std::string& file_name, ...@@ -303,10 +304,13 @@ int build_flow_effects(const std::string& file_name,
sox_args[i] = (char*) tae.eopts[i].c_str(); sox_args[i] = (char*) tae.eopts[i].c_str();
} }
if(sox_effect_options(e, num_opts, sox_args) != SOX_SUCCESS) { if(sox_effect_options(e, num_opts, sox_args) != SOX_SUCCESS) {
#ifdef __APPLE__
unlink(tmp_name);
#endif
throw std::runtime_error("invalid effect options, see SoX docs for details"); throw std::runtime_error("invalid effect options, see SoX docs for details");
} }
} }
sox_add_effect(chain, e, &interm_signal, &input->signal); sox_add_effect(chain, e, &interm_signal, &output->signal);
free(e); free(e);
} }
...@@ -324,6 +328,24 @@ int build_flow_effects(const std::string& file_name, ...@@ -324,6 +328,24 @@ int build_flow_effects(const std::string& file_name,
sox_close(output); sox_close(output);
sox_close(input); sox_close(input);
int sr;
// Read the in-memory audio buffer or temp file that we just wrote.
#ifdef __APPLE__
if (target_signal->length > 0) {
if (target_signal->channels != output->signal.channels) {
//std::cout << "output: " << output->signal.channels << "|" << output->signal.length << "\n";
//std::cout << "target: " << target_signal->channels << "|" << target_signal->length << "\n";
unlink(tmp_name);
throw std::runtime_error("unexpected number of audio channels");
}
sr = read_audio_file(tmp_name, otensor, ch_first, 0, 0,
&output->signal, &output->encoding, file_type);
} else {
sr = read_audio_file(tmp_name, otensor, ch_first, 0, 0,
target_signal, target_encoding, file_type);
}
unlink(tmp_name);
#else
// Resize output tensor to desired dimensions, different effects result in output->signal.length, // Resize output tensor to desired dimensions, different effects result in output->signal.length,
// interm_signal.length and buffer size being inconsistent with the result of the file output. // interm_signal.length and buffer size being inconsistent with the result of the file output.
// We prioritize in the order: output->signal.length > interm_signal.length > buffer_size // We prioritize in the order: output->signal.length > interm_signal.length > buffer_size
...@@ -341,14 +363,7 @@ int build_flow_effects(const std::string& file_name, ...@@ -341,14 +363,7 @@ int build_flow_effects(const std::string& file_name,
} }
otensor.resize_({ns/nc, nc}); otensor.resize_({ns/nc, nc});
otensor = otensor.contiguous(); otensor = otensor.contiguous();
// Read the in-memory audio buffer or temp file that we just wrote.
#ifdef __APPLE__
buffer_size = (size_t) ns * 2; // sizeof(char)? dependent on bit precision?
input = sox_open_read(tmp_name, target_signal, target_encoding, file_type);
#else
input = sox_open_mem_read(buffer, buffer_size, target_signal, target_encoding, file_type); input = sox_open_mem_read(buffer, buffer_size, target_signal, target_encoding, file_type);
#endif
std::vector<sox_sample_t> samples(buffer_size); std::vector<sox_sample_t> samples(buffer_size);
const int64_t samples_read = sox_read(input, samples.data(), buffer_size); const int64_t samples_read = sox_read(input, samples.data(), buffer_size);
// buffer size is twice signal length, but half the buffer is empty so correct // buffer size is twice signal length, but half the buffer is empty so correct
...@@ -358,19 +373,17 @@ int build_flow_effects(const std::string& file_name, ...@@ -358,19 +373,17 @@ int build_flow_effects(const std::string& file_name,
auto* data = otensor.data<scalar_t>(); auto* data = otensor.data<scalar_t>();
std::copy(samples.begin(), samples.begin() + samples_read, data); std::copy(samples.begin(), samples.begin() + samples_read, data);
}); });
// free buffer and quit sox
sox_close(input); sox_close(input);
#ifdef __APPLE__
unlink(tmp_name)
#endif
free(buffer);
if (ch_first) { if (ch_first) {
otensor.transpose_(1, 0); otensor.transpose_(1, 0);
} }
sr = target_signal->rate;
// free buffer
free(buffer);
#endif
return (int) target_signal->rate; return sr;
} }
} // namespace audio } // namespace audio
} // namespace torch } // namespace torch
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment