effects.cpp 7.36 KB
Newer Older
1
#include <sox.h>
moto's avatar
moto committed
2
3
4
#include <torchaudio/csrc/sox/effects.h>
#include <torchaudio/csrc/sox/effects_chain.h>
#include <torchaudio/csrc/sox/utils.h>
5

moto's avatar
moto committed
6
using namespace torchaudio::sox_utils;
7
8
9
10
11
12
13
14

namespace torchaudio {
namespace sox_effects {

namespace {

// Lifecycle of the libsox effects resource managed by this file.
enum SoxEffectsResourceState {
  NotInitialized, // sox_init() has not succeeded yet.
  Initialized, // sox_init() succeeded and sox_quit() has not succeeded yet.
  ShutDown // sox_quit() succeeded.
};

// Current lifecycle state. initialize_sox_effects() and
// shutdown_sox_effects() read and update it while holding
// SOX_RESOUCE_STATE_MUTEX.
SoxEffectsResourceState SOX_RESOURCE_STATE = NotInitialized;

// Guards SOX_RESOURCE_STATE.
// NOTE: the identifier is spelled "RESOUCE" (sic). The spelling is kept
// because other functions in this file refer to the mutex by this name.
std::mutex SOX_RESOUCE_STATE_MUTEX;

} // namespace

void initialize_sox_effects() {
moto's avatar
moto committed
20
21
22
23
24
25
26
27
28
29
30
31
32
  const std::lock_guard<std::mutex> lock(SOX_RESOUCE_STATE_MUTEX);

  switch (SOX_RESOURCE_STATE) {
    case NotInitialized:
      if (sox_init() != SOX_SUCCESS) {
        throw std::runtime_error("Failed to initialize sox effects.");
      };
      SOX_RESOURCE_STATE = Initialized;
    case Initialized:
      break;
    case ShutDown:
      throw std::runtime_error(
          "SoX Effects has been shut down. Cannot initialize again.");
33
34
35
36
  }
};

void shutdown_sox_effects() {
moto's avatar
moto committed
37
38
39
40
41
42
43
44
45
46
47
48
49
  const std::lock_guard<std::mutex> lock(SOX_RESOUCE_STATE_MUTEX);

  switch (SOX_RESOURCE_STATE) {
    case NotInitialized:
      throw std::runtime_error(
          "SoX Effects is not initialized. Cannot shutdown.");
    case Initialized:
      if (sox_quit() != SOX_SUCCESS) {
        throw std::runtime_error("Failed to initialize sox effects.");
      };
      SOX_RESOURCE_STATE = ShutDown;
    case ShutDown:
      break;
50
  }
moto's avatar
moto committed
51
52
53
54
55
56
57
58
59
60
61
}

/// Apply a chain of sox effects to an in-memory signal.
///
/// @param input_signal Signal whose tensor is validated with
///        validate_input_tensor() and fed to the effects chain.
/// @param effects Effects to add to the chain, in order. Each inner vector is
///        passed as-is to SoxEffectsChain::addEffect().
/// @return A new TensorSignal holding the processed samples, the output sample
///         rate reported by the chain, and the same channels_first flag as
///         the input.
c10::intrusive_ptr<TensorSignal> apply_effects_tensor(
    const c10::intrusive_ptr<TensorSignal>& input_signal,
    std::vector<std::vector<std::string>> effects) {
  auto in_tensor = input_signal->getTensor();
  validate_input_tensor(in_tensor);

  // Create SoxEffectsChain. Input and output use the same "wav" encoding
  // derived from the input tensor's dtype.
  const auto dtype = in_tensor.dtype();
  torchaudio::sox_effects_chain::SoxEffectsChain chain(
      /*input_encoding=*/get_encodinginfo("wav", dtype),
      /*output_encoding=*/get_encodinginfo("wav", dtype));

  // Prepare output buffer. The reservation is only an initial capacity based
  // on the number of input elements.
  std::vector<sox_sample_t> out_buffer;
  out_buffer.reserve(in_tensor.numel());

  // Build and run effects chain
  chain.addInputTensor(input_signal.get());
  for (const auto& effect : effects) {
    chain.addEffect(effect);
  }
  chain.addOutputBuffer(&out_buffer);
  chain.run();

  // Create tensor from buffer
  const auto channels_first = input_signal->getChannelsFirst();
  auto out_tensor = convert_to_tensor(
      /*buffer=*/out_buffer.data(),
      /*num_samples=*/out_buffer.size(),
      /*num_channels=*/chain.getOutputNumChannels(),
      dtype,
      /*normalize=*/false,
      channels_first);

  return c10::make_intrusive<TensorSignal>(
      out_tensor, chain.getOutputSampleRate(), channels_first);
}

moto's avatar
moto committed
91
92
93
/// Load audio from a file path and apply a chain of sox effects to it.
///
/// @param path Path handed to sox_open_read().
/// @param effects Effects to add to the chain, in order. Each inner vector is
///        passed as-is to SoxEffectsChain::addEffect().
/// @param normalize Forwarded to convert_to_tensor(); treated as true when no
///        value is provided.
/// @param channels_first Layout flag for the resulting tensor; treated as
///        true when no value is provided.
/// @param format Optional file type passed to sox_open_read(). When no value
///        is provided, nullptr is passed instead.
/// @return A new TensorSignal holding the processed samples, the output sample
///         rate reported by the chain, and the resolved channels_first flag.
c10::intrusive_ptr<TensorSignal> apply_effects_file(
    const std::string path,
    std::vector<std::vector<std::string>> effects,
    c10::optional<bool>& normalize,
    c10::optional<bool>& channels_first,
    c10::optional<std::string>& format) {
  // Open input file
  SoxFormat sf(sox_open_read(
      path.c_str(),
      /*signal=*/nullptr,
      /*encoding=*/nullptr,
      /*filetype=*/format.has_value() ? format.value().c_str() : nullptr));

  validate_input_file(sf);

  const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);

  // Prepare output. The reservation is only an initial capacity based on the
  // length reported for the input signal.
  std::vector<sox_sample_t> out_buffer;
  out_buffer.reserve(sf->signal.length);

  // Create and run SoxEffectsChain
  torchaudio::sox_effects_chain::SoxEffectsChain chain(
      /*input_encoding=*/sf->encoding,
      /*output_encoding=*/get_encodinginfo("wav", dtype));

  chain.addInputFile(sf);
  for (const auto& effect : effects) {
    chain.addEffect(effect);
  }
  chain.addOutputBuffer(&out_buffer);
  chain.run();

  // Create tensor from buffer
  bool channels_first_ = channels_first.value_or(true);
  auto tensor = convert_to_tensor(
      /*buffer=*/out_buffer.data(),
      /*num_samples=*/out_buffer.size(),
      /*num_channels=*/chain.getOutputNumChannels(),
      dtype,
      normalize.value_or(true),
      channels_first_);

  return c10::make_intrusive<TensorSignal>(
      tensor, chain.getOutputSampleRate(), channels_first_);
}

138
139
140
141
142
143
144
145
#ifdef TORCH_API_INCLUDE_EXTENSION_H

/// Load audio from a Python file-like object and apply a chain of sox effects.
///
/// @param fileobj Python object whose `read` attribute is called to fetch
///        bytes.
/// @param effects Effects to add to the chain, in order. Each inner vector is
///        passed as-is to SoxEffectsChain::addEffect().
/// @param normalize Forwarded to convert_to_tensor(); treated as true when no
///        value is provided.
/// @param channels_first Layout flag for the resulting tensor; treated as
///        true when no value is provided.
/// @param format Optional file type passed to sox_open_mem_read(). When no
///        value is provided, nullptr is passed instead.
/// @return Tuple of the processed tensor and the output sample rate reported
///         by the chain.
/// @throws std::runtime_error if the initial `read` call returns more bytes
///         than the input buffer can hold.
std::tuple<torch::Tensor, int64_t> apply_effects_fileobj(
    py::object fileobj,
    std::vector<std::vector<std::string>> effects,
    c10::optional<bool>& normalize,
    c10::optional<bool>& channels_first,
    c10::optional<std::string>& format) {
  // Streaming decoding over file-like object is tricky because libsox operates
  // on FILE pointer. The following is what `sox` and `play` commands do
  //  - file input -> FILE pointer
  //  - URL input -> call wget in subprocess and pipe the data -> FILE pointer
  //  - stdin -> FILE pointer
  //
  // We want to, instead, fetch byte strings chunk by chunk, consume them, and
  // discard.
  //
  // Here is the approach
  // 1. Initialize sox_format_t using sox_open_mem_read, providing the initial
  //    chunk of byte string.
  //    This will perform header-based format detection, if necessary, then fill
  //    the metadata of sox_format_t. Internally, sox_open_mem_read uses
  //    fmemopen, which returns FILE* which points the buffer of the provided
  //    byte string.
  // 2. Each time sox reads a chunk from the FILE*, we update the underlying
  //    buffer in a way that it starts with unseen data, and append the new
  //    data read from the given fileobj. This will trick libsox as if it keeps
  //    reading from the FILE* continuously.

  // Prepare the buffer used throughout the lifecycle of SoxEffectChain.
  // Using std::string and let it manage memory.
  // 4096 is minimum size required by auto_detect_format
  // https://github.com/dmkrepo/libsox/blob/b9dd1a86e71bbd62221904e3e59dfaa9e5e72046/src/formats.c#L40-L48
  const size_t in_buffer_size = 4096;
  std::string in_buffer(in_buffer_size, 'x');
  // in_buffer is non-empty, so the address of its first element is a valid
  // writable pointer to its storage.
  auto* in_buf = &in_buffer[0];

  // Fetch the header, and copy it to the buffer.
  auto header = static_cast<std::string>(
      static_cast<py::bytes>(fileobj.attr("read")(in_buffer_size)));
  // A file-like object is expected to return at most the requested number of
  // bytes, but it is arbitrary Python code. Refuse oversized results instead
  // of writing past the end of in_buffer.
  if (header.length() > in_buffer_size) {
    throw std::runtime_error(
        "fileobj.read() returned more bytes than requested.");
  }
  memcpy(in_buf, header.data(), header.length());

  // Open file (this starts reading the header)
  SoxFormat sf(sox_open_mem_read(
      in_buf,
      in_buffer_size,
      /*signal=*/nullptr,
      /*encoding=*/nullptr,
      /*filetype=*/format.has_value() ? format.value().c_str() : nullptr));

  // In case of streamed data, length can be 0
  validate_input_file(sf, /*check_length=*/false);

  // Prepare output buffer
  std::vector<sox_sample_t> out_buffer;
  out_buffer.reserve(sf->signal.length);

  // Create and run SoxEffectsChain
  const auto dtype = get_dtype(sf->encoding.encoding, sf->signal.precision);
  torchaudio::sox_effects_chain::SoxEffectsChain chain(
      /*input_encoding=*/sf->encoding,
      /*output_encoding=*/get_encodinginfo("wav", dtype));
  chain.addInputFileObj(sf, in_buf, in_buffer_size, &fileobj);
  for (const auto& effect : effects) {
    chain.addEffect(effect);
  }
  chain.addOutputBuffer(&out_buffer);
  chain.run();

  // Create tensor from buffer
  bool channels_first_ = channels_first.value_or(true);
  auto tensor = convert_to_tensor(
      /*buffer=*/out_buffer.data(),
      /*num_samples=*/out_buffer.size(),
      /*num_channels=*/chain.getOutputNumChannels(),
      dtype,
      normalize.value_or(true),
      channels_first_);

  return std::make_tuple(
      tensor, static_cast<int64_t>(chain.getOutputSampleRate()));
}

#endif // TORCH_API_INCLUDE_EXTENSION_H

227
228
} // namespace sox_effects
} // namespace torchaudio