video.cpp 12.2 KB
Newer Older
1
#include "video.h"
2

3
4
#include <regex>

5
6
using namespace ffmpeg;

7
8
9
10
namespace vision {
namespace video {

namespace {
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34

// Maximum time (ms) a single decoder call may block before it is abandoned.
constexpr size_t decoderTimeoutMs = 600000;
// All decoded video frames are delivered as packed RGB24.
constexpr AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;

// Copies the decoded payload of `msgs` into `frame` (when the tensor is
// non-empty) and returns the element size of T in bytes.
// NOTE: despite the original comment, the return value is sizeof(T),
// not the number of bytes written.
template <typename T>
size_t fillTensorList(DecoderOutputMessage& msgs, torch::Tensor& frame) {
  const auto& msg = msgs;
  T* frameData = frame.numel() > 0 ? frame.data_ptr<T>() : nullptr;
  if (frameData) {
    auto sizeInBytes = msg.payload->length();
    // Guard against writing past the tensor's allocation: the callers size
    // the tensor from header metadata, but a mismatched payload would
    // previously overrun the buffer silently.
    TORCH_CHECK(
        sizeInBytes <= static_cast<size_t>(frame.numel()) * sizeof(T),
        "Decoded payload is larger than the destination tensor");
    memcpy(frameData, msg.payload->data(), sizeInBytes);
  }
  return sizeof(T);
}

// Video payloads are packed uint8 (RGB24) bytes.
size_t fillVideoTensor(DecoderOutputMessage& msgs, torch::Tensor& videoFrame) {
  return fillTensorList<uint8_t>(msgs, videoFrame);
}

// Audio payloads are interleaved 32-bit float samples.
size_t fillAudioTensor(DecoderOutputMessage& msgs, torch::Tensor& audioFrame) {
  return fillTensorList<float>(msgs, audioFrame);
}

35
36
// Maps a user-facing stream-type token ("video", "audio", "subtitle", "cc")
// to its ffmpeg MediaType. Returns an iterator into the static table so
// callers can read either the canonical name or the enum.
// Throws (via TORCH_CHECK) on an unknown token.
std::array<std::pair<std::string, ffmpeg::MediaType>, 4>::const_iterator
_parse_type(const std::string& stream_string) {
  static const std::array<std::pair<std::string, MediaType>, 4> types = {{
      {"video", TYPE_VIDEO},
      {"audio", TYPE_AUDIO},
      {"subtitle", TYPE_SUBTITLE},
      {"cc", TYPE_CC},
  }};
  // Capture by const reference: the original captured the string by value,
  // copying it on every lookup.
  auto it = std::find_if(
      types.begin(),
      types.end(),
      [&stream_string](const std::pair<std::string, MediaType>& p) {
        return p.first == stream_string;
      });
  if (it != types.end()) {
    return it;
  }
  TORCH_CHECK(
      false, "Expected one of [audio, video, subtitle, cc] ", stream_string);
}

// Returns the canonical type token for `stream_string` (validating it).
std::string parse_type_to_string(const std::string& stream_string) {
  return _parse_type(stream_string)->first;
}

// Returns the ffmpeg MediaType enum for `stream_string` (validating it).
MediaType parse_type_to_mt(const std::string& stream_string) {
  return _parse_type(stream_string)->second;
}

// Parses a stream spec of the form "<type>[:<index>]" (e.g. "video:0")
// into a (type-name, index) tuple. The index defaults to -1 ("automatic
// stream discovery") when not given. Throws via TORCH_CHECK on an empty
// string, a malformed spec, or an unknown type token.
std::tuple<std::string, long> _parseStream(const std::string& streamString) {
  TORCH_CHECK(!streamString.empty(), "Stream string must not be empty");
  static const std::regex regex("([a-zA-Z_]+)(?::([1-9]\\d*|0))?");
  std::smatch match;

  TORCH_CHECK(
      std::regex_match(streamString, match, regex),
      "Invalid stream string: '",
      streamString,
      "'");

  // _parse_type validates the token and returns its canonical spelling.
  // (The previous dead initialization to "video" was removed.)
  std::string type_ = parse_type_to_string(match[1].str());
  long index_ = -1;
  if (match[2].matched) {
    try {
      // std::stol matches the declared `long` type (stoi returns int and
      // could narrow / reject in-range long values).
      index_ = std::stol(match[2].str());
    } catch (const std::exception&) {
      // The regex guarantees digits, so this only fires on out-of-range.
      TORCH_CHECK(
          false,
          "Could not parse device index '",
          match[2].str(),
          "' in device string '",
          streamString,
          "'");
    }
  }
  return std::make_tuple(type_, index_);
}

96
97
} // namespace

98
99
100
101
void Video::_getDecoderParams(
    double videoStartS,
    int64_t getPtsOnly,
    std::string stream,
102
    long stream_id = -1,
103
    bool fastSeek = true,
104
    bool all_streams = false,
105
    int64_t num_threads = 1,
106
    double seekFrameMarginUs = 10) {
107
108
109
110
111
  int64_t videoStartUs = int64_t(videoStartS * 1e6);

  params.timeoutMs = decoderTimeoutMs;
  params.startOffset = videoStartUs;
  params.seekAccuracy = seekFrameMarginUs;
112
  params.fastSeek = fastSeek;
113
  params.headerOnly = false;
114
  params.numThreads = num_threads;
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160

  params.preventStaleness = false; // not sure what this is about

  if (all_streams == true) {
    MediaFormat format;
    format.stream = -2;
    format.type = TYPE_AUDIO;
    params.formats.insert(format);

    format.type = TYPE_VIDEO;
    format.stream = -2;
    format.format.video.width = 0;
    format.format.video.height = 0;
    format.format.video.cropImage = 0;
    format.format.video.format = defaultVideoPixelFormat;
    params.formats.insert(format);

    format.type = TYPE_SUBTITLE;
    format.stream = -2;
    params.formats.insert(format);

    format.type = TYPE_CC;
    format.stream = -2;
    params.formats.insert(format);
  } else {
    // parse stream type
    MediaType stream_type = parse_type_to_mt(stream);

    // TODO: reset params.formats
    std::set<MediaFormat> formats;
    params.formats = formats;
    // Define new format
    MediaFormat format;
    format.type = stream_type;
    format.stream = stream_id;
    if (stream_type == TYPE_VIDEO) {
      format.format.video.width = 0;
      format.format.video.height = 0;
      format.format.video.cropImage = 0;
      format.format.video.format = defaultVideoPixelFormat;
    }
    params.formats.insert(format);
  }

} // _get decoder params

161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
// One-shot initialization from a file path on disk.
void Video::initFromFile(
    std::string videoPath,
    std::string stream,
    int64_t numThreads) {
  TORCH_CHECK(!initialized, "Video object can only be initialized once");
  params.uri = videoPath; // file URI consumed by the decoder
  initialized = true;     // must precede _init (setCurrentStream checks it)
  _init(stream, numThreads);
}

// One-shot initialization from an in-memory buffer of encoded bytes.
void Video::initFromMemory(
    torch::Tensor videoTensor,
    std::string stream,
    int64_t numThreads) {
  TORCH_CHECK(!initialized, "Video object can only be initialized once");
  // NOTE(review): the callback holds a raw pointer into videoTensor's
  // storage — presumably the caller must keep the tensor alive for the
  // lifetime of this Video; verify against MemoryBuffer::getCallback.
  callback = MemoryBuffer::getCallback(
      videoTensor.data_ptr<uint8_t>(), videoTensor.size(0));
  initialized = true; // must precede _init (setCurrentStream checks it)
  _init(stream, numThreads);
}

void Video::_init(std::string stream, int64_t numThreads) {
183
184
  // set number of threads global
  numThreads_ = numThreads;
185
186
187
  // parse stream information
  current_stream = _parseStream(stream);
  // note that in the initial call we want to get all streams
188
  _getDecoderParams(
189
190
      0, // video start
      0, // headerOnly
191
      std::get<0>(current_stream), // stream info - remove that
192
      long(-1), // stream_id parsed from info above change to -2
193
      false, // fastseek: we're using the default param here
194
195
      true, // read all streams
      numThreads_ // global number of Threads for decoding
196
197
198
199
200
  );

  std::string logMessage, logType;

  // locals
201
  std::vector<double> audioFPS, videoFPS;
202
203
204
205
  std::vector<double> audioDuration, videoDuration, ccDuration, subsDuration;
  std::vector<double> audioTB, videoTB, ccTB, subsTB;
  c10::Dict<std::string, std::vector<double>> audioMetadata;
  c10::Dict<std::string, std::vector<double>> videoMetadata;
206
207
  c10::Dict<std::string, std::vector<double>> ccMetadata;
  c10::Dict<std::string, std::vector<double>> subsMetadata;
208

209
  // callback and metadata defined in struct
210
211
  DecoderInCallback tmp_callback = callback;
  succeeded = decoder.init(params, std::move(tmp_callback), &metadata);
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
  if (succeeded) {
    for (const auto& header : metadata) {
      double fps = double(header.fps);
      double duration = double(header.duration) * 1e-6; // * timeBase;

      if (header.format.type == TYPE_VIDEO) {
        videoFPS.push_back(fps);
        videoDuration.push_back(duration);
      } else if (header.format.type == TYPE_AUDIO) {
        audioFPS.push_back(fps);
        audioDuration.push_back(duration);
      } else if (header.format.type == TYPE_CC) {
        ccDuration.push_back(duration);
      } else if (header.format.type == TYPE_SUBTITLE) {
        subsDuration.push_back(duration);
      };
    }
  }
230
  // audio
231
232
  audioMetadata.insert("duration", audioDuration);
  audioMetadata.insert("framerate", audioFPS);
233
  // video
234
235
  videoMetadata.insert("duration", videoDuration);
  videoMetadata.insert("fps", videoFPS);
236
237
238
239
240
  // subs
  subsMetadata.insert("duration", subsDuration);
  // cc
  ccMetadata.insert("duration", ccDuration);
  // put all to a data
241
242
  streamsMetadata.insert("video", videoMetadata);
  streamsMetadata.insert("audio", audioMetadata);
243
244
  streamsMetadata.insert("subtitles", subsMetadata);
  streamsMetadata.insert("cc", ccMetadata);
245

246
  succeeded = setCurrentStream(stream);
247
  LOG(INFO) << "\nDecoder inited with: " << succeeded << "\n";
248
  if (std::get<1>(current_stream) != -1) {
249
    LOG(INFO)
250
        << "Stream index set to " << std::get<1>(current_stream)
251
252
        << ". If you encounter trouble, consider switching it to automatic stream discovery. \n";
  }
253
254
255
256
257
258
259
}

// Constructor. An empty `videoPath` defers initialization to a later
// init_from_file / init_from_memory call.
Video::Video(std::string videoPath, std::string stream, int64_t numThreads) {
  C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.video.video.Video");
  if (!videoPath.empty()) {
    initFromFile(videoPath, stream, numThreads);
  }
} // video

262
bool Video::setCurrentStream(std::string stream = "video") {
263
  TORCH_CHECK(initialized, "Video object has to be initialized first");
264
265
266
267
268
269
270
271
272
273
274
275
  if ((!stream.empty()) && (_parseStream(stream) != current_stream)) {
    current_stream = _parseStream(stream);
  }

  double ts = 0;
  if (seekTS > 0) {
    ts = seekTS;
  }

  _getDecoderParams(
      ts, // video start
      0, // headerOnly
276
277
      std::get<0>(current_stream), // stream
      long(std::get<1>(
278
          current_stream)), // stream_id parsed from info above change to -2
279
      false, // fastseek param set to 0 false by default (changed in seek)
280
281
      false, // read all streams
      numThreads_ // global number of threads
282
283
  );

284
  // callback and metadata defined in Video.h
285
286
  DecoderInCallback tmp_callback = callback;
  return (decoder.init(params, std::move(tmp_callback), &metadata));
287
288
289
}

std::tuple<std::string, int64_t> Video::getCurrentStream() const {
290
  TORCH_CHECK(initialized, "Video object has to be initialized first");
291
292
293
294
295
  return current_stream;
}

// Returns the per-stream metadata dictionary collected during _init:
// {"video": {...}, "audio": {...}, "subtitles": {...}, "cc": {...}}.
c10::Dict<std::string, c10::Dict<std::string, std::vector<double>>> Video::
    getStreamMetadata() const {
  TORCH_CHECK(initialized, "Video object has to be initialized first");
  return streamsMetadata;
}

300
void Video::Seek(double ts, bool fastSeek = false) {
301
  TORCH_CHECK(initialized, "Video object has to be initialized first");
302
303
304
305
  // initialize the class variables used for seeking and retrurn
  _getDecoderParams(
      ts, // video start
      0, // headerOnly
306
307
      std::get<0>(current_stream), // stream
      long(std::get<1>(
308
          current_stream)), // stream_id parsed from info above change to -2
309
      fastSeek, // fastseek
310
311
      false, // read all streams
      numThreads_ // global number of threads
312
313
  );

314
  // callback and metadata defined in Video.h
315
316
317
  DecoderInCallback tmp_callback = callback;
  succeeded = decoder.init(params, std::move(tmp_callback), &metadata);

318
319
320
321
  LOG(INFO) << "Decoder init at seek " << succeeded << "\n";
}

std::tuple<torch::Tensor, double> Video::Next() {
322
  TORCH_CHECK(initialized, "Video object has to be initialized first");
323
  // if failing to decode simply return a null tensor (note, should we
324
  // raise an exception?)
325
326
327
328
329
330
  double frame_pts_s;
  torch::Tensor outFrame = torch::zeros({0}, torch::kByte);

  // decode single frame
  DecoderOutputMessage out;
  int64_t res = decoder.decode(&out, decoderTimeoutMs);
331
  // if successful
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
  if (res == 0) {
    frame_pts_s = double(double(out.header.pts) * 1e-6);

    auto header = out.header;
    const auto& format = header.format;

    // initialize the output variables based on type

    if (format.type == TYPE_VIDEO) {
      // note: this can potentially be optimized
      // by having the global tensor that we fill at decode time
      // (would avoid allocations)
      int outHeight = format.format.video.height;
      int outWidth = format.format.video.width;
      int numChannels = 3;
      outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
348
      fillVideoTensor(out, outFrame);
349
350
351
352
353
354
355
356
      outFrame = outFrame.permute({2, 0, 1});

    } else if (format.type == TYPE_AUDIO) {
      int outAudioChannels = format.format.audio.channels;
      int bytesPerSample = av_get_bytes_per_sample(
          static_cast<AVSampleFormat>(format.format.audio.format));
      int frameSizeTotal = out.payload->length();

357
      TORCH_CHECK_EQ(frameSizeTotal % (outAudioChannels * bytesPerSample), 0);
358
359
360
361
362
363
      int numAudioSamples =
          frameSizeTotal / (outAudioChannels * bytesPerSample);

      outFrame =
          torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat);

364
      fillAudioTensor(out, outFrame);
365
366
367
368
    }
    // currently not supporting other formats (will do soon)

    out.payload.reset();
369
370
  } else if (res == ENODATA) {
    LOG(INFO) << "Decoder ran out of frames (ENODATA)\n";
371
372
373
374
  } else {
    LOG(ERROR) << "Decoder failed with ERROR_CODE " << res;
  }

vfdev's avatar
vfdev committed
375
  return std::make_tuple(outFrame, frame_pts_s);
376
}
377
378
379

static auto registerVideo =
    torch::class_<Video>("torchvision", "Video")
380
        .def(torch::init<std::string, std::string, int64_t>())
381
382
        .def("init_from_file", &Video::initFromFile)
        .def("init_from_memory", &Video::initFromMemory)
383
384
385
386
387
388
389
390
        .def("get_current_stream", &Video::getCurrentStream)
        .def("set_current_stream", &Video::setCurrentStream)
        .def("get_metadata", &Video::getStreamMetadata)
        .def("seek", &Video::Seek)
        .def("next", &Video::Next);

} // namespace video
} // namespace vision