#include "video.h"

#include <regex>

namespace vision {
namespace video {

namespace {

const size_t decoderTimeoutMs = 600000;
const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;

// copies the message payload into `frame`; returns the element size of T in
// bytes (note: not the total number of bytes written)
template <typename T>
size_t fillTensorList(DecoderOutputMessage& msg, torch::Tensor& frame) {
  T* frameData = frame.numel() > 0 ? frame.data_ptr<T>() : nullptr;
  if (frameData) {
    auto sizeInBytes = msg.payload->length();
    memcpy(frameData, msg.payload->data(), sizeInBytes);
  }
  return sizeof(T);
}

size_t fillVideoTensor(DecoderOutputMessage& msgs, torch::Tensor& videoFrame) {
  return fillTensorList<uint8_t>(msgs, videoFrame);
}

size_t fillAudioTensor(DecoderOutputMessage& msgs, torch::Tensor& audioFrame) {
  return fillTensorList<float>(msgs, audioFrame);
}

std::array<std::pair<std::string, ffmpeg::MediaType>, 4>::const_iterator
_parse_type(const std::string& stream_string) {
  static const std::array<std::pair<std::string, MediaType>, 4> types = {{
      {"video", TYPE_VIDEO},
      {"audio", TYPE_AUDIO},
      {"subtitle", TYPE_SUBTITLE},
      {"cc", TYPE_CC},
  }};
  auto type = std::find_if(
      types.begin(),
      types.end(),
      [stream_string](const std::pair<std::string, MediaType>& p) {
        return p.first == stream_string;
      });
  if (type != types.end()) {
    return type;
  }
  }
  TORCH_CHECK(
      false, "Expected one of [audio, video, subtitle, cc] ", stream_string);
}

std::string parse_type_to_string(const std::string& stream_string) {
  auto type = _parse_type(stream_string);
  return type->first;
}

MediaType parse_type_to_mt(const std::string& stream_string) {
  auto type = _parse_type(stream_string);
  return type->second;
}

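// Parses a stream specifier of the form "<type>[:<index>]", e.g. "video",
// "audio:1", "subtitle:0". The index defaults to -1, which requests
// automatic stream discovery.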
std::tuple<std::string, long> _parseStream(const std::string& streamString) {
  TORCH_CHECK(!streamString.empty(), "Stream string must not be empty");
  static const std::regex regex("([a-zA-Z_]+)(?::([1-9]\\d*|0))?");
  std::smatch match;

  TORCH_CHECK(
      std::regex_match(streamString, match, regex),
      "Invalid stream string: '",
      streamString,
      "'");

  std::string type_ = "video";
  type_ = parse_type_to_string(match[1].str());
  long index_ = -1;
  if (match[2].matched) {
    try {
      index_ = std::stoi(match[2].str());
    } catch (const std::exception&) {
      TORCH_CHECK(
          false,
          "Could not parse device index '",
          match[2].str(),
          "' in device string '",
          streamString,
          "'");
    }
  }
  return std::make_tuple(type_, index_);
}

} // namespace

void Video::_getDecoderParams(
    double videoStartS,
    int64_t getPtsOnly,
    std::string stream,
    long stream_id = -1,
    bool fastSeek = true,
    bool all_streams = false,
    int64_t num_threads = 1,
    double seekFrameMarginUs = 10) {
  int64_t videoStartUs = int64_t(videoStartS * 1e6);

  params.timeoutMs = decoderTimeoutMs;
  params.startOffset = videoStartUs;
  params.seekAccuracy = seekFrameMarginUs;
  params.fastSeek = fastSeek;
  params.headerOnly = false;
  params.numThreads = num_threads;

  params.preventStaleness = false; // not sure what this is about

  if (all_streams) {
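    // a stream id of -2 appears to request every stream of the given type,
    // while the single-stream branch below uses an explicit id (or -1)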
    MediaFormat format;
    format.stream = -2;
    format.type = TYPE_AUDIO;
    params.formats.insert(format);

    format.type = TYPE_VIDEO;
    format.stream = -2;
    format.format.video.width = 0;
    format.format.video.height = 0;
    format.format.video.cropImage = 0;
    format.format.video.format = defaultVideoPixelFormat;
    params.formats.insert(format);

    format.type = TYPE_SUBTITLE;
    format.stream = -2;
    params.formats.insert(format);

    format.type = TYPE_CC;
    format.stream = -2;
    params.formats.insert(format);
  } else {
    // parse stream type
    MediaType stream_type = parse_type_to_mt(stream);

    // reset any previously configured formats
    params.formats.clear();
    // Define new format
    MediaFormat format;
    format.type = stream_type;
    format.stream = stream_id;
    if (stream_type == TYPE_VIDEO) {
      format.format.video.width = 0;
      format.format.video.height = 0;
      format.format.video.cropImage = 0;
      format.format.video.format = defaultVideoPixelFormat;
    }
    params.formats.insert(format);
  }

} // _getDecoderParams

void Video::initFromFile(
    std::string videoPath,
    std::string stream,
    int64_t numThreads) {
  TORCH_CHECK(!initialized, "Video object can only be initialized once");
  initialized = true;
  params.uri = videoPath;
  _init(stream, numThreads);
}

void Video::initFromMemory(
    torch::Tensor videoTensor,
    std::string stream,
    int64_t numThreads) {
  TORCH_CHECK(!initialized, "Video object can only be initialized once");
  initialized = true;
  callback = MemoryBuffer::getCallback(
      videoTensor.data_ptr<uint8_t>(), videoTensor.size(0));
  _init(stream, numThreads);
}

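// Note: the decoder is initialized twice: first over all streams to collect
// per-stream metadata, then (via setCurrentStream at the end) over just the
// user-requested stream.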
void Video::_init(std::string stream, int64_t numThreads) {
  // set the global number of decoding threads
  numThreads_ = numThreads;
  // parse stream information
  current_stream = _parseStream(stream);
  // note: in this initial call we open all streams to collect metadata
  _getDecoderParams(
      0, // video start
      0, // headerOnly
      std::get<0>(current_stream), // stream type
      long(-1), // stream_id: auto-discover
      false, // fastSeek
      true, // read all streams
      numThreads_ // global number of threads for decoding
  );

  std::string logMessage, logType;

  // locals
  std::vector<double> audioFPS, videoFPS;
  std::vector<double> audioDuration, videoDuration, ccDuration, subsDuration;
  std::vector<double> audioTB, videoTB, ccTB, subsTB;
  c10::Dict<std::string, std::vector<double>> audioMetadata;
  c10::Dict<std::string, std::vector<double>> videoMetadata;
  c10::Dict<std::string, std::vector<double>> ccMetadata;
  c10::Dict<std::string, std::vector<double>> subsMetadata;

  // callback and metadata defined in struct
  DecoderInCallback tmp_callback = callback;
  succeeded = decoder.init(params, std::move(tmp_callback), &metadata);
  if (succeeded) {
    for (const auto& header : metadata) {
      double fps = double(header.fps);
      double duration = double(header.duration) * 1e-6; // * timeBase;

      if (header.format.type == TYPE_VIDEO) {
        videoFPS.push_back(fps);
        videoDuration.push_back(duration);
      } else if (header.format.type == TYPE_AUDIO) {
        audioFPS.push_back(fps);
        audioDuration.push_back(duration);
      } else if (header.format.type == TYPE_CC) {
        ccDuration.push_back(duration);
      } else if (header.format.type == TYPE_SUBTITLE) {
        subsDuration.push_back(duration);
      }
    }
  }
  // audio
  audioMetadata.insert("duration", audioDuration);
  audioMetadata.insert("framerate", audioFPS);
  // video
  videoMetadata.insert("duration", videoDuration);
  videoMetadata.insert("fps", videoFPS);
  // subs
  subsMetadata.insert("duration", subsDuration);
  // cc
  ccMetadata.insert("duration", ccDuration);
  // gather everything into the streams metadata dict
  streamsMetadata.insert("video", videoMetadata);
  streamsMetadata.insert("audio", audioMetadata);
  streamsMetadata.insert("subtitles", subsMetadata);
  streamsMetadata.insert("cc", ccMetadata);

  succeeded = setCurrentStream(stream);
  LOG(INFO) << "\nDecoder initialized: " << succeeded << "\n";
  if (std::get<1>(current_stream) != -1) {
    LOG(INFO)
        << "Stream index set to " << std::get<1>(current_stream)
        << ". If you encounter trouble, consider switching to automatic stream discovery.\n";
  }
}

Video::Video(std::string videoPath, std::string stream, int64_t numThreads) {
  C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.video.video.Video");
  if (!videoPath.empty()) {
    initFromFile(videoPath, stream, numThreads);
  }
} // Video

bool Video::setCurrentStream(std::string stream = "video") {
  TORCH_CHECK(initialized, "Video object has to be initialized first");
  if ((!stream.empty()) && (_parseStream(stream) != current_stream)) {
    current_stream = _parseStream(stream);
  }

  double ts = 0;
  if (seekTS > 0) {
    ts = seekTS;
  }

  _getDecoderParams(
      ts, // video start
      0, // headerOnly
      std::get<0>(current_stream), // stream type
      long(std::get<1>(current_stream)), // stream_id parsed above
      false, // fastSeek (enabled only via Seek)
      false, // read all streams
      numThreads_ // global number of threads
  );

  // callback and metadata defined in Video.h
  DecoderInCallback tmp_callback = callback;
  return decoder.init(params, std::move(tmp_callback), &metadata);
}

std::tuple<std::string, int64_t> Video::getCurrentStream() const {
  TORCH_CHECK(initialized, "Video object has to be initialized first");
  return current_stream;
}

c10::Dict<std::string, c10::Dict<std::string, std::vector<double>>> Video::
    getStreamMetadata() const {
  TORCH_CHECK(initialized, "Video object has to be initialized first");
  return streamsMetadata;
}

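// Repositions the decoder at timestamp `ts` (seconds). With fastSeek the
// decoder may stop at the nearest keyframe instead of decoding up to the
// exact frame, trading accuracy for speed.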
void Video::Seek(double ts, bool fastSeek = false) {
  TORCH_CHECK(initialized, "Video object has to be initialized first");
  // initialize the class variables used for seeking and return
  _getDecoderParams(
      ts, // video start
      0, // headerOnly
      std::get<0>(current_stream), // stream type
      long(std::get<1>(current_stream)), // stream_id parsed above
      fastSeek, // fastSeek
      false, // read all streams
      numThreads_ // global number of threads
  );

  // callback and metadata defined in Video.h
  DecoderInCallback tmp_callback = callback;
  succeeded = decoder.init(params, std::move(tmp_callback), &metadata);

  LOG(INFO) << "Decoder init at seek " << succeeded << "\n";
}

std::tuple<torch::Tensor, double> Video::Next() {
  TORCH_CHECK(initialized, "Video object has to be initialized first");
  // if decoding fails we simply return an empty tensor (note: should we
  // raise an exception instead?)
  double frame_pts_s = 0;
  torch::Tensor outFrame = torch::zeros({0}, torch::kByte);

  // decode single frame
  DecoderOutputMessage out;
  int64_t res = decoder.decode(&out, decoderTimeoutMs);
  // if successful
  if (res == 0) {
    frame_pts_s = double(out.header.pts) * 1e-6;

    auto header = out.header;
    const auto& format = header.format;

    // initialize the output variables based on type

    if (format.type == TYPE_VIDEO) {
      // note: this can potentially be optimized
      // by having the global tensor that we fill at decode time
      // (would avoid allocations)
      int outHeight = format.format.video.height;
      int outWidth = format.format.video.width;
      int numChannels = 3;
      outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
      fillVideoTensor(out, outFrame);
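      // convert from HWC to CHW layout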
      outFrame = outFrame.permute({2, 0, 1});

    } else if (format.type == TYPE_AUDIO) {
      int outAudioChannels = format.format.audio.channels;
      int bytesPerSample = av_get_bytes_per_sample(
          static_cast<AVSampleFormat>(format.format.audio.format));
      int frameSizeTotal = out.payload->length();

      TORCH_CHECK_EQ(frameSizeTotal % (outAudioChannels * bytesPerSample), 0);
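      // e.g. a 16384-byte payload of 2-channel float samples (4 bytes each)
      // gives 16384 / (2 * 4) = 2048 samples per channel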
      int numAudioSamples =
          frameSizeTotal / (outAudioChannels * bytesPerSample);

      outFrame =
          torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat);

      fillAudioTensor(out, outFrame);
    }
    // currently not supporting other formats (will do soon)

    out.payload.reset();
  } else if (res == ENODATA) {
    LOG(INFO) << "Decoder ran out of frames (ENODATA)\n";
  } else {
    LOG(ERROR) << "Decoder failed with ERROR_CODE " << res;
  }

  return std::make_tuple(outFrame, frame_pts_s);
}

static auto registerVideo =
    torch::class_<Video>("torchvision", "Video")
        .def(torch::init<std::string, std::string, int64_t>())
        .def("init_from_file", &Video::initFromFile)
        .def("init_from_memory", &Video::initFromMemory)
        .def("get_current_stream", &Video::getCurrentStream)
        .def("set_current_stream", &Video::setCurrentStream)
        .def("get_metadata", &Video::getStreamMetadata)
        .def("seek", &Video::Seek)
        .def("next", &Video::Next);

} // namespace video
} // namespace vision