video.cpp 10.9 KB
Newer Older
1
#include "video.h"
2

3
4
5
6
7
8
#include <regex>

namespace vision {
namespace video {

namespace {
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32

// Timeout in milliseconds. Used in this file both as params.timeoutMs and as
// the timeout argument of decoder.decode().
const size_t decoderTimeoutMs = 600000;
// Pixel format requested for every video stream configured in
// Video::_getDecoderParams.
const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;

// Copies the payload bytes of a decoded message into `frame`.
//
// T is the element type used to obtain the tensor's data pointer (uint8_t for
// video and float for audio in this file).
//
// The copy is clamped to the tensor's capacity (numel() * sizeof(T)), so a
// payload that is larger than the pre-allocated tensor cannot write past the
// end of its storage. Nothing is copied when the tensor is empty or when the
// message carries no payload.
//
// Returns the number of bytes actually written into `frame`.
template <typename T>
size_t fillTensorList(DecoderOutputMessage& msgs, torch::Tensor& frame) {
  if (frame.numel() <= 0 || !msgs.payload) {
    return 0;
  }

  T* frameData = frame.data_ptr<T>();
  if (!frameData) {
    return 0;
  }

  const size_t capacityBytes = static_cast<size_t>(frame.numel()) * sizeof(T);
  const size_t payloadBytes = static_cast<size_t>(msgs.payload->length());
  const size_t bytesToCopy =
      payloadBytes < capacityBytes ? payloadBytes : capacityBytes;

  memcpy(frameData, msgs.payload->data(), bytesToCopy);
  return bytesToCopy;
}

// Video flavour of fillTensorList: the destination tensor is accessed as
// uint8_t elements. The value returned by fillTensorList is passed through
// unchanged.
size_t fillVideoTensor(DecoderOutputMessage& msgs, torch::Tensor& videoFrame) {
  const size_t result = fillTensorList<uint8_t>(msgs, videoFrame);
  return result;
}

// Audio flavour of fillTensorList: the destination tensor is accessed as
// float elements. The value returned by fillTensorList is passed through
// unchanged.
size_t fillAudioTensor(DecoderOutputMessage& msgs, torch::Tensor& audioFrame) {
  const size_t result = fillTensorList<float>(msgs, audioFrame);
  return result;
}

33
34
// Looks up `stream_string` in the table of supported stream type names.
//
// Returns an iterator to the matching (name, MediaType) entry. The table is a
// function-local static, so the iterator remains valid after the call.
// Raises through TORCH_CHECK when the name is not one of "video", "audio",
// "subtitle" or "cc" (the comparison is exact and case-sensitive).
std::array<std::pair<std::string, ffmpeg::MediaType>, 4>::const_iterator
_parse_type(const std::string& stream_string) {
  static const std::array<std::pair<std::string, MediaType>, 4> types = {{
      {"video", TYPE_VIDEO},
      {"audio", TYPE_AUDIO},
      {"subtitle", TYPE_SUBTITLE},
      {"cc", TYPE_CC},
  }};
  // The predicate is only used inside this call, so the string is captured by
  // reference instead of being copied into the closure.
  const auto entry = std::find_if(
      types.begin(),
      types.end(),
      [&stream_string](const std::pair<std::string, MediaType>& p) {
        return p.first == stream_string;
      });
  TORCH_CHECK(
      entry != types.end(),
      "Expected one of [audio, video, subtitle, cc], got '",
      stream_string,
      "'");
  return entry;
}

// Returns the table name of the stream type identified by `stream_string`.
// Lookup and validation are delegated to _parse_type.
std::string parse_type_to_string(const std::string& stream_string) {
  return _parse_type(stream_string)->first;
}

// Returns the MediaType value of the stream type identified by
// `stream_string`. Lookup and validation are delegated to _parse_type.
MediaType parse_type_to_mt(const std::string& stream_string) {
  return _parse_type(stream_string)->second;
}

// Splits a stream specifier of the form "<type>" or "<type>:<index>"
// (for example "video" or "audio:1") into its type name and stream index.
//
// Returns (type, index); index is -1 when the specifier has no ":<index>"
// part. Raises through TORCH_CHECK when the string is empty, does not match
// the expected pattern, names a type rejected by parse_type_to_string, or
// carries an index that cannot be converted to an integer.
std::tuple<std::string, long> _parseStream(const std::string& streamString) {
  TORCH_CHECK(!streamString.empty(), "Stream string must not be empty");
  // Group 1: letters/underscores (the type). Optional group 2: a non-negative
  // decimal integer without leading zeros (the index).
  static const std::regex regex("([a-zA-Z_]+)(?::([1-9]\\d*|0))?");
  std::smatch match;

  TORCH_CHECK(
      std::regex_match(streamString, match, regex),
      "Invalid stream string: '",
      streamString,
      "'");

  const std::string type_ = parse_type_to_string(match[1].str());
  long index_ = -1;
  if (match[2].matched) {
    try {
      index_ = c10::stoi(match[2].str());
    } catch (const std::exception&) {
      // The regex only admits digits, so a failure here is a conversion
      // problem such as a value that does not fit the integer type.
      TORCH_CHECK(
          false,
          "Could not parse stream index '",
          match[2].str(),
          "' in stream string '",
          streamString,
          "'");
    }
  }
  return std::make_tuple(type_, index_);
}

94
95
} // namespace

96
97
98
99
void Video::_getDecoderParams(
    double videoStartS,
    int64_t getPtsOnly,
    std::string stream,
100
    long stream_id = -1,
101
    bool fastSeek = true,
102
    bool all_streams = false,
103
    int64_t num_threads = 1,
104
    double seekFrameMarginUs = 10) {
105
106
107
108
109
  int64_t videoStartUs = int64_t(videoStartS * 1e6);

  params.timeoutMs = decoderTimeoutMs;
  params.startOffset = videoStartUs;
  params.seekAccuracy = seekFrameMarginUs;
110
  params.fastSeek = fastSeek;
111
  params.headerOnly = false;
112
  params.numThreads = num_threads;
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158

  params.preventStaleness = false; // not sure what this is about

  if (all_streams == true) {
    MediaFormat format;
    format.stream = -2;
    format.type = TYPE_AUDIO;
    params.formats.insert(format);

    format.type = TYPE_VIDEO;
    format.stream = -2;
    format.format.video.width = 0;
    format.format.video.height = 0;
    format.format.video.cropImage = 0;
    format.format.video.format = defaultVideoPixelFormat;
    params.formats.insert(format);

    format.type = TYPE_SUBTITLE;
    format.stream = -2;
    params.formats.insert(format);

    format.type = TYPE_CC;
    format.stream = -2;
    params.formats.insert(format);
  } else {
    // parse stream type
    MediaType stream_type = parse_type_to_mt(stream);

    // TODO: reset params.formats
    std::set<MediaFormat> formats;
    params.formats = formats;
    // Define new format
    MediaFormat format;
    format.type = stream_type;
    format.stream = stream_id;
    if (stream_type == TYPE_VIDEO) {
      format.format.video.width = 0;
      format.format.video.height = 0;
      format.format.video.cropImage = 0;
      format.format.video.format = defaultVideoPixelFormat;
    }
    params.formats.insert(format);
  }

} // _get decoder params

159
// Constructs a Video for the file at `videoPath`.
//
//   videoPath   stored in params.uri and handed to the decoder
//   stream      stream specifier parsed by _parseStream (e.g. "video" or
//               "audio:1"); becomes the current stream
//   numThreads  stored in numThreads_ and forwarded as the decoder thread
//               count
//
// The decoder is first initialised with all stream types requested so that
// the headers it reports can be collected into `streamsMetadata` (keys
// "video", "audio", "subtitles" and "cc"). It is then re-initialised for the
// requested stream through setCurrentStream; `succeeded` holds the result of
// that second initialisation.
Video::Video(std::string videoPath, std::string stream, int64_t numThreads) {
  C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.video.video.Video");
  // set number of threads global
  numThreads_ = numThreads;
  // parse stream information
  current_stream = _parseStream(stream);
  // note that in the initial call we want to get all streams
  Video::_getDecoderParams(
      0, // video start
      0, // headerOnly
      std::get<0>(current_stream), // stream info - remove that
      long(-1), // stream_id parsed from info above change to -2
      false, // fastSeek explicitly disabled for the initial probe
      true, // read all streams
      numThreads_ // global number of Threads for decoding
  );

  // TODO: add read from memory option
  params.uri = videoPath;

  // Per-type accumulators, one entry per stream header reported below.
  std::vector<double> audioFPS, videoFPS;
  std::vector<double> audioDuration, videoDuration, ccDuration, subsDuration;
  c10::Dict<std::string, std::vector<double>> audioMetadata;
  c10::Dict<std::string, std::vector<double>> videoMetadata;
  c10::Dict<std::string, std::vector<double>> ccMetadata;
  c10::Dict<std::string, std::vector<double>> subsMetadata;

  // callback and metadata defined in struct
  succeeded = decoder.init(params, std::move(callback), &metadata);
  if (succeeded) {
    for (const auto& header : metadata) {
      double fps = double(header.fps);
      double duration = double(header.duration) * 1e-6; // * timeBase;

      if (header.format.type == TYPE_VIDEO) {
        videoFPS.push_back(fps);
        videoDuration.push_back(duration);
      } else if (header.format.type == TYPE_AUDIO) {
        audioFPS.push_back(fps);
        audioDuration.push_back(duration);
      } else if (header.format.type == TYPE_CC) {
        ccDuration.push_back(duration);
      } else if (header.format.type == TYPE_SUBTITLE) {
        subsDuration.push_back(duration);
      }
    }
  }
  // When the initial init fails the vectors stay empty, so every metadata
  // entry below is still present but holds no values.
  // audio
  audioMetadata.insert("duration", audioDuration);
  audioMetadata.insert("framerate", audioFPS);
  // video
  videoMetadata.insert("duration", videoDuration);
  videoMetadata.insert("fps", videoFPS);
  // subs
  subsMetadata.insert("duration", subsDuration);
  // cc
  ccMetadata.insert("duration", ccDuration);
  // put all to a data
  streamsMetadata.insert("video", videoMetadata);
  streamsMetadata.insert("audio", audioMetadata);
  streamsMetadata.insert("subtitles", subsMetadata);
  streamsMetadata.insert("cc", ccMetadata);

  succeeded = Video::setCurrentStream(stream);
  LOG(INFO) << "\nDecoder inited with: " << succeeded << "\n";
  if (std::get<1>(current_stream) != -1) {
    LOG(INFO)
        << "Stream index set to " << std::get<1>(current_stream)
        << ". If you encounter trouble, consider switching it to automatic stream discovery. \n";
  }
} // video

237
bool Video::setCurrentStream(std::string stream = "video") {
238
239
240
241
242
243
244
245
246
247
248
249
  if ((!stream.empty()) && (_parseStream(stream) != current_stream)) {
    current_stream = _parseStream(stream);
  }

  double ts = 0;
  if (seekTS > 0) {
    ts = seekTS;
  }

  _getDecoderParams(
      ts, // video start
      0, // headerOnly
250
251
      std::get<0>(current_stream), // stream
      long(std::get<1>(
252
          current_stream)), // stream_id parsed from info above change to -2
253
      false, // fastseek param set to 0 false by default (changed in seek)
254
255
      false, // read all streams
      numThreads_ // global number of threads
256
257
  );

258
  // callback and metadata defined in Video.h
259
260
261
262
263
264
265
266
267
268
269
270
  return (decoder.init(params, std::move(callback), &metadata));
}

// Returns the currently selected stream as (type name, stream index), taken
// from the `current_stream` member.
std::tuple<std::string, int64_t> Video::getCurrentStream() const {
  return std::make_tuple(
      std::get<0>(current_stream),
      static_cast<int64_t>(std::get<1>(current_stream)));
}

// Returns the `streamsMetadata` member: a dictionary keyed by stream kind
// whose values map a property name to a list of doubles.
c10::Dict<std::string, c10::Dict<std::string, std::vector<double>>> Video::
    getStreamMetadata() const {
  using PerStreamKind = c10::Dict<std::string, std::vector<double>>;
  const c10::Dict<std::string, PerStreamKind>& collected = streamsMetadata;
  return collected;
}

271
void Video::Seek(double ts, bool fastSeek = false) {
272
273
274
275
  // initialize the class variables used for seeking and retrurn
  _getDecoderParams(
      ts, // video start
      0, // headerOnly
276
277
      std::get<0>(current_stream), // stream
      long(std::get<1>(
278
          current_stream)), // stream_id parsed from info above change to -2
279
      fastSeek, // fastseek
280
281
      false, // read all streams
      numThreads_ // global number of threads
282
283
  );

284
  // callback and metadata defined in Video.h
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
  succeeded = decoder.init(params, std::move(callback), &metadata);
  LOG(INFO) << "Decoder init at seek " << succeeded << "\n";
}

std::tuple<torch::Tensor, double> Video::Next() {
  // if failing to decode simply return a null tensor (note, should we
  // raise an exeption?)
  double frame_pts_s;
  torch::Tensor outFrame = torch::zeros({0}, torch::kByte);

  // decode single frame
  DecoderOutputMessage out;
  int64_t res = decoder.decode(&out, decoderTimeoutMs);
  // if successfull
  if (res == 0) {
    frame_pts_s = double(double(out.header.pts) * 1e-6);

    auto header = out.header;
    const auto& format = header.format;

    // initialize the output variables based on type

    if (format.type == TYPE_VIDEO) {
      // note: this can potentially be optimized
      // by having the global tensor that we fill at decode time
      // (would avoid allocations)
      int outHeight = format.format.video.height;
      int outWidth = format.format.video.width;
      int numChannels = 3;
      outFrame = torch::zeros({outHeight, outWidth, numChannels}, torch::kByte);
315
      fillVideoTensor(out, outFrame);
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
      outFrame = outFrame.permute({2, 0, 1});

    } else if (format.type == TYPE_AUDIO) {
      int outAudioChannels = format.format.audio.channels;
      int bytesPerSample = av_get_bytes_per_sample(
          static_cast<AVSampleFormat>(format.format.audio.format));
      int frameSizeTotal = out.payload->length();

      CHECK_EQ(frameSizeTotal % (outAudioChannels * bytesPerSample), 0);
      int numAudioSamples =
          frameSizeTotal / (outAudioChannels * bytesPerSample);

      outFrame =
          torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat);

331
      fillAudioTensor(out, outFrame);
332
333
334
335
    }
    // currently not supporting other formats (will do soon)

    out.payload.reset();
336
337
  } else if (res == ENODATA) {
    LOG(INFO) << "Decoder ran out of frames (ENODATA)\n";
338
339
340
341
  } else {
    LOG(ERROR) << "Decoder failed with ERROR_CODE " << res;
  }

vfdev's avatar
vfdev committed
342
  return std::make_tuple(outFrame, frame_pts_s);
343
}
344
345
346

static auto registerVideo =
    torch::class_<Video>("torchvision", "Video")
347
        .def(torch::init<std::string, std::string, int64_t>())
348
349
350
351
352
353
354
355
        .def("get_current_stream", &Video::getCurrentStream)
        .def("set_current_stream", &Video::setCurrentStream)
        .def("get_metadata", &Video::getStreamMetadata)
        .def("seek", &Video::Seek)
        .def("next", &Video::Next);

} // namespace video
} // namespace vision