Commit 31fad34f authored by Zhicheng Yan's avatar Zhicheng Yan Committed by Francisco Massa

[video reader] inception commit (#1303)

* [video reader] inception commit

* add method save_metadata to class VideoClips in video_utils.py

* add load_metadata() method to VideoClips class

* add Exception to not catch unexpected events such as memory errors, interrupts

* fix bugs in video_plus.py

* [video reader] remove logging. update setup.py

* remove time measurement in test_video_reader.py

* Remove glog and try making ffmpeg finding more robust

* Add ffmpeg to conda build

* Add ffmpeg to conda build [again]

* Make library path finding more robust

* Missing import

* One more missing fix for import

* Py2 compatibility and change package to av to avoid version conflict with ffmpeg

* Fix for python2

* [video reader] support to decode one stream only (e.g. video/audio stream)

* remove argument _precomputed_metadata_filepath

* remove save_metadata method

* add get_metadata method

* expose _precomputed_metadata and frame_rate arguments in video dataset __init__ method

* remove ssize_t

* remove size_t to pass CI check on Windows

* add PyInit__video_reader function to pass CI check on Windows

* minor fix to define PyInit_video_reader symbol

* Make c++ video reader optional

* Temporarily revert changes to test_io

* Revert changes to python files

* Rename files to make it private

* Fix python lint

* Fix C++ lint

* add a functor object EnumClassHash to make Enum class instances usable as key type of std::unordered_map

* fix cpp format check
parent a6a926bc
#include "FfmpegStream.h"
#include "FfmpegUtil.h"
using namespace std;
// TODO: the use of refCount is currently disabled
static int refCount = 0;
FfmpegStream::FfmpegStream(
AVFormatContext* inputCtx,
int index,
enum AVMediaType avMediaType,
double seekFrameMargin)
: inputCtx_(inputCtx),
index_(index),
avMediaType_(avMediaType),
seekFrameMargin_(seekFrameMargin) {}
FfmpegStream::~FfmpegStream() {
if (frame_) {
av_frame_free(&frame_);
}
avcodec_free_context(&codecCtx_);
}
int FfmpegStream::openCodecContext() {
VLOG(2) << "stream start_time: " << inputCtx_->streams[index_]->start_time;
auto typeString = av_get_media_type_string(avMediaType_);
AVStream* st = inputCtx_->streams[index_];
auto codec_id = st->codecpar->codec_id;
VLOG(1) << "codec_id: " << codec_id;
AVCodec* codec = avcodec_find_decoder(codec_id);
if (!codec) {
LOG(ERROR) << "avcodec_find_decoder failed for codec_id: " << int(codec_id);
return AVERROR(EINVAL);
}
VLOG(1) << "Succeed to find decoder";
codecCtx_ = avcodec_alloc_context3(codec);
if (!codecCtx_) {
LOG(ERROR) << "avcodec_alloc_context3 fails";
return AVERROR(ENOMEM);
}
int ret;
/* Copy codec parameters from input stream to output codec context */
if ((ret = avcodec_parameters_to_context(codecCtx_, st->codecpar)) < 0) {
LOG(ERROR) << "Failed to copy " << typeString
<< " codec parameters to decoder context";
return ret;
}
AVDictionary* opts = nullptr;
av_dict_set(&opts, "refcounted_frames", refCount ? "1" : "0", 0);
// after avcodec_open2, value of codecCtx_->time_base is NOT meaningful
// But inputCtx_->streams[index_]->time_base has meaningful values
if ((ret = avcodec_open2(codecCtx_, codec, &opts)) < 0) {
LOG(ERROR) << "avcodec_open2 failed. " << ffmpeg_util::getErrorDesc(ret);
return ret;
}
VLOG(1) << "Succeed to open codec";
frame_ = av_frame_alloc();
return initFormat();
}
unique_ptr<DecodedFrame> FfmpegStream::getFrameData(int getPtsOnly) {
if (!codecCtx_) {
LOG(ERROR) << "Codec is not initialized";
return nullptr;
}
if (getPtsOnly) {
unique_ptr<DecodedFrame> decodedFrame = make_unique<DecodedFrame>();
decodedFrame->pts_ = frame_->pts;
return decodedFrame;
} else {
unique_ptr<DecodedFrame> decodedFrame = sampleFrameData();
if (decodedFrame) {
decodedFrame->pts_ = frame_->pts;
}
return decodedFrame;
}
}
void FfmpegStream::flush(int getPtsOnly, DecoderOutput& decoderOutput) {
VLOG(1) << "Media Type: " << getMediaType() << ", flush stream.";
// need to receive frames before entering draining mode
receiveAvailFrames(getPtsOnly, decoderOutput);
VLOG(2) << "send nullptr packet";
sendPacket(nullptr);
// receive remaining frames after entering draining mode
receiveAvailFrames(getPtsOnly, decoderOutput);
avcodec_flush_buffers(codecCtx_);
}
bool FfmpegStream::isFramePtsInRange() {
CHECK(frame_);
auto pts = frame_->pts;
auto startPts = this->getStartPts();
auto endPts = this->getEndPts();
VLOG(2) << "isPtsInRange. pts: " << pts << ", startPts: " << startPts
<< ", endPts: " << endPts;
return (pts == AV_NOPTS_VALUE) ||
(pts >= startPts && (endPts >= 0 ? pts <= endPts : true));
}
bool FfmpegStream::isFramePtsExceedRange() {
if (frame_) {
auto endPts = this->getEndPts();
VLOG(2) << "isFramePtsExceedRange. last_pts_: " << last_pts_
<< ", endPts: " << endPts;
return endPts >= 0 ? last_pts_ >= endPts : false;
} else {
return true;
}
}
// seek a frame
int FfmpegStream::seekFrame(int64_t seekPts) {
// translate margin from second to pts
int64_t margin = (int64_t)(
seekFrameMargin_ * (double)inputCtx_->streams[index_]->time_base.den /
(double)inputCtx_->streams[index_]->time_base.num);
int64_t real_seekPts = (seekPts - margin) > 0 ? (seekPts - margin) : 0;
VLOG(2) << "seek margin: " << margin;
VLOG(2) << "real seekPts: " << real_seekPts;
int ret = av_seek_frame(
inputCtx_,
index_,
(seekPts - margin) > 0 ? (seekPts - margin) : 0,
AVSEEK_FLAG_BACKWARD);
if (ret < 0) {
LOG(WARNING) << "av_seek_frame fails. Stream index: " << index_;
return ret;
}
return 0;
}
// send/receive encoding and decoding API overview
// https://ffmpeg.org/doxygen/3.4/group__lavc__encdec.html
int FfmpegStream::sendPacket(const AVPacket* packet) {
return avcodec_send_packet(codecCtx_, packet);
}
int FfmpegStream::receiveFrame() {
int ret = avcodec_receive_frame(codecCtx_, frame_);
if (ret >= 0) {
// succeed
frame_->pts = av_frame_get_best_effort_timestamp(frame_);
if (frame_->pts == AV_NOPTS_VALUE) {
// Trick: if we cannot figure out the pts, set it to (last_pts_ + 1)
frame_->pts = last_pts_ + 1;
}
last_pts_ = frame_->pts;
VLOG(2) << "avcodec_receive_frame succeed";
} else if (ret == AVERROR(EAGAIN)) {
VLOG(2) << "avcodec_receive_frame fails and returns AVERROR(EAGAIN). ";
} else if (ret == AVERROR_EOF) {
// no more frame to read
VLOG(2) << "avcodec_receive_frame returns AVERROR_EOF";
} else {
LOG(WARNING) << "avcodec_receive_frame failed. Error: "
<< ffmpeg_util::getErrorDesc(ret);
}
return ret;
}
void FfmpegStream::receiveAvailFrames(
int getPtsOnly,
DecoderOutput& decoderOutput) {
int result = 0;
while ((result = receiveFrame()) >= 0) {
unique_ptr<DecodedFrame> decodedFrame = getFrameData(getPtsOnly);
if (decodedFrame &&
((!getPtsOnly && decodedFrame->frameSize_ > 0) || getPtsOnly)) {
if (isFramePtsInRange()) {
decoderOutput.addMediaFrame(getMediaType(), std::move(decodedFrame));
}
} // end-if
} // end-while
}
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#pragma once
#include <memory>
#include <unordered_map>
#include <utility>
#include "FfmpegHeaders.h"
#include "Interface.h"
/*
Class uses FFMPEG library to decode one media stream (audio or video).
*/
class FfmpegStream {
public:
FfmpegStream(
AVFormatContext* inputCtx,
int index,
enum AVMediaType avMediaType,
double seekFrameMargin);
virtual ~FfmpegStream();
// returns 0 on success or a negative error code
int openCodecContext();
// returns stream index
int getIndex() const {
return index_;
}
// returns the decoded frame; when getPtsOnly is set, only its pts is filled in
std::unique_ptr<DecodedFrame> getFrameData(int getPtsOnly);
// flush the stream at the end of decoding:
// drain the remaining frames from the decoder into decoderOutput
void flush(int getPtsOnly, DecoderOutput& decoderOutput);
// seek a frame
int seekFrame(int64_t ts);
// send an AVPacket
int sendPacket(const AVPacket* packet);
// receive AVFrame
int receiveFrame();
// receive all available frames from the internal buffer
void receiveAvailFrames(int getPtsOnly, DecoderOutput& decoderOutput);
// return media type
virtual MediaType getMediaType() const = 0;
// return media format
virtual FormatUnion getMediaFormat() const = 0;
// return start presentation timestamp
virtual int64_t getStartPts() const = 0;
// return end presentation timestamp
virtual int64_t getEndPts() const = 0;
// is the pts of most recent frame within range?
bool isFramePtsInRange();
// does the pts of most recent frame exceed range?
bool isFramePtsExceedRange();
protected:
virtual int initFormat() = 0;
// returns a decoded frame
virtual std::unique_ptr<DecodedFrame> sampleFrameData() = 0;
protected:
AVFormatContext* const inputCtx_;
const int index_;
enum AVMediaType avMediaType_;
AVCodecContext* codecCtx_{nullptr};
AVFrame* frame_{nullptr};
// pts of last decoded frame
int64_t last_pts_{0};
double seekFrameMargin_{1.0};
};
#include "FfmpegUtil.h"
using namespace std;
namespace ffmpeg_util {
bool mapFfmpegType(AVMediaType media, MediaType* type) {
switch (media) {
case AVMEDIA_TYPE_VIDEO:
*type = MediaType::TYPE_VIDEO;
return true;
case AVMEDIA_TYPE_AUDIO:
*type = MediaType::TYPE_AUDIO;
return true;
default:
return false;
}
}
bool mapMediaType(MediaType type, AVMediaType* media) {
switch (type) {
case MediaType::TYPE_VIDEO:
*media = AVMEDIA_TYPE_VIDEO;
return true;
case MediaType::TYPE_AUDIO:
*media = AVMEDIA_TYPE_AUDIO;
return true;
default:
return false;
}
}
void setFormatDimensions(
int& destW,
int& destH,
int userW,
int userH,
int srcW,
int srcH,
int minDimension) {
// Rounding rule: convert the intermediate value to double and round to the
// nearest integer (fractions >= 0.5 round up, fractions < 0.5 round down).
if (userW == 0 && userH == 0) {
if (minDimension > 0) { // #2
if (srcW > srcH) {
// landscape
destH = minDimension;
destW = round(double(srcW * minDimension) / srcH);
} else {
// portrait
destW = minDimension;
destH = round(double(srcH * minDimension) / srcW);
}
} else { // #1
destW = srcW;
destH = srcH;
}
} else if (userW != 0 && userH == 0) { // #3
destW = userW;
destH = round(double(srcH * userW) / srcW);
} else if (userW == 0 && userH != 0) { // #4
destW = round(double(srcW * userH) / srcH);
destH = userH;
} else {
// userW != 0 && userH != 0. #5
destW = userW;
destH = userH;
}
// prevent zeros
destW = std::max(destW, 1);
destH = std::max(destH, 1);
}
bool validateVideoFormat(const VideoFormat& f) {
/*
Valid parameters values for decoder
___________________________________________________
| W | H | minDimension | algorithm |
|_________________________________________________|
| 0 | 0 | 0 | original |
|_________________________________________________|
| 0 | 0 | >0 |scale to min dimension|
|_________________________________________________|
| >0 | 0 | 0 | scale keeping W |
|_________________________________________________|
| 0 | >0 | 0 | scale keeping H |
|_________________________________________________|
| >0 | >0 | 0 | stretch/scale |
|_________________________________________________|
*/
return (f.width == 0 && f.height == 0) || // #1 and #2
(f.width != 0 && f.height != 0 && f.minDimension == 0) || // # 5
(((f.width != 0 && f.height == 0) || // #3 and #4
(f.width == 0 && f.height != 0)) &&
f.minDimension == 0);
}
string getErrorDesc(int errnum) {
array<char, 1024> buffer;
if (av_strerror(errnum, buffer.data(), buffer.size()) < 0) {
return string("Unknown error code");
}
buffer.back() = 0;
return string(buffer.data());
}
} // namespace ffmpeg_util
#pragma once
#include <array>
#include <string>
#include "FfmpegHeaders.h"
#include "Interface.h"
namespace ffmpeg_util {
bool mapFfmpegType(AVMediaType media, enum MediaType* type);
bool mapMediaType(MediaType type, enum AVMediaType* media);
void setFormatDimensions(
int& destW,
int& destH,
int userW,
int userH,
int srcW,
int srcH,
int minDimension);
bool validateVideoFormat(const VideoFormat& f);
std::string getErrorDesc(int errnum);
} // namespace ffmpeg_util
#include "FfmpegVideoSampler.h"
#include "FfmpegUtil.h"
using namespace std;
FfmpegVideoSampler::FfmpegVideoSampler(
const VideoFormat& in,
const VideoFormat& out,
int swsFlags)
: inFormat_(in), outFormat_(out), swsFlags_(swsFlags) {}
FfmpegVideoSampler::~FfmpegVideoSampler() {
if (scaleContext_) {
sws_freeContext(scaleContext_);
scaleContext_ = nullptr;
}
}
int FfmpegVideoSampler::init() {
VLOG(1) << "Input format: width " << inFormat_.width << ", height "
<< inFormat_.height << ", format " << inFormat_.format
<< ", minDimension " << inFormat_.minDimension;
VLOG(1) << "Scale format: width " << outFormat_.width << ", height "
<< outFormat_.height << ", format " << outFormat_.format
<< ", minDimension " << outFormat_.minDimension;
scaleContext_ = sws_getContext(
inFormat_.width,
inFormat_.height,
(AVPixelFormat)inFormat_.format,
outFormat_.width,
outFormat_.height,
static_cast<AVPixelFormat>(outFormat_.format),
swsFlags_,
nullptr,
nullptr,
nullptr);
if (scaleContext_) {
return 0;
} else {
return -1;
}
}
int32_t FfmpegVideoSampler::getImageBytes() const {
return av_image_get_buffer_size(
(AVPixelFormat)outFormat_.format, outFormat_.width, outFormat_.height, 1);
}
// https://ffmpeg.org/doxygen/3.4/scaling_video_8c-example.html#a10
unique_ptr<DecodedFrame> FfmpegVideoSampler::sample(const AVFrame* frame) {
if (!frame) {
return nullptr; // no flush for videos
}
// scaled and cropped image
auto outImageSize = getImageBytes();
AvDataPtr frameData(static_cast<uint8_t*>(av_malloc(outImageSize)));
uint8_t* scalePlanes[4] = {nullptr};
int scaleLines[4] = {0};
int result;
if ((result = av_image_fill_arrays(
scalePlanes,
scaleLines,
frameData.get(),
static_cast<AVPixelFormat>(outFormat_.format),
outFormat_.width,
outFormat_.height,
1)) < 0) {
LOG(ERROR) << "av_image_fill_arrays failed, err: "
<< ffmpeg_util::getErrorDesc(result);
return nullptr;
}
if ((result = sws_scale(
scaleContext_,
frame->data,
frame->linesize,
0,
inFormat_.height,
scalePlanes,
scaleLines)) < 0) {
LOG(ERROR) << "sws_scale failed, err: "
<< ffmpeg_util::getErrorDesc(result);
return nullptr;
}
return make_unique<DecodedFrame>(std::move(frameData), outImageSize, 0);
}
#pragma once
#include "FfmpegSampler.h"
/**
* Class that rescales/converts video frames from one format into another
*/
class FfmpegVideoSampler : public FfmpegSampler {
public:
explicit FfmpegVideoSampler(
const VideoFormat& in,
const VideoFormat& out,
int swsFlags = SWS_AREA);
~FfmpegVideoSampler() override;
int init() override;
int32_t getImageBytes() const;
// returns the sampled (rescaled) frame, or nullptr on failure
std::unique_ptr<DecodedFrame> sample(const AVFrame* frame) override;
const VideoFormat& getInFormat() const {
return inFormat_;
}
private:
VideoFormat inFormat_;
VideoFormat outFormat_;
int swsFlags_;
SwsContext* scaleContext_{nullptr};
};
#include "FfmpegVideoStream.h"
#include "FfmpegUtil.h"
using namespace std;
namespace {
bool operator==(const VideoFormat& x, const AVFrame& y) {
return x.width == y.width && x.height == y.height &&
x.format == static_cast<AVPixelFormat>(y.format);
}
VideoFormat toVideoFormat(const AVFrame& frame) {
VideoFormat videoFormat;
videoFormat.width = frame.width;
videoFormat.height = frame.height;
videoFormat.format = static_cast<AVPixelFormat>(frame.format);
return videoFormat;
}
} // namespace
FfmpegVideoStream::FfmpegVideoStream(
AVFormatContext* inputCtx,
int index,
enum AVMediaType avMediaType,
MediaFormat mediaFormat,
double seekFrameMargin)
: FfmpegStream(inputCtx, index, avMediaType, seekFrameMargin),
mediaFormat_(mediaFormat) {}
FfmpegVideoStream::~FfmpegVideoStream() {}
void FfmpegVideoStream::checkStreamDecodeParams() {
auto timeBase = getTimeBase();
if (timeBase.first > 0) {
CHECK_EQ(timeBase.first, inputCtx_->streams[index_]->time_base.num);
CHECK_EQ(timeBase.second, inputCtx_->streams[index_]->time_base.den);
}
}
void FfmpegVideoStream::updateStreamDecodeParams() {
auto timeBase = getTimeBase();
if (timeBase.first == 0) {
mediaFormat_.format.video.timeBaseNum =
inputCtx_->streams[index_]->time_base.num;
mediaFormat_.format.video.timeBaseDen =
inputCtx_->streams[index_]->time_base.den;
}
}
int FfmpegVideoStream::initFormat() {
// set output format
VideoFormat& format = mediaFormat_.format.video;
if (!ffmpeg_util::validateVideoFormat(format)) {
LOG(ERROR) << "Invalid video format";
return -1;
}
format.fps = av_q2d(
av_guess_frame_rate(inputCtx_, inputCtx_->streams[index_], nullptr));
// keep aspect ratio
ffmpeg_util::setFormatDimensions(
format.width,
format.height,
format.width,
format.height,
codecCtx_->width,
codecCtx_->height,
format.minDimension);
VLOG(1) << "After adjusting, video format"
<< ", width: " << format.width << ", height: " << format.height
<< ", format: " << format.format
<< ", minDimension: " << format.minDimension;
if (format.format == AV_PIX_FMT_NONE) {
format.format = codecCtx_->pix_fmt;
VLOG(1) << "Set pixel format: " << format.format;
}
checkStreamDecodeParams();
updateStreamDecodeParams();
return format.width != 0 && format.height != 0 &&
format.format != AV_PIX_FMT_NONE
? 0
: -1;
}
unique_ptr<DecodedFrame> FfmpegVideoStream::sampleFrameData() {
VideoFormat& format = mediaFormat_.format.video;
if (!sampler_ || !(sampler_->getInFormat() == *frame_)) {
VideoFormat newInFormat = toVideoFormat(*frame_);
sampler_ = make_unique<FfmpegVideoSampler>(newInFormat, format, SWS_AREA);
VLOG(1) << "Set input video sampler format"
<< ", width: " << newInFormat.width
<< ", height: " << newInFormat.height
<< ", format: " << newInFormat.format
<< " : output video sampler format"
<< ", width: " << format.width << ", height: " << format.height
<< ", format: " << format.format
<< ", minDimension: " << format.minDimension;
int ret = sampler_->init();
if (ret < 0) {
VLOG(1) << "Fail to initialize video sampler";
return nullptr;
}
}
return sampler_->sample(frame_);
}
#pragma once
#include <utility>
#include "FfmpegStream.h"
#include "FfmpegVideoSampler.h"
/**
* Class uses FFMPEG library to decode one video stream.
*/
class FfmpegVideoStream : public FfmpegStream {
public:
explicit FfmpegVideoStream(
AVFormatContext* inputCtx,
int index,
enum AVMediaType avMediaType,
MediaFormat mediaFormat,
double seekFrameMargin);
~FfmpegVideoStream() override;
// FfmpegStream overrides
MediaType getMediaType() const override {
return MediaType::TYPE_VIDEO;
}
FormatUnion getMediaFormat() const override {
return mediaFormat_.format;
}
int64_t getStartPts() const override {
return mediaFormat_.format.video.startPts;
}
int64_t getEndPts() const override {
return mediaFormat_.format.video.endPts;
}
// return numerator and denominator of time base
std::pair<int, int> getTimeBase() const {
return std::make_pair(
mediaFormat_.format.video.timeBaseNum,
mediaFormat_.format.video.timeBaseDen);
}
void checkStreamDecodeParams();
void updateStreamDecodeParams();
protected:
int initFormat() override;
std::unique_ptr<DecodedFrame> sampleFrameData() override;
private:
MediaFormat mediaFormat_;
std::unique_ptr<FfmpegVideoSampler> sampler_{nullptr};
};
#include "Interface.h"
void DecoderOutput::initMediaType(MediaType mediaType, FormatUnion format) {
MediaData mediaData(format);
media_data_.emplace(mediaType, std::move(mediaData));
}
void DecoderOutput::addMediaFrame(
MediaType mediaType,
std::unique_ptr<DecodedFrame> frame) {
if (media_data_.find(mediaType) != media_data_.end()) {
VLOG(1) << "media type: " << mediaType
<< " add frame with pts: " << frame->pts_;
media_data_[mediaType].frames_.push_back(std::move(frame));
} else {
VLOG(1) << "media type: " << mediaType << " not found. Skip the frame.";
}
}
void DecoderOutput::clear() {
media_data_.clear();
}
#pragma once
#include <c10/util/Logging.h>
#include <sys/types.h>
#include <memory>
#include <unordered_map>
extern "C" {
#include <libavutil/pixfmt.h>
#include <libavutil/samplefmt.h>
void av_free(void* ptr);
}
struct avDeleter {
void operator()(uint8_t* p) const {
av_free(p);
}
};
const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;
const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
using AvDataPtr = std::unique_ptr<uint8_t, avDeleter>;
enum MediaType : uint32_t {
TYPE_VIDEO = 1,
TYPE_AUDIO = 2,
};
struct EnumClassHash {
template <typename T>
uint32_t operator()(T t) const {
return static_cast<uint32_t>(t);
}
};
struct VideoFormat {
// Fields default to values suitable for auto-detection;
// the caller can override some/all of them if a specific output is desired
int width{0}; // width in pixels
int height{0}; // height in pixels
int minDimension{0}; // choose min dimension and rescale accordingly
// Output image pixel format. data type AVPixelFormat
AVPixelFormat format{defaultVideoPixelFormat}; // type AVPixelFormat
int64_t startPts{0}, endPts{0}; // Start and end presentation timestamp
int timeBaseNum{0};
int timeBaseDen{1}; // numerator and denominator of time base
float fps{0.0};
};
struct AudioFormat {
// Fields default to values suitable for auto-detection;
// the caller can override some/all of them if a specific output is desired
int samples{0}; // number of samples per second (sampling rate)
int channels{0}; // number of channels
AVSampleFormat format{defaultAudioSampleFormat}; // type AVSampleFormat
int64_t startPts{0}, endPts{0}; // Start and end presentation timestamp
int timeBaseNum{0};
int timeBaseDen{1}; // numerator and denominator of time base
};
union FormatUnion {
FormatUnion() {}
VideoFormat video;
AudioFormat audio;
};
struct MediaFormat {
MediaFormat() {}
MediaFormat(const MediaFormat& mediaFormat) : type(mediaFormat.type) {
if (type == MediaType::TYPE_VIDEO) {
format.video = mediaFormat.format.video;
} else if (type == MediaType::TYPE_AUDIO) {
format.audio = mediaFormat.format.audio;
}
}
MediaFormat(MediaType mediaType) : type(mediaType) {
if (mediaType == MediaType::TYPE_VIDEO) {
format.video = VideoFormat();
} else if (mediaType == MediaType::TYPE_AUDIO) {
format.audio = AudioFormat();
}
}
// media type
MediaType type;
// format data
FormatUnion format;
};
class DecodedFrame {
public:
explicit DecodedFrame() : frame_(nullptr), frameSize_(0), pts_(0) {}
explicit DecodedFrame(AvDataPtr frame, int frameSize, int64_t pts)
: frame_(std::move(frame)), frameSize_(frameSize), pts_(pts) {}
AvDataPtr frame_{nullptr};
int frameSize_{0};
int64_t pts_{0};
};
struct MediaData {
MediaData() {}
MediaData(FormatUnion format) : format_(format) {}
FormatUnion format_;
std::vector<std::unique_ptr<DecodedFrame>> frames_;
};
class DecoderOutput {
public:
explicit DecoderOutput() {}
~DecoderOutput() {}
void initMediaType(MediaType mediaType, FormatUnion format);
void addMediaFrame(MediaType mediaType, std::unique_ptr<DecodedFrame> frame);
void clear();
std::unordered_map<MediaType, MediaData, EnumClassHash> media_data_;
};
#include "VideoReader.h"
#include <ATen/ATen.h>
#include <Python.h>
#include <c10/util/Logging.h>
#include <exception>
#include "FfmpegDecoder.h"
#include "FfmpegHeaders.h"
#include "util.h"
using namespace std;
// If we are in a Windows environment, we need to define
// initialization functions for the video_reader extension
#ifdef _WIN32
#if PY_MAJOR_VERSION < 3
PyMODINIT_FUNC init_video_reader(void) {
// No need to do anything.
return NULL;
}
#else
PyMODINIT_FUNC PyInit_video_reader(void) {
// No need to do anything.
return NULL;
}
#endif
#endif
namespace video_reader {
bool glog_initialized = false;
class UnknownPixelFormatException : public exception {
const char* what() const throw() override {
return "Unknown pixel format";
}
};
int getChannels(AVPixelFormat format) {
int numChannels = 0;
switch (format) {
case AV_PIX_FMT_BGR24:
case AV_PIX_FMT_RGB24:
numChannels = 3;
break;
default:
LOG(ERROR) << "Unknown format: " << format;
throw UnknownPixelFormatException();
}
return numChannels;
}
void fillVideoTensor(
std::vector<unique_ptr<DecodedFrame>>& frames,
torch::Tensor& videoFrame,
torch::Tensor& videoFramePts) {
int frameSize = 0;
if (videoFrame.numel() > 0) {
frameSize = videoFrame.numel() / frames.size();
}
int frameCount = 0;
uint8_t* videoFrameData =
videoFrame.numel() > 0 ? videoFrame.data_ptr<uint8_t>() : nullptr;
int64_t* videoFramePtsData = videoFramePts.data_ptr<int64_t>();
for (size_t i = 0; i < frames.size(); ++i) {
const auto& frame = frames[i];
if (videoFrameData) {
memcpy(
videoFrameData + (size_t)(frameCount++) * (size_t)frameSize,
frame->frame_.get(),
frameSize * sizeof(uint8_t));
}
videoFramePtsData[i] = frame->pts_;
}
}
void getVideoMeta(
DecoderOutput& decoderOutput,
int& numFrames,
int& height,
int& width,
int& numChannels) {
auto& videoFrames = decoderOutput.media_data_[TYPE_VIDEO].frames_;
numFrames = videoFrames.size();
FormatUnion& videoFormat = decoderOutput.media_data_[TYPE_VIDEO].format_;
height = videoFormat.video.height;
width = videoFormat.video.width;
numChannels = getChannels(videoFormat.video.format);
}
void fillAudioTensor(
std::vector<unique_ptr<DecodedFrame>>& frames,
torch::Tensor& audioFrame,
torch::Tensor& audioFramePts) {
if (frames.size() == 0) {
return;
}
float* audioFrameData =
audioFrame.numel() > 0 ? audioFrame.data_ptr<float>() : nullptr;
CHECK_EQ(audioFramePts.size(0), frames.size());
int64_t* audioFramePtsData = audioFramePts.data_ptr<int64_t>();
int bytesPerSample = av_get_bytes_per_sample(defaultAudioSampleFormat);
int64_t frameDataOffset = 0;
for (size_t i = 0; i < frames.size(); ++i) {
audioFramePtsData[i] = frames[i]->pts_;
if (audioFrameData) {
memcpy(
audioFrameData + frameDataOffset,
frames[i]->frame_.get(),
frames[i]->frameSize_);
frameDataOffset += (frames[i]->frameSize_ / bytesPerSample);
}
}
}
void getAudioMeta(
DecoderOutput& decoderOutput,
int64_t& numSamples,
int64_t& channels,
int64_t& numFrames) {
FormatUnion& audioFormat = decoderOutput.media_data_[TYPE_AUDIO].format_;
channels = audioFormat.audio.channels;
CHECK_EQ(audioFormat.audio.format, AV_SAMPLE_FMT_FLT);
int bytesPerSample = av_get_bytes_per_sample(
static_cast<AVSampleFormat>(audioFormat.audio.format));
// auto& audioFrames = decoderOutput.media_frames_[TYPE_AUDIO];
auto& audioFrames = decoderOutput.media_data_[TYPE_AUDIO].frames_;
numFrames = audioFrames.size();
int64_t frameSizeTotal = 0;
for (auto const& decodedFrame : audioFrames) {
frameSizeTotal += static_cast<int64_t>(decodedFrame->frameSize_);
}
VLOG(2) << "numFrames: " << numFrames;
VLOG(2) << "frameSizeTotal: " << frameSizeTotal;
VLOG(2) << "channels: " << channels;
VLOG(2) << "bytesPerSample: " << bytesPerSample;
CHECK_EQ(frameSizeTotal % (channels * bytesPerSample), 0);
numSamples = frameSizeTotal / (channels * bytesPerSample);
}
torch::List<torch::Tensor> readVideo(
bool isReadFile,
const torch::Tensor& input_video,
std::string videoPath,
double seekFrameMargin,
int64_t getPtsOnly,
int64_t readVideoStream,
int64_t width,
int64_t height,
int64_t minDimension,
int64_t videoStartPts,
int64_t videoEndPts,
int64_t videoTimeBaseNum,
int64_t videoTimeBaseDen,
int64_t readAudioStream,
int64_t audioSamples,
int64_t audioChannels,
int64_t audioStartPts,
int64_t audioEndPts,
int64_t audioTimeBaseNum,
int64_t audioTimeBaseDen) {
if (!glog_initialized) {
glog_initialized = true;
// google::InitGoogleLogging("VideoReader");
}
unique_ptr<DecoderParameters> params = util::getDecoderParams(
seekFrameMargin,
getPtsOnly,
readVideoStream,
width,
height,
minDimension,
videoStartPts,
videoEndPts,
videoTimeBaseNum,
videoTimeBaseDen,
readAudioStream,
audioSamples,
audioChannels,
audioStartPts,
audioEndPts,
audioTimeBaseNum,
audioTimeBaseDen);
FfmpegDecoder decoder;
DecoderOutput decoderOutput;
if (isReadFile) {
decoder.decodeFile(std::move(params), videoPath, decoderOutput);
} else {
decoder.decodeMemory(
std::move(params),
input_video.data_ptr<uint8_t>(),
input_video.size(0),
decoderOutput);
}
// video section
torch::Tensor videoFrame = torch::zeros({0}, torch::kByte);
torch::Tensor videoFramePts = torch::zeros({0}, torch::kLong);
torch::Tensor videoTimeBase = torch::zeros({0}, torch::kInt);
torch::Tensor videoFps = torch::zeros({0}, torch::kFloat);
if (readVideoStream == 1) {
auto it = decoderOutput.media_data_.find(TYPE_VIDEO);
if (it != decoderOutput.media_data_.end()) {
int numVideoFrames, outHeight, outWidth, numChannels;
getVideoMeta(
decoderOutput, numVideoFrames, outHeight, outWidth, numChannels);
if (getPtsOnly == 0) {
videoFrame = torch::zeros(
{numVideoFrames, outHeight, outWidth, numChannels}, torch::kByte);
}
videoFramePts = torch::zeros({numVideoFrames}, torch::kLong);
fillVideoTensor(
decoderOutput.media_data_[TYPE_VIDEO].frames_,
videoFrame,
videoFramePts);
videoTimeBase = torch::zeros({2}, torch::kInt);
int* videoTimeBaseData = videoTimeBase.data_ptr<int>();
videoTimeBaseData[0] = it->second.format_.video.timeBaseNum;
videoTimeBaseData[1] = it->second.format_.video.timeBaseDen;
videoFps = torch::zeros({1}, torch::kFloat);
float* videoFpsData = videoFps.data_ptr<float>();
videoFpsData[0] = it->second.format_.video.fps;
} else {
VLOG(1) << "Miss video stream";
}
}
// audio section
torch::Tensor audioFrame = torch::zeros({0}, torch::kFloat);
torch::Tensor audioFramePts = torch::zeros({0}, torch::kLong);
torch::Tensor audioTimeBase = torch::zeros({0}, torch::kInt);
torch::Tensor audioSampleRate = torch::zeros({0}, torch::kInt);
if (readAudioStream == 1) {
auto it = decoderOutput.media_data_.find(TYPE_AUDIO);
if (it != decoderOutput.media_data_.end()) {
VLOG(1) << "Find audio stream";
int64_t numAudioSamples = 0, outAudioChannels = 0, numAudioFrames = 0;
getAudioMeta(
decoderOutput, numAudioSamples, outAudioChannels, numAudioFrames);
VLOG(2) << "numAudioSamples: " << numAudioSamples;
VLOG(2) << "outAudioChannels: " << outAudioChannels;
VLOG(2) << "numAudioFrames: " << numAudioFrames;
if (getPtsOnly == 0) {
audioFrame =
torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat);
}
audioFramePts = torch::zeros({numAudioFrames}, torch::kLong);
fillAudioTensor(
decoderOutput.media_data_[TYPE_AUDIO].frames_,
audioFrame,
audioFramePts);
audioTimeBase = torch::zeros({2}, torch::kInt);
int* audioTimeBaseData = audioTimeBase.data_ptr<int>();
audioTimeBaseData[0] = it->second.format_.audio.timeBaseNum;
audioTimeBaseData[1] = it->second.format_.audio.timeBaseDen;
audioSampleRate = torch::zeros({1}, torch::kInt);
int* audioSampleRateData = audioSampleRate.data_ptr<int>();
audioSampleRateData[0] = it->second.format_.audio.samples;
} else {
VLOG(1) << "Miss audio stream";
}
}
torch::List<torch::Tensor> result;
result.push_back(std::move(videoFrame));
result.push_back(std::move(videoFramePts));
result.push_back(std::move(videoTimeBase));
result.push_back(std::move(videoFps));
result.push_back(std::move(audioFrame));
result.push_back(std::move(audioFramePts));
result.push_back(std::move(audioTimeBase));
result.push_back(std::move(audioSampleRate));
return result;
}
torch::List<torch::Tensor> readVideoFromMemory(
torch::Tensor input_video,
double seekFrameMargin,
int64_t getPtsOnly,
int64_t readVideoStream,
int64_t width,
int64_t height,
int64_t minDimension,
int64_t videoStartPts,
int64_t videoEndPts,
int64_t videoTimeBaseNum,
int64_t videoTimeBaseDen,
int64_t readAudioStream,
int64_t audioSamples,
int64_t audioChannels,
int64_t audioStartPts,
int64_t audioEndPts,
int64_t audioTimeBaseNum,
int64_t audioTimeBaseDen) {
return readVideo(
false,
input_video,
"", // videoPath
seekFrameMargin,
getPtsOnly,
readVideoStream,
width,
height,
minDimension,
videoStartPts,
videoEndPts,
videoTimeBaseNum,
videoTimeBaseDen,
readAudioStream,
audioSamples,
audioChannels,
audioStartPts,
audioEndPts,
audioTimeBaseNum,
audioTimeBaseDen);
}
torch::List<torch::Tensor> readVideoFromFile(
std::string videoPath,
double seekFrameMargin,
int64_t getPtsOnly,
int64_t readVideoStream,
int64_t width,
int64_t height,
int64_t minDimension,
int64_t videoStartPts,
int64_t videoEndPts,
int64_t videoTimeBaseNum,
int64_t videoTimeBaseDen,
int64_t readAudioStream,
int64_t audioSamples,
int64_t audioChannels,
int64_t audioStartPts,
int64_t audioEndPts,
int64_t audioTimeBaseNum,
int64_t audioTimeBaseDen) {
torch::Tensor dummy_input_video = torch::ones({0});
return readVideo(
true,
dummy_input_video,
videoPath,
seekFrameMargin,
getPtsOnly,
readVideoStream,
width,
height,
minDimension,
videoStartPts,
videoEndPts,
videoTimeBaseNum,
videoTimeBaseDen,
readAudioStream,
audioSamples,
audioChannels,
audioStartPts,
audioEndPts,
audioTimeBaseNum,
audioTimeBaseDen);
}
} // namespace video_reader
static auto registry = torch::RegisterOperators()
.op("video_reader::read_video_from_memory",
&video_reader::readVideoFromMemory)
.op("video_reader::read_video_from_file",
&video_reader::readVideoFromFile);
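Once the extension is built, the two operators registered above become callable from Python after the shared library is loaded with torch.ops.load_library (the wrapper code in torchvision.io._video_opt, shown further below, does exactly this). A minimal sketch; the library filename is platform dependent and given here only as a placeholder:

import torch

# Placeholder filename: the actual extension name/suffix depends on the platform
# and the build; _video_opt.py locates it via imp.find_module("video_reader", ...).
torch.ops.load_library("video_reader.so")
print(torch.ops.video_reader.read_video_from_file)    # bound operator handle
print(torch.ops.video_reader.read_video_from_memory)  # bound operator handle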
#pragma once
#include <torch/script.h>
// Interface for Python
/*
return:
videoFrame: tensor (N, H, W, C) kByte
videoFramePts: tensor (N) kLong
videoTimeBase: tensor (2) kInt
videoFps: tensor (1) kFloat
audioFrame: tensor (N, C) kFloat
audioFramePts: tensor (N) kLong
audioTimeBase: tensor (2) kInt
audioSampleRate: tensor (1) kInt
*/
torch::List<torch::Tensor> readVideoFromMemory(
// 1D tensor of data type uint8, storing the compressed video data
torch::Tensor input_video,
// seeking a frame in the video/audio stream is imprecise, so seek to a
// timestamp earlier by a margin. The unit of the margin is seconds
double seekFrameMargin,
// If only pts is needed and video/audio frames are not needed, set it
// to 1
int64_t getPtsOnly,
// bool variable. Set it to 1 if video stream should be read. Otherwise, set
// it to 0
int64_t readVideoStream,
/*
Valid parameters values for rescaling video frames
___________________________________________________
| width | height | min_dimension | algorithm |
|_________________________________________________|
| 0 | 0 | 0 | original |
|_________________________________________________|
| 0 | 0 | >0 |scale to min dimension|
|_________________________________________________|
| >0 | 0 | 0 | scale keeping W |
|_________________________________________________|
| 0 | >0 | 0 | scale keeping H |
|_________________________________________________|
| >0 | >0 | 0 | stretch/scale |
|_________________________________________________|
*/
int64_t width,
int64_t height,
int64_t minDimension,
// video frames with pts in [videoStartPts, videoEndPts] will be decoded
// For decoding all video frames, use [0, -1]
int64_t videoStartPts,
int64_t videoEndPts,
// numerator and denominator of the time base of the video stream.
// For decoding all video frames, supply the dummy values 0 (numerator) and 1
// (denominator). For decoding a localized range of video frames, the actual
// time base must be supplied and is checked during decoding
int64_t videoTimeBaseNum,
int64_t videoTimeBaseDen,
// bool variable. Set it to 1 if audio stream should be read. Otherwise, set
// it to 0
int64_t readAudioStream,
// audio stream sampling rate.
// Supply 0 to keep the original sampling rate (no resampling);
// otherwise, supply a positive integer
int64_t audioSamples,
// audio stream channels
// Supply 0 to use the same number of channels as in the original audio
// stream
int64_t audioChannels,
// audio frames with pts in [audioStartPts, audioEndPts] will be decoded
// For decoding all audio frames, use [0, -1]
int64_t audioStartPts,
int64_t audioEndPts,
// numerator and denominator of the time base of the audio stream.
// For decoding all audio frames, supply the dummy values 0 (numerator) and 1
// (denominator). For decoding a localized range of audio frames, the actual
// time base must be supplied and is checked during decoding
int64_t audioTimeBaseNum,
int64_t audioTimeBaseDen);
torch::List<torch::Tensor> readVideoFromFile(
std::string videoPath,
double seekFrameMargin,
int64_t getPtsOnly,
int64_t readVideoStream,
int64_t width,
int64_t height,
int64_t minDimension,
int64_t videoStartPts,
int64_t videoEndPts,
int64_t videoTimeBaseNum,
int64_t videoTimeBaseDen,
int64_t readAudioStream,
int64_t audioSamples,
int64_t audioChannels,
int64_t audioStartPts,
int64_t audioEndPts,
int64_t audioTimeBaseNum,
int64_t audioTimeBaseDen);
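For reference, a sketch of calling the operator declared above directly from Python, assuming the extension library has already been loaded; the file path is a placeholder, and the argument order and the eight returned tensors follow the interface documented here:

import torch

result = torch.ops.video_reader.read_video_from_file(
    "example.mp4",  # placeholder path
    0.25,           # seekFrameMargin, in seconds
    0,              # getPtsOnly: 0 means decode the frame data too
    1,              # readVideoStream
    0, 0, 0,        # width, height, minDimension: keep the original size
    0, -1,          # videoStartPts, videoEndPts: decode all video frames
    0, 1,           # videoTimeBaseNum, videoTimeBaseDen: dummy time base
    1,              # readAudioStream
    0, 0,           # audioSamples, audioChannels: keep the original stream
    0, -1,          # audioStartPts, audioEndPts: decode all audio frames
    0, 1,           # audioTimeBaseNum, audioTimeBaseDen: dummy time base
)
(vframes, vframe_pts, vtimebase, vfps,
 aframes, aframe_pts, atimebase, asample_rate) = result
print(vframes.shape)  # (N, H, W, C), torch.uint8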
#include "util.h"
using namespace std;
namespace util {
unique_ptr<DecoderParameters> getDecoderParams(
double seekFrameMargin,
int64_t getPtsOnly,
int64_t readVideoStream,
int videoWidth,
int videoHeight,
int videoMinDimension,
int64_t videoStartPts,
int64_t videoEndPts,
int videoTimeBaseNum,
int videoTimeBaseDen,
int64_t readAudioStream,
int audioSamples,
int audioChannels,
int64_t audioStartPts,
int64_t audioEndPts,
int audioTimeBaseNum,
int audioTimeBaseDen) {
unique_ptr<DecoderParameters> params = make_unique<DecoderParameters>();
if (readVideoStream == 1) {
params->formats.emplace(
MediaType::TYPE_VIDEO, MediaFormat(MediaType::TYPE_VIDEO));
MediaFormat& videoFormat = params->formats[MediaType::TYPE_VIDEO];
videoFormat.format.video.width = videoWidth;
videoFormat.format.video.height = videoHeight;
videoFormat.format.video.minDimension = videoMinDimension;
videoFormat.format.video.startPts = videoStartPts;
videoFormat.format.video.endPts = videoEndPts;
videoFormat.format.video.timeBaseNum = videoTimeBaseNum;
videoFormat.format.video.timeBaseDen = videoTimeBaseDen;
}
if (readAudioStream == 1) {
params->formats.emplace(
MediaType::TYPE_AUDIO, MediaFormat(MediaType::TYPE_AUDIO));
MediaFormat& audioFormat = params->formats[MediaType::TYPE_AUDIO];
audioFormat.format.audio.samples = audioSamples;
audioFormat.format.audio.channels = audioChannels;
audioFormat.format.audio.startPts = audioStartPts;
audioFormat.format.audio.endPts = audioEndPts;
audioFormat.format.audio.timeBaseNum = audioTimeBaseNum;
audioFormat.format.audio.timeBaseDen = audioTimeBaseDen;
}
params->seekFrameMargin = seekFrameMargin;
params->getPtsOnly = getPtsOnly;
return params;
}
} // namespace util
#pragma once
#include <memory>
#include "FfmpegDecoder.h"
namespace util {
std::unique_ptr<DecoderParameters> getDecoderParams(
double seekFrameMargin,
int64_t getPtsOnly,
int64_t readVideoStream,
int videoWidth,
int videoHeight,
int videoMinDimension,
int64_t videoStartPts,
int64_t videoEndPts,
int videoTimeBaseNum,
int videoTimeBaseDen,
int64_t readAudioStream,
int audioSamples,
int audioChannels,
int64_t audioStartPts,
int64_t audioEndPts,
int audioTimeBaseNum,
int audioTimeBaseDen);
} // namespace util
from .video import write_video, read_video, read_video_timestamps
from ._video_opt import _read_video_from_file, _read_video_timestamps_from_file
__all__ = [
'write_video', 'read_video', 'read_video_timestamps',
]
from fractions import Fraction
import numpy as np
import os
import torch
import imp
import warnings
_HAS_VIDEO_OPT = False
try:
lib_dir = os.path.join(os.path.dirname(__file__), '..')
_, path, description = imp.find_module("video_reader", [lib_dir])
torch.ops.load_library(path)
_HAS_VIDEO_OPT = True
except (ImportError, OSError):
warnings.warn("video reader based on ffmpeg c++ ops not available")
default_timebase = Fraction(0, 1)
def _validate_pts(pts_range):
if pts_range[1] > 0:
assert pts_range[0] <= pts_range[1], \
"""Start pts should not be smaller than end pts, got
start pts: %d and end pts: %d""" % (pts_range[0], pts_range[1])
def _fill_info(vtimebase, vfps, atimebase, asample_rate):
info = {}
if vtimebase.numel() > 0:
info["video_timebase"] = Fraction(vtimebase[0].item(), vtimebase[1].item())
if vfps.numel() > 0:
info["video_fps"] = vfps.item()
if atimebase.numel() > 0:
info["audio_timebase"] = Fraction(atimebase[0].item(), atimebase[1].item())
if asample_rate.numel() > 0:
info["audio_sample_rate"] = asample_rate.item()
return info
def _align_audio_frames(aframes, aframe_pts, audio_pts_range):
start, end = aframe_pts[0], aframe_pts[-1]
num_samples = aframes.size(0)
step_per_aframe = float(end - start + 1) / float(num_samples)
s_idx = 0
e_idx = num_samples
if start < audio_pts_range[0]:
s_idx = int((audio_pts_range[0] - start) / step_per_aframe)
if audio_pts_range[1] != -1 and end > audio_pts_range[1]:
    e_idx = num_samples - int((end - audio_pts_range[1]) / step_per_aframe)
return aframes[s_idx:e_idx, :]
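A small worked example of the trimming above, with made-up pts values: 100 decoded audio samples covering pts 0..990 are cut down to the requested range [200, 800].

import torch

aframes = torch.zeros(100, 2)        # hypothetical decoded samples
aframe_pts = torch.tensor([0, 990])  # pts of first and last audio frame
trimmed = _align_audio_frames(aframes, aframe_pts, (200, 800))
# step_per_aframe is (990 - 0 + 1) / 100 = 9.91 pts per sample, so roughly
# samples 20 through 80 are kept
print(trimmed.shape)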
def _read_video_from_file(
filename,
seek_frame_margin=0.25,
read_video_stream=True,
video_width=0,
video_height=0,
video_min_dimension=0,
video_pts_range=(0, -1),
video_timebase=default_timebase,
read_audio_stream=True,
audio_samples=0,
audio_channels=0,
audio_pts_range=(0, -1),
audio_timebase=default_timebase,
):
"""
Reads a video from a file, returning both the video frames as well as
the audio frames
Args
----------
filename : str
path to the video file
seek_frame_margin: double, optional
seeking frame in the stream is imprecise. Thus, when video_start_pts is specified,
we seek the pts earlier by seek_frame_margin seconds
read_video_stream: int, optional
whether read video stream. If yes, set to 1. Otherwise, 0
video_width/video_height/video_min_dimension: int
together decide the size of decoded frames
- when video_width = 0, video_height = 0, and video_min_dimension = 0, keep the original frame resolution
- when video_width = 0, video_height = 0, and video_min_dimension != 0, keep the aspect ratio and resize
the frame so that the shorter edge is video_min_dimension
- when video_width = 0, and video_height != 0, keep the aspect ratio and resize the frame
so that frame video_height is $video_height
- when video_width != 0, and video_height == 0, keep the aspect ratio and resize the frame
so that frame video_width is $video_width
- when video_width != 0, and video_height != 0, resize the frame so that frame video_width and video_height
are set to $video_width and $video_height, respectively
video_pts_range : list(int), optional
the start and end presentation timestamp of video stream
video_timebase: Fraction, optional
a Fraction rational number which denotes timebase in video stream
read_audio_stream: int, optional
whether read audio stream. If yes, set to 1. Otherwise, 0
audio_samples: int, optional
audio sampling rate
audio_channels: int, optional
number of audio channels
audio_pts_range : list(int), optional
the start and end presentation timestamp of audio stream
audio_timebase: Fraction, optional
a Fraction rational number which denotes time base in audio stream
Returns
-------
vframes : Tensor[T, H, W, C]
the `T` video frames
aframes : Tensor[L, K]
the audio frames, where `L` is the number of points and
`K` is the number of audio_channels
info : Dict
metadata for the video and audio. Can contain the fields video_fps (float)
and audio_sample_rate (int)
"""
_validate_pts(video_pts_range)
_validate_pts(audio_pts_range)
result = torch.ops.video_reader.read_video_from_file(
filename,
seek_frame_margin,
0, # getPtsOnly
read_video_stream,
video_width,
video_height,
video_min_dimension,
video_pts_range[0],
video_pts_range[1],
video_timebase.numerator,
video_timebase.denominator,
read_audio_stream,
audio_samples,
audio_channels,
audio_pts_range[0],
audio_pts_range[1],
audio_timebase.numerator,
audio_timebase.denominator,
)
vframes, _vframe_pts, vtimebase, vfps, aframes, aframe_pts, atimebase, asample_rate = result
info = _fill_info(vtimebase, vfps, atimebase, asample_rate)
if aframes.numel() > 0:
# when audio stream is found
aframes = _align_audio_frames(aframes, aframe_pts, audio_pts_range)
return vframes, aframes, info
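A usage sketch, assuming a local file at a placeholder path: decode the whole clip, keeping the aspect ratio and scaling the shorter edge of each frame to 128 pixels.

vframes, aframes, info = _read_video_from_file(
    "clip.mp4",               # placeholder path
    read_video_stream=True,
    video_min_dimension=128,  # shorter edge resized to 128, aspect ratio kept
    read_audio_stream=True,
)
print(vframes.shape)  # (T, H, W, C) uint8 video frames
print(aframes.shape)  # (L, K) float audio samples
print(info.get("video_fps"), info.get("audio_sample_rate"))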
def _read_video_timestamps_from_file(filename):
"""
Decode all video and audio frames in the video. Only pts
(presentation timestamp) is returned. The actual frame pixel data is not
copied. Thus, it is much faster than read_video(...)
"""
result = torch.ops.video_reader.read_video_from_file(
filename,
0, # seek_frame_margin
1, # getPtsOnly
1, # read_video_stream
0, # video_width
0, # video_height
0, # video_min_dimension
0, # video_start_pts
-1, # video_end_pts
0, # video_timebase_num
1, # video_timebase_den
1, # read_audio_stream
0, # audio_samples
0, # audio_channels
0, # audio_start_pts
-1, # audio_end_pts
0, # audio_timebase_num
1, # audio_timebase_den
)
_vframes, vframe_pts, vtimebase, vfps, _aframes, aframe_pts, atimebase, asample_rate = result
info = _fill_info(vtimebase, vfps, atimebase, asample_rate)
vframe_pts = vframe_pts.numpy().tolist()
aframe_pts = aframe_pts.numpy().tolist()
return vframe_pts, aframe_pts, info
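The returned pts lists can then be used to decode only a sub-clip; a sketch (placeholder path, and assuming the file has a video stream so that info contains its time base):

vframe_pts, _aframe_pts, info = _read_video_timestamps_from_file("clip.mp4")
mid = len(vframe_pts) // 2
# decode only the second half of the video frames
vframes, aframes, _ = _read_video_from_file(
    "clip.mp4",
    video_pts_range=(vframe_pts[mid], vframe_pts[-1]),
    video_timebase=info["video_timebase"],
)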
def _read_video_from_memory(
file_buffer,
seek_frame_margin=0.25,
read_video_stream=1,
video_width=0,
video_height=0,
video_min_dimension=0,
video_pts_range=(0, -1),
video_timebase=default_timebase,
read_audio_stream=1,
audio_samples=0,
audio_channels=0,
audio_pts_range=(0, -1),
audio_timebase=default_timebase,
):
"""
Reads a video from memory, returning both the video frames as well as
the audio frames
Args
----------
file_buffer : buffer
buffer of compressed video content
seek_frame_margin: double, optional
seeking frame in the stream is imprecise. Thus, when video_start_pts is specified,
we seek the pts earlier by seek_frame_margin seconds
read_video_stream: int, optional
whether read video stream. If yes, set to 1. Otherwise, 0
video_width/video_height/video_min_dimension: int
together decide the size of decoded frames
- when video_width = 0, video_height = 0, and video_min_dimension = 0, keep the original frame resolution
- when video_width = 0, video_height = 0, and video_min_dimension != 0, keep the aspect ratio and resize
the frame so that the shorter edge is video_min_dimension
- when video_width = 0, and video_height != 0, keep the aspect ratio and resize the frame
so that frame video_height is $video_height
- when video_width != 0, and video_height == 0, keep the aspect ratio and resize the frame
so that frame video_width is $video_width
- when video_width != 0, and video_height != 0, resize the frame so that frame video_width and video_height
are set to $video_width and $video_height, respectively
video_pts_range : list(int), optional
the start and end presentation timestamp of video stream
video_timebase: Fraction, optional
a Fraction rational number which denotes timebase in video stream
read_audio_stream: int, optional
whether read audio stream. If yes, set to 1. Otherwise, 0
audio_samples: int, optional
audio sampling rate
audio_channels: int, optional
number of audio channels
audio_pts_range : list(int), optional
the start and end presentation timestamp of audio stream
audio_timebase: Fraction, optional
a Fraction rational number which denotes time base in audio stream
Returns
-------
vframes : Tensor[T, H, W, C]
the `T` video frames
aframes : Tensor[L, K]
the audio frames, where `L` is the number of points and
`K` is the number of channels
info : Dict
metadata for the video and audio. Can contain the fields video_fps (float)
and audio_sample_rate (int)
"""
_validate_pts(video_pts_range)
_validate_pts(audio_pts_range)
video_tensor = torch.from_numpy(np.frombuffer(file_buffer, dtype=np.uint8))
result = torch.ops.video_reader.read_video_from_memory(
video_tensor,
seek_frame_margin,
0, # getPtsOnly
read_video_stream,
video_width,
video_height,
video_min_dimension,
video_pts_range[0],
video_pts_range[1],
video_timebase.numerator,
video_timebase.denominator,
read_audio_stream,
audio_samples,
audio_channels,
audio_pts_range[0],
audio_pts_range[1],
audio_timebase.numerator,
audio_timebase.denominator,
)
vframes, _vframe_pts, vtimebase, vfps, aframes, aframe_pts, atimebase, asample_rate = result
info = _fill_info(vtimebase, vfps, atimebase, asample_rate)
if aframes.numel() > 0:
# when audio stream is found
aframes = _align_audio_frames(aframes, aframe_pts, audio_pts_range)
return vframes, aframes, info
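The width/height/min_dimension rules listed in the docstrings can be restated as a small helper; this is an illustration of the sizing logic only (mirroring setFormatDimensions on the C++ side, minus its clamping to a minimum of 1 pixel), not code used by the decoder:

def _illustrate_output_size(user_w, user_h, src_w, src_h, min_dimension):
    """Illustrative only: the frame-sizing rules described in the docstrings above."""
    if user_w == 0 and user_h == 0:
        if min_dimension > 0:
            # keep the aspect ratio and scale the shorter edge to min_dimension
            if src_w > src_h:
                return round(src_w * min_dimension / src_h), min_dimension
            return min_dimension, round(src_h * min_dimension / src_w)
        return src_w, src_h  # keep the original resolution
    if user_w != 0 and user_h == 0:
        return user_w, round(src_h * user_w / src_w)  # scale keeping W
    if user_w == 0 and user_h != 0:
        return round(src_w * user_h / src_h), user_h  # scale keeping H
    return user_w, user_h  # stretch/scale to the requested size

# e.g. a 1920x1080 source with video_min_dimension=128 becomes 228x128
print(_illustrate_output_size(0, 0, 1920, 1080, 128))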
def _read_video_timestamps_from_memory(file_buffer):
"""
Decode all frames in the video. Only pts (presentation timestamp) is returned.
The actual frame pixel data is not copied. Thus, read_video_timestamps(...)
is much faster than read_video(...)
"""
video_tensor = torch.from_numpy(np.frombuffer(file_buffer, dtype=np.uint8))
result = torch.ops.video_reader.read_video_from_memory(
video_tensor,
0, # seek_frame_margin
1, # getPtsOnly
1, # read_video_stream
0, # video_width
0, # video_height
0, # video_min_dimension
0, # video_start_pts
-1, # video_end_pts
0, # video_timebase_num
1, # video_timebase_den
1, # read_audio_stream
0, # audio_samples
0, # audio_channels
0, # audio_start_pts
-1, # audio_end_pts
0, # audio_timebase_num
1, # audio_timebase_den
)
_vframes, vframe_pts, vtimebase, vfps, _aframes, aframe_pts, atimebase, asample_rate = result
info = _fill_info(vtimebase, vfps, atimebase, asample_rate)
vframe_pts = vframe_pts.numpy().tolist()
aframe_pts = aframe_pts.numpy().tolist()
return vframe_pts, aframe_pts, info
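And a sketch of the in-memory path: read the raw bytes of a (placeholder) file into a buffer and decode directly from it.

with open("clip.mp4", "rb") as f:  # placeholder path
    file_buffer = f.read()

vframes, aframes, info = _read_video_from_memory(file_buffer)
vframe_pts, aframe_pts, _ = _read_video_timestamps_from_memory(file_buffer)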