Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
vision
Commits
f2600c2e
Unverified
Commit
f2600c2e
authored
Jan 29, 2020
by
Francisco Massa
Committed by
GitHub
Jan 29, 2020
Browse files
Revert "Base decoder for video. (#1747) (#1793)" (#1833)
This reverts commit
28b7f8ae
.
parent
c8345212
Changes
29
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
0 additions
and
1004 deletions
+0
-1004
torchvision/csrc/cpu/decoder/sync_decoder_test.cpp
torchvision/csrc/cpu/decoder/sync_decoder_test.cpp
+0
-22
torchvision/csrc/cpu/decoder/time_keeper.cpp
torchvision/csrc/cpu/decoder/time_keeper.cpp
+0
-40
torchvision/csrc/cpu/decoder/time_keeper.h
torchvision/csrc/cpu/decoder/time_keeper.h
+0
-27
torchvision/csrc/cpu/decoder/util.cpp
torchvision/csrc/cpu/decoder/util.cpp
+0
-374
torchvision/csrc/cpu/decoder/util.h
torchvision/csrc/cpu/decoder/util.h
+0
-33
torchvision/csrc/cpu/decoder/video_sampler.cpp
torchvision/csrc/cpu/decoder/video_sampler.cpp
+0
-274
torchvision/csrc/cpu/decoder/video_sampler.h
torchvision/csrc/cpu/decoder/video_sampler.h
+0
-52
torchvision/csrc/cpu/decoder/video_stream.cpp
torchvision/csrc/cpu/decoder/video_stream.cpp
+0
-143
torchvision/csrc/cpu/decoder/video_stream.h
torchvision/csrc/cpu/decoder/video_stream.h
+0
-39
No files found.
torchvision/csrc/cpu/decoder/sync_decoder_test.cpp
deleted
100644 → 0
View file @
c8345212
// Copyright 2004-present Facebook. All Rights Reserved.
#include <c10/util/Logging.h>
#include <gtest/gtest.h>
#include "sync_decoder.h"
using
namespace
ffmpeg
;
// Smoke test: decode every frame of a known mp4 asset end-to-end and log
// each decoded frame's presentation timestamp.
TEST(SyncDecoder, Test) {
  SyncDecoder decoder;

  // Configure the decoder: generous timeout, start one second in, and
  // request all media format variants the API accepts.
  DecoderParameters params;
  params.timeoutMs = 10000;
  params.startOffsetMs = 1000;
  params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')};
  params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4";

  CHECK(decoder.init(params, nullptr));

  // Pull frames until decode() stops returning success (0).
  DecoderOutputMessage out;
  while (0 == decoder.decode(&out, 100)) {
    LOG(INFO) << "Decoded frame, timestamp(us): " << out.header.pts;
  }
  decoder.shutdown();
}
torchvision/csrc/cpu/decoder/time_keeper.cpp
deleted
100644 → 0
View file @
c8345212
// Copyright 2004-present Facebook. All Rights Reserved.
#include "time_keeper.h"
extern
"C"
{
#include <libavutil/avutil.h>
}
namespace ffmpeg {

namespace {
// Maximum tolerated drift, in AV_TIME_BASE units, between the wall clock
// and the extrapolated stream clock before the stream base is re-anchored.
const ssize_t kMaxTimeBaseDifference = 10;
} // namespace

// Maps @decoderTimestamp (stream clock, us) onto the wall clock established
// at the first call, re-anchoring if the two clocks have drifted too far
// apart. Returns the advised sleep time (us, never negative) before the
// frame should be processed.
ssize_t TimeKeeper::adjust(ssize_t& decoderTimestamp) {
  const ssize_t now = std::chrono::duration_cast<std::chrono::microseconds>(
                          std::chrono::system_clock::now().time_since_epoch())
                          .count();

  // Lazily latch the reference points on first use.
  if (startTime_ == 0) {
    startTime_ = now;
  }
  if (streamTimestamp_ == 0) {
    streamTimestamp_ = decoderTimestamp;
  }

  // Wall-clock moment at which this frame is due.
  const auto runOut = startTime_ + decoderTimestamp - streamTimestamp_;

  // Re-synchronize if the schedule has drifted beyond the tolerance.
  if (std::labs((now - runOut) / AV_TIME_BASE) > kMaxTimeBaseDifference) {
    streamTimestamp_ = startTime_ - now + decoderTimestamp;
  }

  const auto sleepAdvised = runOut - now;

  // Rewrite the caller's timestamp into the wall-clock domain.
  decoderTimestamp += startTime_ - streamTimestamp_;

  return sleepAdvised > 0 ? sleepAdvised : 0;
}

} // namespace ffmpeg
torchvision/csrc/cpu/decoder/time_keeper.h
deleted
100644 → 0
View file @
c8345212
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#include <stdlib.h>
#include <chrono>
namespace ffmpeg {

/**
 * Tracks decoded timestamps (in us) for media streams, correlating the
 * stream clock with the wall clock.
 */
class TimeKeeper {
 public:
  TimeKeeper() = default;

  // Adjusts the provided @decoderTimestamp to the wall-clock-corrected value
  // and returns the advised sleep time (us) before processing the next frame.
  ssize_t adjust(ssize_t& decoderTimestamp);

 private:
  ssize_t startTime_{0};       // wall-clock anchor, set on first adjust()
  ssize_t streamTimestamp_{0}; // stream-clock anchor, set on first adjust()
};

} // namespace ffmpeg
torchvision/csrc/cpu/decoder/util.cpp
deleted
100644 → 0
View file @
c8345212
// Copyright 2004-present Facebook. All Rights Reserved.
#include "util.h"
#include <c10/util/Logging.h>
namespace
ffmpeg
{
namespace
Serializer
{
// Fixed-size (trivially copyable) types: byte-wise size and (de)serialization.
template <typename T>
inline size_t getSize(const T& x) {
  return sizeof(x);
}

// Copies @src into dest[pos..]; advances @pos. Returns false if the buffer
// of total length @len cannot hold the value.
template <typename T>
inline bool serializeItem(uint8_t* dest, size_t len, size_t& pos, const T& src) {
  VLOG(6) << "Generic serializeItem";
  const auto required = sizeof(src);
  if (len < pos + required) {
    return false;
  }
  memcpy(dest + pos, &src, required);
  pos += required;
  return true;
}

// Reads @dest out of src[pos..]; advances @pos. Returns false if fewer than
// sizeof(dest) bytes remain in the buffer of total length @len.
template <typename T>
inline bool deserializeItem(const uint8_t* src, size_t len, size_t& pos, T& dest) {
  const auto required = sizeof(dest);
  if (len < pos + required) {
    return false;
  }
  memcpy(&dest, src + pos, required);
  pos += required;
  return true;
}
// AVSubtitleRect specialization: fixed fields plus a type-dependent payload.
inline size_t getSize(const AVSubtitleRect& x) {
  // Bytes needed for the variable part of the rect.
  auto rectBytes = [](const AVSubtitleRect& y) -> size_t {
    size_t s = 0;
    switch (y.type) {
      case SUBTITLE_BITMAP:
        // One length prefix plus raw pixels per color plane.
        for (int i = 0; i < y.nb_colors; ++i) {
          s += sizeof(y.pict.linesize[i]);
          s += y.pict.linesize[i];
        }
        break;
      case SUBTITLE_TEXT:
        // Length prefix plus the (unterminated) text bytes.
        s += sizeof(size_t);
        s += strlen(y.text);
        break;
      case SUBTITLE_ASS:
        s += sizeof(size_t);
        s += strlen(y.ass);
        break;
      default:
        break;
    }
    return s;
  };
  return getSize(x.x) + getSize(x.y) + getSize(x.w) + getSize(x.h) +
      getSize(x.nb_colors) + getSize(x.type) + getSize(x.flags) +
      rectBytes(x);
}

// AVSubtitle specialization: fixed fields plus all owned rects.
inline size_t getSize(const AVSubtitle& x) {
  auto rectBytes = [](const AVSubtitle& y) -> size_t {
    size_t s = getSize(y.num_rects);
    for (unsigned i = 0; i < y.num_rects; ++i) {
      s += getSize(*y.rects[i]);
    }
    return s;
  };
  return getSize(x.format) + getSize(x.start_display_time) +
      getSize(x.end_display_time) + getSize(x.pts) + rectBytes(x);
}
// Serializes an AVSubtitleRect: seven fixed-size fields followed by a
// type-dependent payload (bitmap planes, text, or ASS text).
// Returns false if the destination buffer cannot hold the encoding.
inline bool serializeItem(
    uint8_t* dest,
    size_t len,
    size_t& pos,
    const AVSubtitleRect& src) {
  // NOTE: the lambda returns bool (was mistakenly declared `-> size_t`
  // while returning true/false; its result is only ever used as a bool,
  // so behavior is unchanged).
  auto rectSerialize =
      [](uint8_t* d, size_t l, size_t& p, const AVSubtitleRect& x) -> bool {
    switch (x.type) {
      case SUBTITLE_BITMAP:
        for (int i = 0; i < x.nb_colors; ++i) {
          // Length prefix, then the raw plane bytes.
          if (!serializeItem(d, l, p, x.pict.linesize[i])) {
            return false;
          }
          if (p + x.pict.linesize[i] > l) {
            return false;
          }
          memcpy(d + p, x.pict.data[i], x.pict.linesize[i]);
          p += x.pict.linesize[i];
        }
        return true;
      case SUBTITLE_TEXT: {
        const size_t s = strlen(x.text);
        if (!serializeItem(d, l, p, s)) {
          return false;
        }
        if (p + s > l) {
          return false;
        }
        memcpy(d + p, x.text, s);
        p += s;
        return true;
      }
      case SUBTITLE_ASS: {
        const size_t s = strlen(x.ass);
        if (!serializeItem(d, l, p, s)) {
          return false;
        }
        if (p + s > l) {
          return false;
        }
        memcpy(d + p, x.ass, s);
        p += s;
        return true;
      }
      default:
        return true;
    }
  };
  return serializeItem(dest, len, pos, src.x) &&
      serializeItem(dest, len, pos, src.y) &&
      serializeItem(dest, len, pos, src.w) &&
      serializeItem(dest, len, pos, src.h) &&
      serializeItem(dest, len, pos, src.nb_colors) &&
      serializeItem(dest, len, pos, src.type) &&
      serializeItem(dest, len, pos, src.flags) &&
      rectSerialize(dest, len, pos, src);
}

// Serializes an AVSubtitle: fixed-size header fields, the rect count, then
// each rect in order. Returns false on buffer overflow.
inline bool serializeItem(
    uint8_t* dest,
    size_t len,
    size_t& pos,
    const AVSubtitle& src) {
  auto rectSerialize =
      [](uint8_t* d, size_t l, size_t& p, const AVSubtitle& x) -> bool {
    bool res = serializeItem(d, l, p, x.num_rects);
    for (unsigned i = 0; res && i < x.num_rects; ++i) {
      res = serializeItem(d, l, p, *(x.rects[i]));
    }
    return res;
  };
  VLOG(6) << "AVSubtitle serializeItem";
  return serializeItem(dest, len, pos, src.format) &&
      serializeItem(dest, len, pos, src.start_display_time) &&
      serializeItem(dest, len, pos, src.end_display_time) &&
      serializeItem(dest, len, pos, src.pts) &&
      rectSerialize(dest, len, pos, src);
}
// Deserializes an AVSubtitleRect produced by the matching serializeItem.
// Allocates owned payload buffers with av_malloc; caller owns the result.
// NOTE(review): av_malloc results are used unchecked — presumably OOM is
// considered fatal here; confirm before reuse.
inline bool deserializeItem(
    const uint8_t* src,
    size_t len,
    size_t& pos,
    AVSubtitleRect& dest) {
  auto rectDeserialize =
      [](const uint8_t* y, size_t l, size_t& p, AVSubtitleRect& x) -> bool {
    switch (x.type) {
      case SUBTITLE_BITMAP:
        for (int i = 0; i < x.nb_colors; ++i) {
          if (!deserializeItem(y, l, p, x.pict.linesize[i])) {
            return false;
          }
          if (p + x.pict.linesize[i] > l) {
            return false;
          }
          x.pict.data[i] = (uint8_t*)av_malloc(x.pict.linesize[i]);
          memcpy(x.pict.data[i], y + p, x.pict.linesize[i]);
          p += x.pict.linesize[i];
        }
        return true;
      case SUBTITLE_TEXT: {
        size_t s = 0;
        if (!deserializeItem(y, l, p, s)) {
          return false;
        }
        if (p + s > l) {
          return false;
        }
        // Allocate s + 1 to append a NUL terminator.
        x.text = (char*)av_malloc(s + 1);
        memcpy(x.text, y + p, s);
        x.text[s] = 0;
        p += s;
        return true;
      }
      case SUBTITLE_ASS: {
        size_t s = 0;
        if (!deserializeItem(y, l, p, s)) {
          return false;
        }
        if (p + s > l) {
          return false;
        }
        x.ass = (char*)av_malloc(s + 1);
        memcpy(x.ass, y + p, s);
        x.ass[s] = 0;
        p += s;
        return true;
      }
      default:
        return true;
    }
  };
  return deserializeItem(src, len, pos, dest.x) &&
      deserializeItem(src, len, pos, dest.y) &&
      deserializeItem(src, len, pos, dest.w) &&
      deserializeItem(src, len, pos, dest.h) &&
      deserializeItem(src, len, pos, dest.nb_colors) &&
      deserializeItem(src, len, pos, dest.type) &&
      deserializeItem(src, len, pos, dest.flags) &&
      rectDeserialize(src, len, pos, dest);
}

// Deserializes an AVSubtitle: header fields, rect count, then each rect.
// Rect array and rects are av_malloc'ed and zero-initialized before fill.
inline bool deserializeItem(
    const uint8_t* src,
    size_t len,
    size_t& pos,
    AVSubtitle& dest) {
  auto rectDeserialize =
      [](const uint8_t* y, size_t l, size_t& p, AVSubtitle& x) -> bool {
    bool res = deserializeItem(y, l, p, x.num_rects);
    if (res && x.num_rects) {
      x.rects = (AVSubtitleRect**)av_malloc(
          x.num_rects * sizeof(AVSubtitleRect*));
    }
    for (unsigned i = 0; res && i < x.num_rects; ++i) {
      x.rects[i] = (AVSubtitleRect*)av_malloc(sizeof(AVSubtitleRect));
      memset(x.rects[i], 0, sizeof(AVSubtitleRect));
      res = deserializeItem(y, l, p, *x.rects[i]);
    }
    return res;
  };
  return deserializeItem(src, len, pos, dest.format) &&
      deserializeItem(src, len, pos, dest.start_display_time) &&
      deserializeItem(src, len, pos, dest.end_display_time) &&
      deserializeItem(src, len, pos, dest.pts) &&
      rectDeserialize(src, len, pos, dest);
}
}
// namespace Serializer
namespace
Util
{
// Returns a human-readable description for an FFMPEG error code, or a
// generic "Unknown error code: N" string if av_strerror cannot resolve it.
std::string generateErrorDesc(int errorCode) {
  std::array<char, 1024> buffer;
  if (av_strerror(errorCode, buffer.data(), buffer.size()) < 0) {
    return std::string("Unknown error code: ") + std::to_string(errorCode);
  }
  // Guarantee NUL termination before constructing the string.
  buffer.back() = 0;
  return std::string(buffer.data());
}
// Serializes @sub into the writable tail of @out.
// Returns the number of bytes written, or 0 on failure.
size_t serialize(const AVSubtitle& sub, ByteStorage* out) {
  const auto len = size(sub);
  CHECK_LE(len, out->tail());
  size_t pos = 0;
  if (!Serializer::serializeItem(out->writableTail(), len, pos, sub)) {
    return 0;
  }
  out->append(len);
  return len;
}

// Restores @sub from @buf. Returns false if the buffer is malformed/short.
bool deserialize(const ByteStorage& buf, AVSubtitle* sub) {
  size_t pos = 0;
  return Serializer::deserializeItem(buf.data(), buf.length(), pos, *sub);
}

// Number of bytes serialize() would produce for @sub.
size_t size(const AVSubtitle& sub) {
  return Serializer::getSize(sub);
}
// Checks that the requested output video format is one of the supported
// parameter combinations (see table below).
bool validateVideoFormat(const VideoFormat& f) {
  /*
  Valid parameters values for decoder
  ______________________________________________________________
  | W | H | minDimension | cropImage | algorithm            |
  |_____________________________________________________________|
  | 0 | 0 | 0            | N/A       | original             |
  |_____________________________________________________________|
  | >0 | 0 | N/A         | N/A       | scale keeping W      |
  |_____________________________________________________________|
  | 0 | >0 | N/A         | N/A       | scale keeping H      |
  |_____________________________________________________________|
  | >0 | >0 | N/A        | 0         | stretch/scale        |
  |_____________________________________________________________|
  | >0 | >0 | N/A        | >0        | scale/crop           |
  |_____________________________________________________________|
  | 0 | 0 | >0           | N/A       |scale to min dimension|
  |_____|_____|______________|___________|______________________|
  */
  const bool bothZero = // #1 and #6
      f.width == 0 && f.height == 0 && f.cropImage == 0;
  const bool bothSet = // #4 and #5
      f.width != 0 && f.height != 0 && f.minDimension == 0;
  const bool oneSet = // #2 and #3
      ((f.width != 0 && f.height == 0) ||
       (f.width == 0 && f.height != 0)) &&
      f.minDimension == 0 && f.cropImage == 0;
  return bothZero || bothSet || oneSet;
}
// Computes the output dimensions (destW, destH) for scaling a srcW x srcH
// frame according to the user's request:
//   - userW == userH == 0, minDimension == 0: keep source size;
//   - userW == userH == 0, minDimension > 0 : scale so the smaller side
//     equals minDimension, preserving aspect ratio;
//   - exactly one of userW/userH set        : scale preserving aspect ratio;
//   - both set, cropImage == 0              : stretch to userW x userH;
//   - both set, cropImage != 0              : scale so the image covers
//     userW x userH (the excess is cropped by the caller).
// Both outputs are clamped to be at least 1.
void setFormatDimensions(
    size_t& destW,
    size_t& destH,
    size_t userW,
    size_t userH,
    size_t srcW,
    size_t srcH,
    size_t minDimension,
    size_t cropImage) {
  // rounding rules
  // int -> double -> round up
  // if fraction is >= 0.5 or round down if fraction is < 0.5
  // int result = double(value) + 0.5
  // here we rounding double to int according to the above rule
  if (userW == 0 && userH == 0) {
    if (minDimension > 0) {
      if (srcW > srcH) {
        // landscape
        destH = minDimension;
        destW = round(double(srcW * minDimension) / srcH);
      } else {
        // portrait
        destW = minDimension;
        destH = round(double(srcH * minDimension) / srcW);
      }
    } else {
      destW = srcW;
      destH = srcH;
    }
  } else if (userW != 0 && userH == 0) {
    destW = userW;
    destH = round(double(srcH * userW) / srcW);
  } else if (userW == 0 && userH != 0) {
    destW = round(double(srcW * userH) / srcH);
    destH = userH;
  } else {
    // userW != 0 && userH != 0
    if (cropImage == 0) {
      destW = userW;
      destH = userH;
    } else {
      double userSlope = double(userH) / userW;
      double srcSlope = double(srcH) / srcW;
      if (srcSlope < userSlope) {
        destW = round(double(srcW * userH) / srcH);
        destH = userH;
      } else {
        destW = userW;
        destH = round(double(srcH * userW) / srcW);
      }
    }
  }
  // prevent zeros
  // FIX: was std::max(destW, 1UL) — 1UL is unsigned long, which is not
  // size_t on LLP64 platforms (e.g. 64-bit Windows), breaking std::max's
  // template-argument deduction. size_t{1} is portable.
  destW = std::max(destW, size_t{1});
  destH = std::max(destH, size_t{1});
}
}
// namespace Util
}
// namespace ffmpeg
torchvision/csrc/cpu/decoder/util.h
deleted
100644 → 0
View file @
c8345212
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#include "defs.h"
extern
"C"
{
#include <libavcodec/avcodec.h>
}
namespace
ffmpeg
{
/**
* FFMPEG library utility functions.
*/
namespace Util {

/// Human-readable description of an FFMPEG error code.
std::string generateErrorDesc(int errorCode);

/// Serializes @sub into @out; returns bytes written, or 0 on failure.
size_t serialize(const AVSubtitle& sub, ByteStorage* out);

/// Restores @sub from @buf; returns false if the buffer is malformed.
bool deserialize(const ByteStorage& buf, AVSubtitle* sub);

/// Number of bytes serialize() would produce for @sub.
size_t size(const AVSubtitle& sub);

/// Computes output dimensions for scaling/cropping a srcW x srcH frame;
/// results are written to @destW/@destH and clamped to at least 1.
void setFormatDimensions(
    size_t& destW,
    size_t& destH,
    size_t userW,
    size_t userH,
    size_t srcW,
    size_t srcH,
    size_t minDimension,
    size_t cropImage);

/// Checks the output format is a supported parameter combination.
bool validateVideoFormat(const VideoFormat& format);

} // namespace Util
}
// namespace ffmpeg
torchvision/csrc/cpu/decoder/video_sampler.cpp
deleted
100644 → 0
View file @
c8345212
// Copyright 2004-present Facebook. All Rights Reserved.
#include "video_sampler.h"
#include <c10/util/Logging.h>
#include "util.h"
extern
"C"
{
#include <libavutil/imgutils.h>
}
// www.ffmpeg.org/doxygen/0.5/swscale-example_8c-source.html
namespace
ffmpeg
{
namespace
{
int
preparePlanes
(
const
VideoFormat
&
fmt
,
const
uint8_t
*
buffer
,
uint8_t
**
planes
,
int
*
lineSize
)
{
int
result
;
if
((
result
=
av_image_fill_arrays
(
planes
,
lineSize
,
buffer
,
(
AVPixelFormat
)
fmt
.
format
,
fmt
.
width
,
fmt
.
height
,
1
))
<
0
)
{
LOG
(
ERROR
)
<<
"av_image_fill_arrays failed, err: "
<<
Util
::
generateErrorDesc
(
result
);
}
return
result
;
}
int
transformImage
(
SwsContext
*
context
,
const
uint8_t
*
const
srcSlice
[],
int
srcStride
[],
VideoFormat
inFormat
,
VideoFormat
outFormat
,
uint8_t
*
out
,
uint8_t
*
planes
[],
int
lines
[])
{
int
result
;
if
((
result
=
preparePlanes
(
outFormat
,
out
,
planes
,
lines
))
<
0
)
{
return
result
;
}
if
((
result
=
sws_scale
(
context
,
srcSlice
,
srcStride
,
0
,
inFormat
.
height
,
planes
,
lines
))
<
0
)
{
LOG
(
ERROR
)
<<
"sws_scale failed, err: "
<<
Util
::
generateErrorDesc
(
result
);
return
result
;
}
return
0
;
}
}
// namespace
VideoSampler::VideoSampler(int swsFlags, int64_t loggingUuid)
    : swsFlags_(swsFlags), loggingUuid_(loggingUuid) {}

VideoSampler::~VideoSampler() {
  cleanUp();
}

// Releases swscale contexts and scratch buffers; sampler can be re-init'ed.
void VideoSampler::shutdown() {
  cleanUp();
}
bool
VideoSampler
::
init
(
const
SamplerParameters
&
params
)
{
cleanUp
();
if
(
params
.
out
.
video
.
cropImage
!=
0
)
{
if
(
!
Util
::
validateVideoFormat
(
params
.
out
.
video
))
{
LOG
(
ERROR
)
<<
"Invalid video format"
<<
", width: "
<<
params
.
out
.
video
.
width
<<
", height: "
<<
params
.
out
.
video
.
height
<<
", format: "
<<
params
.
out
.
video
.
format
<<
", minDimension: "
<<
params
.
out
.
video
.
minDimension
<<
", crop: "
<<
params
.
out
.
video
.
cropImage
;
return
false
;
}
scaleFormat_
.
format
=
params
.
out
.
video
.
format
;
Util
::
setFormatDimensions
(
scaleFormat_
.
width
,
scaleFormat_
.
height
,
params
.
out
.
video
.
width
,
params
.
out
.
video
.
height
,
params
.
in
.
video
.
width
,
params
.
in
.
video
.
height
,
0
,
1
);
if
(
!
(
scaleFormat_
==
params_
.
out
.
video
))
{
// crop required
cropContext_
=
sws_getContext
(
params
.
out
.
video
.
width
,
params
.
out
.
video
.
height
,
(
AVPixelFormat
)
params_
.
out
.
video
.
format
,
params
.
out
.
video
.
width
,
params
.
out
.
video
.
height
,
(
AVPixelFormat
)
params
.
out
.
video
.
format
,
swsFlags_
,
nullptr
,
nullptr
,
nullptr
);
if
(
!
cropContext_
)
{
LOG
(
ERROR
)
<<
"sws_getContext failed for crop context"
;
return
false
;
}
const
auto
scaleImageSize
=
av_image_get_buffer_size
(
(
AVPixelFormat
)
scaleFormat_
.
format
,
scaleFormat_
.
width
,
scaleFormat_
.
height
,
1
);
scaleBuffer_
.
resize
(
scaleImageSize
);
}
}
else
{
scaleFormat_
=
params
.
out
.
video
;
}
VLOG
(
1
)
<<
"Input format #"
<<
loggingUuid_
<<
", width "
<<
params
.
in
.
video
.
width
<<
", height "
<<
params
.
in
.
video
.
height
<<
", format "
<<
params
.
in
.
video
.
format
<<
", minDimension "
<<
params
.
in
.
video
.
minDimension
<<
", cropImage "
<<
params
.
in
.
video
.
cropImage
;
VLOG
(
1
)
<<
"Scale format #"
<<
loggingUuid_
<<
", width "
<<
scaleFormat_
.
width
<<
", height "
<<
scaleFormat_
.
height
<<
", format "
<<
scaleFormat_
.
format
<<
", minDimension "
<<
scaleFormat_
.
minDimension
<<
", cropImage "
<<
scaleFormat_
.
cropImage
;
VLOG
(
1
)
<<
"Crop format #"
<<
loggingUuid_
<<
", width "
<<
params
.
out
.
video
.
width
<<
", height "
<<
params
.
out
.
video
.
height
<<
", format "
<<
params
.
out
.
video
.
format
<<
", minDimension "
<<
params
.
out
.
video
.
minDimension
<<
", cropImage "
<<
params
.
out
.
video
.
cropImage
;
scaleContext_
=
sws_getContext
(
params
.
in
.
video
.
width
,
params
.
in
.
video
.
height
,
(
AVPixelFormat
)
params
.
in
.
video
.
format
,
scaleFormat_
.
width
,
scaleFormat_
.
height
,
(
AVPixelFormat
)
scaleFormat_
.
format
,
swsFlags_
,
nullptr
,
nullptr
,
nullptr
);
// set output format
params_
=
params
;
return
scaleContext_
!=
nullptr
;
}
// Size in bytes of one output image in the configured output format.
int VideoSampler::getImageBytes() const {
  return av_image_get_buffer_size(
      (AVPixelFormat)params_.out.video.format,
      params_.out.video.width,
      params_.out.video.height,
      1);
}
// Scales (and, if configured, center-crops) the source planes into @out.
// When @allocateBuffer is set, @out is cleared and grown to fit the result.
// Returns the output image size in bytes, or a negative error code.
int VideoSampler::sample(
    const uint8_t* const srcSlice[],
    int srcStride[],
    ByteStorage* out,
    bool allocateBuffer) {
  int result;
  // scaled and cropped image
  const auto outImageSize = getImageBytes();
  if (allocateBuffer) {
    out->clear();
    out->ensure(outImageSize);
  }
  CHECK_LE(outImageSize, out->tail());

  uint8_t* scalePlanes[4] = {nullptr};
  int scaleLines[4] = {0};
  // perform scale first
  if ((result = transformImage(
           scaleContext_,
           srcSlice,
           srcStride,
           params_.in.video,
           scaleFormat_,
           // for crop use internal buffer
           cropContext_ ? scaleBuffer_.data() : out->writableTail(),
           scalePlanes,
           scaleLines))) {
    return result;
  }

  // is crop required?
  if (cropContext_) {
    uint8_t* cropPlanes[4] = {nullptr};
    int cropLines[4] = {0};

    if (params_.out.video.height < scaleFormat_.height) {
      // Destination image is wider of source image: cut top and bottom
      // by advancing each plane pointer past the top margin rows.
      for (size_t i = 0; i < 4 && scalePlanes[i] != nullptr; ++i) {
        scalePlanes[i] += scaleLines[i] *
            (scaleFormat_.height - params_.out.video.height) / 2;
      }
    } else {
      // Source image is wider of destination image: cut sides.
      // NOTE(review): lineSize/width approximates bytes-per-pixel here;
      // presumably only valid for packed formats — confirm before reuse.
      for (size_t i = 0; i < 4 && scalePlanes[i] != nullptr; ++i) {
        scalePlanes[i] += scaleLines[i] *
            (scaleFormat_.width - params_.out.video.width) / 2 /
            scaleFormat_.width;
      }
    }

    // crop image
    if ((result = transformImage(
             cropContext_,
             scalePlanes,
             scaleLines,
             params_.out.video,
             params_.out.video,
             out->writableTail(),
             cropPlanes,
             cropLines))) {
      return result;
    }
  }

  out->append(outImageSize);
  return outImageSize;
}
// Samples a decoded AVFrame into @out. A null frame is a flush request,
// which is a no-op for video.
int VideoSampler::sample(AVFrame* frame, ByteStorage* out) {
  if (!frame) {
    return 0; // no flush for videos
  }
  return sample(frame->data, frame->linesize, out, false);
}

// Samples a raw image held in @in (laid out per the configured input
// format) into @out. A null input is a flush request (no-op for video).
int VideoSampler::sample(const ByteStorage* in, ByteStorage* out) {
  if (!in) {
    return 0; // no flush for videos
  }
  int result;
  uint8_t* inPlanes[4] = {nullptr};
  int inLineSize[4] = {0};
  if ((result = preparePlanes(
           params_.in.video, in->data(), inPlanes, inLineSize)) < 0) {
    return result;
  }
  return sample(inPlanes, inLineSize, out, true);
}
// Frees both swscale contexts and drops the crop scratch buffer.
void VideoSampler::cleanUp() {
  if (scaleContext_) {
    sws_freeContext(scaleContext_);
    scaleContext_ = nullptr;
  }
  if (cropContext_) {
    sws_freeContext(cropContext_);
    cropContext_ = nullptr;
    scaleBuffer_.clear();
  }
}
}
// namespace ffmpeg
torchvision/csrc/cpu/decoder/video_sampler.h
deleted
100644 → 0
View file @
c8345212
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#include "defs.h"
extern
"C"
{
#include <libavformat/avformat.h>
#include "libswscale/swscale.h"
}
namespace
ffmpeg
{
/**
* Class transcode video frames from one format into another
*/
/**
 * Class transcode video frames from one format into another
 */
class VideoSampler : public MediaSampler {
 public:
  explicit VideoSampler(int swsFlags = SWS_AREA, int64_t loggingUuid = 0);

  ~VideoSampler() override;

  // MediaSampler overrides
  bool init(const SamplerParameters& params) override;
  int sample(const ByteStorage* in, ByteStorage* out) override;
  void shutdown() override;

  // returns number processed/scaling bytes
  int sample(AVFrame* frame, ByteStorage* out);

  // Size in bytes of one output image.
  int getImageBytes() const;

 private:
  // close resources
  void cleanUp();

  // helper functions for rescaling, cropping, etc.
  int sample(
      const uint8_t* const srcSlice[],
      int srcStride[],
      ByteStorage* out,
      bool allocateBuffer);

 private:
  VideoFormat scaleFormat_;                 // intermediate scale format
  SwsContext* scaleContext_{nullptr};       // input -> scale format
  SwsContext* cropContext_{nullptr};        // scale format -> output (crop)
  int swsFlags_{SWS_AREA};
  std::vector<uint8_t> scaleBuffer_;        // scratch for pre-crop image
  int64_t loggingUuid_{0};
};
}
// namespace ffmpeg
torchvision/csrc/cpu/decoder/video_stream.cpp
deleted
100644 → 0
View file @
c8345212
// Copyright 2004-present Facebook. All Rights Reserved.
#include "video_stream.h"
#include <c10/util/Logging.h>
#include "util.h"
namespace
ffmpeg
{
namespace {

// True when the frame's dimensions and pixel format match @x exactly.
bool operator==(const VideoFormat& x, const AVFrame& y) {
  return x.width == y.width && x.height == y.height && x.format == y.format;
}

// Copies the frame's dimensions and pixel format into @x; returns @x.
VideoFormat& toVideoFormat(VideoFormat& x, const AVFrame& y) {
  x.width = y.width;
  x.height = y.height;
  x.format = y.format;
  return x;
}

} // namespace
VideoStream::VideoStream(
    AVFormatContext* inputCtx,
    int index,
    bool convertPtsToWallTime,
    const VideoFormat& format,
    int64_t loggingUuid)
    : Stream(
          inputCtx,
          MediaFormat::makeMediaFormat(format, index),
          convertPtsToWallTime),
      loggingUuid_(loggingUuid) {}

VideoStream::~VideoStream() {
  // Shut the sampler down explicitly before releasing it.
  if (sampler_) {
    sampler_->shutdown();
    sampler_.reset();
  }
}
// Lazily creates the sampler on first use.
void VideoStream::ensureSampler() {
  if (!sampler_) {
    sampler_ = std::make_unique<VideoSampler>(SWS_AREA, loggingUuid_);
  }
}
int
VideoStream
::
initFormat
()
{
// set output format
if
(
!
Util
::
validateVideoFormat
(
format_
.
format
.
video
))
{
LOG
(
ERROR
)
<<
"Invalid video format"
<<
", width: "
<<
format_
.
format
.
video
.
width
<<
", height: "
<<
format_
.
format
.
video
.
height
<<
", format: "
<<
format_
.
format
.
video
.
format
<<
", minDimension: "
<<
format_
.
format
.
video
.
minDimension
<<
", crop: "
<<
format_
.
format
.
video
.
cropImage
;
return
-
1
;
}
// keep aspect ratio
Util
::
setFormatDimensions
(
format_
.
format
.
video
.
width
,
format_
.
format
.
video
.
height
,
format_
.
format
.
video
.
width
,
format_
.
format
.
video
.
height
,
codecCtx_
->
width
,
codecCtx_
->
height
,
format_
.
format
.
video
.
minDimension
,
0
);
if
(
format_
.
format
.
video
.
format
==
AV_PIX_FMT_NONE
)
{
format_
.
format
.
video
.
format
=
codecCtx_
->
pix_fmt
;
}
return
format_
.
format
.
video
.
width
!=
0
&&
format_
.
format
.
video
.
height
!=
0
&&
format_
.
format
.
video
.
format
!=
AV_PIX_FMT_NONE
?
0
:
-
1
;
}
// Returns the number of bytes one sampled output image requires.
// Re-initializes the sampler whenever the decoded frame's format no longer
// matches the sampler's input format (unless flushing).
int VideoStream::estimateBytes(bool flush) {
  ensureSampler();
  // check if input format gets changed
  if (!flush && !(sampler_->getInputFormat().video == *frame_)) {
    // - reinit sampler
    SamplerParameters params;
    params.type = format_.type;
    params.out = format_.format;
    toVideoFormat(params.in.video, *frame_);
    if (!sampler_->init(params)) {
      return -1;
    }

    VLOG(1) << "Set input video sampler format"
            << ", width: " << params.in.video.width
            << ", height: " << params.in.video.height
            << ", format: " << params.in.video.format
            << " : output video sampler format"
            << ", width: " << format_.format.video.width
            << ", height: " << format_.format.video.height
            << ", format: " << format_.format.video.format
            << ", minDimension: " << format_.format.video.minDimension
            << ", crop: " << format_.format.video.cropImage;
  }
  return sampler_->getImageBytes();
}
// Samples the current frame into @out; a flush passes a null frame through
// (a no-op for video sampling).
int VideoStream::copyFrameBytes(ByteStorage* out, bool flush) {
  ensureSampler();
  return sampler_->sample(flush ? nullptr : frame_, out);
}
// Populates the output header: sequence number, pts rescaled into
// AV_TIME_BASE_Q (optionally mapped to wall time), key-frame flag, fps,
// and the stream's media format.
// NOTE(review): av_frame_get_best_effort_timestamp is deprecated in newer
// FFMPEG releases in favor of frame->best_effort_timestamp — confirm the
// targeted FFMPEG version before modernizing.
void VideoStream::setHeader(DecoderHeader* header) {
  header->seqno = numGenerator_++;

  if (codecCtx_->time_base.num != 0) {
    header->pts = av_rescale_q(
        av_frame_get_best_effort_timestamp(frame_),
        codecCtx_->time_base,
        AV_TIME_BASE_Q);
  } else {
    // If the codec time_base is missing then we would've skipped the
    // rescalePackage step to rescale to codec time_base, so here we can
    // rescale straight from the stream time_base into AV_TIME_BASE_Q.
    header->pts = av_rescale_q(
        av_frame_get_best_effort_timestamp(frame_),
        inputCtx_->streams[format_.stream]->time_base,
        AV_TIME_BASE_Q);
  }

  if (convertPtsToWallTime_) {
    keeper_.adjust(header->pts);
  }

  header->keyFrame = frame_->key_frame;

  auto fpsRational = inputCtx_->streams[format_.stream]->avg_frame_rate;
  if (fpsRational.den) {
    header->fps = av_q2d(fpsRational);
  } else {
    // Unknown frame rate.
    header->fps = std::numeric_limits<double>::quiet_NaN();
  }

  header->format = format_;
}
}
// namespace ffmpeg
torchvision/csrc/cpu/decoder/video_stream.h
deleted
100644 → 0
View file @
c8345212
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#include "stream.h"
#include "time_keeper.h"
#include "video_sampler.h"
namespace
ffmpeg
{
/**
* Class uses FFMPEG library to decode one video stream.
*/
/**
 * Class uses FFMPEG library to decode one video stream.
 */
class VideoStream : public Stream {
 public:
  VideoStream(
      AVFormatContext* inputCtx,
      int index,
      bool convertPtsToWallTime,
      const VideoFormat& format,
      int64_t loggingUuid = 0);

  ~VideoStream() override;

 private:
  // Stream overrides
  int initFormat() override;
  int estimateBytes(bool flush) override;
  int copyFrameBytes(ByteStorage* out, bool flush) override;
  void setHeader(DecoderHeader* header) override;

  // Lazily creates the sampler on first use.
  void ensureSampler();

 private:
  std::unique_ptr<VideoSampler> sampler_; // scales/crops decoded frames
  TimeKeeper keeper_;                     // pts -> wall clock mapping
  int64_t loggingUuid_{0};
};
}
// namespace ffmpeg
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment