Commit 31fad34f authored by Zhicheng Yan's avatar Zhicheng Yan Committed by Francisco Massa

[video reader] inception commit (#1303)

* [video reader] inception commit

* add method save_metadata to class VideoClips in video_utils.py

* add load_metadata() method to VideoClips class

* add Exception to not catch unexpected events such as memory errors, interrupts

* fix bugs in video_plus.py

* [video reader] remove logging. update setup.py

* remove time measurement in test_video_reader.py

* Remove glog and try making ffmpeg finding more robust

* Add ffmpeg to conda build

* Add ffmpeg to conda build [again]

* Make library path finding more robust

* Missing import

* One more missing fix for import

* Py2 compatibility and change package to av to avoid version conflict with ffmpeg

* Fix for python2

* [video reader] support to decode one stream only (e.g. video/audio stream)

* remove argument _precomputed_metadata_filepath

* remove save_metadata method

* add get_metadata method

* expose _precomputed_metadata and frame_rate arguments in video dataset __init__ method

* remove ssize_t

* remove size_t to pass CI check on Windows

* add PyInit__video_reader function to pass CI check on Windows

* minor fix to define PyInit_video_reader symbol

* Make c++ video reader optional

* Temporarily revert changes to test_io

* Revert changes to python files

* Rename files to make it private

* Fix python lint

* Fix C++ lint

* add a functor object EnumClassHash to make Enum class instances usable as key type of std::unordered_map

* fix cpp format check
parent a6a926bc
#include "FfmpegStream.h"
#include "FfmpegUtil.h"
using namespace std;
// TODO: the use of refCount is currently disabled
static int refCount = 0;
FfmpegStream::FfmpegStream(
AVFormatContext* inputCtx,
int index,
enum AVMediaType avMediaType,
double seekFrameMargin)
: inputCtx_(inputCtx),
index_(index),
avMediaType_(avMediaType),
seekFrameMargin_(seekFrameMargin) {}
FfmpegStream::~FfmpegStream() {
if (frame_) {
av_frame_free(&frame_);
}
avcodec_free_context(&codecCtx_);
}
int FfmpegStream::openCodecContext() {
VLOG(2) << "stream start_time: " << inputCtx_->streams[index_]->start_time;
auto typeString = av_get_media_type_string(avMediaType_);
AVStream* st = inputCtx_->streams[index_];
auto codec_id = st->codecpar->codec_id;
VLOG(1) << "codec_id: " << codec_id;
AVCodec* codec = avcodec_find_decoder(codec_id);
if (!codec) {
LOG(ERROR) << "avcodec_find_decoder failed for codec_id: " << int(codec_id);
return AVERROR(EINVAL);
}
VLOG(1) << "Succeed to find decoder";
codecCtx_ = avcodec_alloc_context3(codec);
if (!codecCtx_) {
LOG(ERROR) << "avcodec_alloc_context3 fails";
return AVERROR(ENOMEM);
}
int ret;
/* Copy codec parameters from input stream to output codec context */
if ((ret = avcodec_parameters_to_context(codecCtx_, st->codecpar)) < 0) {
LOG(ERROR) << "Failed to copy " << typeString
<< " codec parameters to decoder context";
return ret;
}
AVDictionary* opts = nullptr;
av_dict_set(&opts, "refcounted_frames", refCount ? "1" : "0", 0);
// after avcodec_open2, value of codecCtx_->time_base is NOT meaningful
// But inputCtx_->streams[index_]->time_base has meaningful values
if ((ret = avcodec_open2(codecCtx_, codec, &opts)) < 0) {
LOG(ERROR) << "avcodec_open2 failed. " << ffmpeg_util::getErrorDesc(ret);
return ret;
}
VLOG(1) << "Succeed to open codec";
frame_ = av_frame_alloc();
return initFormat();
}
unique_ptr<DecodedFrame> FfmpegStream::getFrameData(int getPtsOnly) {
if (!codecCtx_) {
LOG(ERROR) << "Codec is not initialized";
return nullptr;
}
if (getPtsOnly) {
unique_ptr<DecodedFrame> decodedFrame = make_unique<DecodedFrame>();
decodedFrame->pts_ = frame_->pts;
return decodedFrame;
} else {
unique_ptr<DecodedFrame> decodedFrame = sampleFrameData();
if (decodedFrame) {
decodedFrame->pts_ = frame_->pts;
}
return decodedFrame;
}
}
void FfmpegStream::flush(int getPtsOnly, DecoderOutput& decoderOutput) {
VLOG(1) << "Media Type: " << getMediaType() << ", flush stream.";
// need to receive frames before entering draining mode
receiveAvailFrames(getPtsOnly, decoderOutput);
VLOG(2) << "send nullptr packet";
sendPacket(nullptr);
// receive remaining frames after entering draining mode
receiveAvailFrames(getPtsOnly, decoderOutput);
avcodec_flush_buffers(codecCtx_);
}
bool FfmpegStream::isFramePtsInRange() {
CHECK(frame_);
auto pts = frame_->pts;
auto startPts = this->getStartPts();
auto endPts = this->getEndPts();
VLOG(2) << "isPtsInRange. pts: " << pts << ", startPts: " << startPts
<< ", endPts: " << endPts;
return (pts == AV_NOPTS_VALUE) ||
(pts >= startPts && (endPts >= 0 ? pts <= endPts : true));
}
bool FfmpegStream::isFramePtsExceedRange() {
if (frame_) {
auto endPts = this->getEndPts();
VLOG(2) << "isFramePtsExceedRange. last_pts_: " << last_pts_
<< ", endPts: " << endPts;
return endPts >= 0 ? last_pts_ >= endPts : false;
} else {
return true;
}
}
// seek a frame
int FfmpegStream::seekFrame(int64_t seekPts) {
// translate margin from second to pts
int64_t margin = (int64_t)(
seekFrameMargin_ * (double)inputCtx_->streams[index_]->time_base.den /
(double)inputCtx_->streams[index_]->time_base.num);
int64_t real_seekPts = (seekPts - margin) > 0 ? (seekPts - margin) : 0;
VLOG(2) << "seek margin: " << margin;
VLOG(2) << "real seekPts: " << real_seekPts;
int ret = av_seek_frame(
inputCtx_,
index_,
(seekPts - margin) > 0 ? (seekPts - margin) : 0,
AVSEEK_FLAG_BACKWARD);
if (ret < 0) {
LOG(WARNING) << "av_seek_frame fails. Stream index: " << index_;
return ret;
}
return 0;
}
// send/receive encoding and decoding API overview
// https://ffmpeg.org/doxygen/3.4/group__lavc__encdec.html
int FfmpegStream::sendPacket(const AVPacket* packet) {
return avcodec_send_packet(codecCtx_, packet);
}
int FfmpegStream::receiveFrame() {
int ret = avcodec_receive_frame(codecCtx_, frame_);
if (ret >= 0) {
// succeed
frame_->pts = av_frame_get_best_effort_timestamp(frame_);
if (frame_->pts == AV_NOPTS_VALUE) {
// Trick: if we cannot figure out the pts, set it to (last_pts_ + 1)
frame_->pts = last_pts_ + 1;
}
last_pts_ = frame_->pts;
VLOG(2) << "avcodec_receive_frame succeed";
} else if (ret == AVERROR(EAGAIN)) {
VLOG(2) << "avcodec_receive_frame fails and returns AVERROR(EAGAIN). ";
} else if (ret == AVERROR_EOF) {
// no more frame to read
VLOG(2) << "avcodec_receive_frame returns AVERROR_EOF";
} else {
LOG(WARNING) << "avcodec_receive_frame failed. Error: "
<< ffmpeg_util::getErrorDesc(ret);
}
return ret;
}
void FfmpegStream::receiveAvailFrames(
int getPtsOnly,
DecoderOutput& decoderOutput) {
int result = 0;
while ((result = receiveFrame()) >= 0) {
unique_ptr<DecodedFrame> decodedFrame = getFrameData(getPtsOnly);
if (decodedFrame &&
((!getPtsOnly && decodedFrame->frameSize_ > 0) || getPtsOnly)) {
if (isFramePtsInRange()) {
decoderOutput.addMediaFrame(getMediaType(), std::move(decodedFrame));
}
} // end-if
} // end-while
}
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#pragma once
#include <memory>
#include <unordered_map>
#include <utility>
#include "FfmpegHeaders.h"
#include "Interface.h"
/*
Class uses FFMPEG library to decode one media stream (audio or video).
*/
class FfmpegStream {
public:
FfmpegStream(
AVFormatContext* inputCtx,
int index,
enum AVMediaType avMediaType,
double seekFrameMargin);
virtual ~FfmpegStream();
// returns 0 on success or a negative error code
int openCodecContext();
// returns stream index
int getIndex() const {
return index_;
}
// returns the decoded frame; when getPtsOnly is set, only its pts is filled in
std::unique_ptr<DecodedFrame> getFrameData(int getPtsOnly);
// flush the stream at the end of decoding:
// drain the remaining frames from the decoder into decoderOutput
void flush(int getPtsOnly, DecoderOutput& decoderOutput);
// seek a frame
int seekFrame(int64_t ts);
// send an AVPacket
int sendPacket(const AVPacket* packet);
// receive AVFrame
int receiveFrame();
// receive all available frames from the internal buffer
void receiveAvailFrames(int getPtsOnly, DecoderOutput& decoderOutput);
// return media type
virtual MediaType getMediaType() const = 0;
// return media format
virtual FormatUnion getMediaFormat() const = 0;
// return start presentation timestamp
virtual int64_t getStartPts() const = 0;
// return end presentation timestamp
virtual int64_t getEndPts() const = 0;
// is the pts of most recent frame within range?
bool isFramePtsInRange();
// does the pts of most recent frame exceed range?
bool isFramePtsExceedRange();
protected:
virtual int initFormat() = 0;
// returns a decoded frame
virtual std::unique_ptr<DecodedFrame> sampleFrameData() = 0;
protected:
AVFormatContext* const inputCtx_;
const int index_;
enum AVMediaType avMediaType_;
AVCodecContext* codecCtx_{nullptr};
AVFrame* frame_{nullptr};
// pts of last decoded frame
int64_t last_pts_{0};
double seekFrameMargin_{1.0};
};
#include "FfmpegUtil.h"
using namespace std;
namespace ffmpeg_util {
bool mapFfmpegType(AVMediaType media, MediaType* type) {
switch (media) {
case AVMEDIA_TYPE_VIDEO:
*type = MediaType::TYPE_VIDEO;
return true;
case AVMEDIA_TYPE_AUDIO:
*type = MediaType::TYPE_AUDIO;
return true;
default:
return false;
}
}
bool mapMediaType(MediaType type, AVMediaType* media) {
switch (type) {
case MediaType::TYPE_VIDEO:
*media = AVMEDIA_TYPE_VIDEO;
return true;
case MediaType::TYPE_AUDIO:
*media = AVMEDIA_TYPE_AUDIO;
return true;
default:
return false;
}
}
void setFormatDimensions(
int& destW,
int& destH,
int userW,
int userH,
int srcW,
int srcH,
int minDimension) {
// Rounding rule: convert the intermediate value to double and round to the
// nearest integer (fractions >= 0.5 round up, fractions < 0.5 round down).
if (userW == 0 && userH == 0) {
if (minDimension > 0) { // #2
if (srcW > srcH) {
// landscape
destH = minDimension;
destW = round(double(srcW * minDimension) / srcH);
} else {
// portrait
destW = minDimension;
destH = round(double(srcH * minDimension) / srcW);
}
} else { // #1
destW = srcW;
destH = srcH;
}
} else if (userW != 0 && userH == 0) { // #3
destW = userW;
destH = round(double(srcH * userW) / srcW);
} else if (userW == 0 && userH != 0) { // #4
destW = round(double(srcW * userH) / srcH);
destH = userH;
} else {
// userW != 0 && userH != 0. #5
destW = userW;
destH = userH;
}
// prevent zeros
destW = std::max(destW, 1);
destH = std::max(destH, 1);
}
bool validateVideoFormat(const VideoFormat& f) {
/*
Valid parameters values for decoder
___________________________________________________
| W | H | minDimension | algorithm |
|_________________________________________________|
| 0 | 0 | 0 | original |
|_________________________________________________|
| 0 | 0 | >0 |scale to min dimension|
|_________________________________________________|
| >0 | 0 | 0 | scale keeping W |
|_________________________________________________|
| 0 | >0 | 0 | scale keeping H |
|_________________________________________________|
| >0 | >0 | 0 | stretch/scale |
|_________________________________________________|
*/
return (f.width == 0 && f.height == 0) || // #1 and #2
(f.width != 0 && f.height != 0 && f.minDimension == 0) || // # 5
(((f.width != 0 && f.height == 0) || // #3 and #4
(f.width == 0 && f.height != 0)) &&
f.minDimension == 0);
}
string getErrorDesc(int errnum) {
array<char, 1024> buffer;
if (av_strerror(errnum, buffer.data(), buffer.size()) < 0) {
return string("Unknown error code");
}
buffer.back() = 0;
return string(buffer.data());
}
} // namespace ffmpeg_util
#pragma once
#include <array>
#include <string>
#include "FfmpegHeaders.h"
#include "Interface.h"
namespace ffmpeg_util {
bool mapFfmpegType(AVMediaType media, enum MediaType* type);
bool mapMediaType(MediaType type, enum AVMediaType* media);
void setFormatDimensions(
int& destW,
int& destH,
int userW,
int userH,
int srcW,
int srcH,
int minDimension);
bool validateVideoFormat(const VideoFormat& f);
std::string getErrorDesc(int errnum);
} // namespace ffmpeg_util
#include "FfmpegVideoSampler.h"
#include "FfmpegUtil.h"
using namespace std;
FfmpegVideoSampler::FfmpegVideoSampler(
const VideoFormat& in,
const VideoFormat& out,
int swsFlags)
: inFormat_(in), outFormat_(out), swsFlags_(swsFlags) {}
FfmpegVideoSampler::~FfmpegVideoSampler() {
if (scaleContext_) {
sws_freeContext(scaleContext_);
scaleContext_ = nullptr;
}
}
int FfmpegVideoSampler::init() {
VLOG(1) << "Input format: width " << inFormat_.width << ", height "
<< inFormat_.height << ", format " << inFormat_.format
<< ", minDimension " << inFormat_.minDimension;
VLOG(1) << "Scale format: width " << outFormat_.width << ", height "
<< outFormat_.height << ", format " << outFormat_.format
<< ", minDimension " << outFormat_.minDimension;
scaleContext_ = sws_getContext(
inFormat_.width,
inFormat_.height,
(AVPixelFormat)inFormat_.format,
outFormat_.width,
outFormat_.height,
static_cast<AVPixelFormat>(outFormat_.format),
swsFlags_,
nullptr,
nullptr,
nullptr);
if (scaleContext_) {
return 0;
} else {
return -1;
}
}
int32_t FfmpegVideoSampler::getImageBytes() const {
return av_image_get_buffer_size(
(AVPixelFormat)outFormat_.format, outFormat_.width, outFormat_.height, 1);
}
// https://ffmpeg.org/doxygen/3.4/scaling_video_8c-example.html#a10
unique_ptr<DecodedFrame> FfmpegVideoSampler::sample(const AVFrame* frame) {
if (!frame) {
return nullptr; // no flush for videos
}
// scaled and cropped image
auto outImageSize = getImageBytes();
AvDataPtr frameData(static_cast<uint8_t*>(av_malloc(outImageSize)));
uint8_t* scalePlanes[4] = {nullptr};
int scaleLines[4] = {0};
int result;
if ((result = av_image_fill_arrays(
scalePlanes,
scaleLines,
frameData.get(),
static_cast<AVPixelFormat>(outFormat_.format),
outFormat_.width,
outFormat_.height,
1)) < 0) {
LOG(ERROR) << "av_image_fill_arrays failed, err: "
<< ffmpeg_util::getErrorDesc(result);
return nullptr;
}
if ((result = sws_scale(
scaleContext_,
frame->data,
frame->linesize,
0,
inFormat_.height,
scalePlanes,
scaleLines)) < 0) {
LOG(ERROR) << "sws_scale failed, err: "
<< ffmpeg_util::getErrorDesc(result);
return nullptr;
}
return make_unique<DecodedFrame>(std::move(frameData), outImageSize, 0);
}
#pragma once
#include "FfmpegSampler.h"
/**
* Class that rescales/converts video frames from one format into another
*/
class FfmpegVideoSampler : public FfmpegSampler {
public:
explicit FfmpegVideoSampler(
const VideoFormat& in,
const VideoFormat& out,
int swsFlags = SWS_AREA);
~FfmpegVideoSampler() override;
int init() override;
int32_t getImageBytes() const;
// returns the sampled (rescaled) frame, or nullptr on failure
std::unique_ptr<DecodedFrame> sample(const AVFrame* frame) override;
const VideoFormat& getInFormat() const {
return inFormat_;
}
private:
VideoFormat inFormat_;
VideoFormat outFormat_;
int swsFlags_;
SwsContext* scaleContext_{nullptr};
};
#include "FfmpegVideoStream.h"
#include "FfmpegUtil.h"
using namespace std;
namespace {
bool operator==(const VideoFormat& x, const AVFrame& y) {
return x.width == y.width && x.height == y.height &&
x.format == static_cast<AVPixelFormat>(y.format);
}
VideoFormat toVideoFormat(const AVFrame& frame) {
VideoFormat videoFormat;
videoFormat.width = frame.width;
videoFormat.height = frame.height;
videoFormat.format = static_cast<AVPixelFormat>(frame.format);
return videoFormat;
}
} // namespace
FfmpegVideoStream::FfmpegVideoStream(
AVFormatContext* inputCtx,
int index,
enum AVMediaType avMediaType,
MediaFormat mediaFormat,
double seekFrameMargin)
: FfmpegStream(inputCtx, index, avMediaType, seekFrameMargin),
mediaFormat_(mediaFormat) {}
FfmpegVideoStream::~FfmpegVideoStream() {}
void FfmpegVideoStream::checkStreamDecodeParams() {
auto timeBase = getTimeBase();
if (timeBase.first > 0) {
CHECK_EQ(timeBase.first, inputCtx_->streams[index_]->time_base.num);
CHECK_EQ(timeBase.second, inputCtx_->streams[index_]->time_base.den);
}
}
void FfmpegVideoStream::updateStreamDecodeParams() {
auto timeBase = getTimeBase();
if (timeBase.first == 0) {
mediaFormat_.format.video.timeBaseNum =
inputCtx_->streams[index_]->time_base.num;
mediaFormat_.format.video.timeBaseDen =
inputCtx_->streams[index_]->time_base.den;
}
}
int FfmpegVideoStream::initFormat() {
// set output format
VideoFormat& format = mediaFormat_.format.video;
if (!ffmpeg_util::validateVideoFormat(format)) {
LOG(ERROR) << "Invalid video format";
return -1;
}
format.fps = av_q2d(
av_guess_frame_rate(inputCtx_, inputCtx_->streams[index_], nullptr));
// keep aspect ratio
ffmpeg_util::setFormatDimensions(
format.width,
format.height,
format.width,
format.height,
codecCtx_->width,
codecCtx_->height,
format.minDimension);
VLOG(1) << "After adjusting, video format"
<< ", width: " << format.width << ", height: " << format.height
<< ", format: " << format.format
<< ", minDimension: " << format.minDimension;
if (format.format == AV_PIX_FMT_NONE) {
format.format = codecCtx_->pix_fmt;
VLOG(1) << "Set pixel format: " << format.format;
}
checkStreamDecodeParams();
updateStreamDecodeParams();
return format.width != 0 && format.height != 0 &&
format.format != AV_PIX_FMT_NONE
? 0
: -1;
}
unique_ptr<DecodedFrame> FfmpegVideoStream::sampleFrameData() {
VideoFormat& format = mediaFormat_.format.video;
if (!sampler_ || !(sampler_->getInFormat() == *frame_)) {
VideoFormat newInFormat = toVideoFormat(*frame_);
sampler_ = make_unique<FfmpegVideoSampler>(newInFormat, format, SWS_AREA);
VLOG(1) << "Set input video sampler format"
<< ", width: " << newInFormat.width
<< ", height: " << newInFormat.height
<< ", format: " << newInFormat.format
<< " : output video sampler format"
<< ", width: " << format.width << ", height: " << format.height
<< ", format: " << format.format
<< ", minDimension: " << format.minDimension;
int ret = sampler_->init();
if (ret < 0) {
VLOG(1) << "Fail to initialize video sampler";
return nullptr;
}
}
return sampler_->sample(frame_);
}
#pragma once
#include <utility>
#include "FfmpegStream.h"
#include "FfmpegVideoSampler.h"
/**
* Class uses FFMPEG library to decode one video stream.
*/
class FfmpegVideoStream : public FfmpegStream {
public:
explicit FfmpegVideoStream(
AVFormatContext* inputCtx,
int index,
enum AVMediaType avMediaType,
MediaFormat mediaFormat,
double seekFrameMargin);
~FfmpegVideoStream() override;
// FfmpegStream overrides
MediaType getMediaType() const override {
return MediaType::TYPE_VIDEO;
}
FormatUnion getMediaFormat() const override {
return mediaFormat_.format;
}
int64_t getStartPts() const override {
return mediaFormat_.format.video.startPts;
}
int64_t getEndPts() const override {
return mediaFormat_.format.video.endPts;
}
// return numerator and denominator of time base
std::pair<int, int> getTimeBase() const {
return std::make_pair(
mediaFormat_.format.video.timeBaseNum,
mediaFormat_.format.video.timeBaseDen);
}
void checkStreamDecodeParams();
void updateStreamDecodeParams();
protected:
int initFormat() override;
std::unique_ptr<DecodedFrame> sampleFrameData() override;
private:
MediaFormat mediaFormat_;
std::unique_ptr<FfmpegVideoSampler> sampler_{nullptr};
};
#include "Interface.h"
void DecoderOutput::initMediaType(MediaType mediaType, FormatUnion format) {
MediaData mediaData(format);
media_data_.emplace(mediaType, std::move(mediaData));
}
void DecoderOutput::addMediaFrame(
MediaType mediaType,
std::unique_ptr<DecodedFrame> frame) {
if (media_data_.find(mediaType) != media_data_.end()) {
VLOG(1) << "media type: " << mediaType
<< " add frame with pts: " << frame->pts_;
media_data_[mediaType].frames_.push_back(std::move(frame));
} else {
VLOG(1) << "media type: " << mediaType << " not found. Skip the frame.";
}
}
void DecoderOutput::clear() {
media_data_.clear();
}
#pragma once
#include <c10/util/Logging.h>
#include <sys/types.h>
#include <memory>
#include <unordered_map>
extern "C" {
#include <libavutil/pixfmt.h>
#include <libavutil/samplefmt.h>
void av_free(void* ptr);
}
struct avDeleter {
void operator()(uint8_t* p) const {
av_free(p);
}
};
const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;
const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
using AvDataPtr = std::unique_ptr<uint8_t, avDeleter>;
enum MediaType : uint32_t {
TYPE_VIDEO = 1,
TYPE_AUDIO = 2,
};
struct EnumClassHash {
template <typename T>
uint32_t operator()(T t) const {
return static_cast<uint32_t>(t);
}
};
struct VideoFormat {
// Fields default to values suitable for auto-detection;
// the caller can override some/all of them if a specific output is desired
int width{0}; // width in pixels
int height{0}; // height in pixels
int minDimension{0}; // choose min dimension and rescale accordingly
// Output image pixel format. data type AVPixelFormat
AVPixelFormat format{defaultVideoPixelFormat}; // type AVPixelFormat
int64_t startPts{0}, endPts{0}; // Start and end presentation timestamp
int timeBaseNum{0};
int timeBaseDen{1}; // numerator and denominator of time base
float fps{0.0};
};
struct AudioFormat {
// Fields default to values suitable for auto-detection;
// the caller can override some/all of them if a specific output is desired
int samples{0}; // number of samples per second (sampling rate)
int channels{0}; // number of channels
AVSampleFormat format{defaultAudioSampleFormat}; // type AVSampleFormat
int64_t startPts{0}, endPts{0}; // Start and end presentation timestamp
int timeBaseNum{0};
int timeBaseDen{1}; // numerator and denominator of time base
};
union FormatUnion {
FormatUnion() {}
VideoFormat video;
AudioFormat audio;
};
struct MediaFormat {
MediaFormat() {}
MediaFormat(const MediaFormat& mediaFormat) : type(mediaFormat.type) {
if (type == MediaType::TYPE_VIDEO) {
format.video = mediaFormat.format.video;
} else if (type == MediaType::TYPE_AUDIO) {
format.audio = mediaFormat.format.audio;
}
}
MediaFormat(MediaType mediaType) : type(mediaType) {
if (mediaType == MediaType::TYPE_VIDEO) {
format.video = VideoFormat();
} else if (mediaType == MediaType::TYPE_AUDIO) {
format.audio = AudioFormat();
}
}
// media type
MediaType type;
// format data
FormatUnion format;
};
class DecodedFrame {
public:
explicit DecodedFrame() : frame_(nullptr), frameSize_(0), pts_(0) {}
explicit DecodedFrame(AvDataPtr frame, int frameSize, int64_t pts)
: frame_(std::move(frame)), frameSize_(frameSize), pts_(pts) {}
AvDataPtr frame_{nullptr};
int frameSize_{0};
int64_t pts_{0};
};
struct MediaData {
MediaData() {}
MediaData(FormatUnion format) : format_(format) {}
FormatUnion format_;
std::vector<std::unique_ptr<DecodedFrame>> frames_;
};
class DecoderOutput {
public:
explicit DecoderOutput() {}
~DecoderOutput() {}
void initMediaType(MediaType mediaType, FormatUnion format);
void addMediaFrame(MediaType mediaType, std::unique_ptr<DecodedFrame> frame);
void clear();
std::unordered_map<MediaType, MediaData, EnumClassHash> media_data_;
};
#include "VideoReader.h"
#include <ATen/ATen.h>
#include <Python.h>
#include <c10/util/Logging.h>
#include <exception>
#include "FfmpegDecoder.h"
#include "FfmpegHeaders.h"
#include "util.h"
using namespace std;
// If we are in a Windows environment, we need to define
// initialization functions for the video_reader extension
#ifdef _WIN32
#if PY_MAJOR_VERSION < 3
PyMODINIT_FUNC init_video_reader(void) {
// No need to do anything.
return NULL;
}
#else
PyMODINIT_FUNC PyInit_video_reader(void) {
// No need to do anything.
return NULL;
}
#endif
#endif
namespace video_reader {
bool glog_initialized = false;
class UnknownPixelFormatException : public exception {
const char* what() const throw() override {
return "Unknown pixel format";
}
};
int getChannels(AVPixelFormat format) {
int numChannels = 0;
switch (format) {
case AV_PIX_FMT_BGR24:
case AV_PIX_FMT_RGB24:
numChannels = 3;
break;
default:
LOG(ERROR) << "Unknown format: " << format;
throw UnknownPixelFormatException();
}
return numChannels;
}
void fillVideoTensor(
std::vector<unique_ptr<DecodedFrame>>& frames,
torch::Tensor& videoFrame,
torch::Tensor& videoFramePts) {
int frameSize = 0;
if (videoFrame.numel() > 0) {
frameSize = videoFrame.numel() / frames.size();
}
int frameCount = 0;
uint8_t* videoFrameData =
videoFrame.numel() > 0 ? videoFrame.data_ptr<uint8_t>() : nullptr;
int64_t* videoFramePtsData = videoFramePts.data_ptr<int64_t>();
for (size_t i = 0; i < frames.size(); ++i) {
const auto& frame = frames[i];
if (videoFrameData) {
memcpy(
videoFrameData + (size_t)(frameCount++) * (size_t)frameSize,
frame->frame_.get(),
frameSize * sizeof(uint8_t));
}
videoFramePtsData[i] = frame->pts_;
}
}
void getVideoMeta(
DecoderOutput& decoderOutput,
int& numFrames,
int& height,
int& width,
int& numChannels) {
auto& videoFrames = decoderOutput.media_data_[TYPE_VIDEO].frames_;
numFrames = videoFrames.size();
FormatUnion& videoFormat = decoderOutput.media_data_[TYPE_VIDEO].format_;
height = videoFormat.video.height;
width = videoFormat.video.width;
numChannels = getChannels(videoFormat.video.format);
}
void fillAudioTensor(
std::vector<unique_ptr<DecodedFrame>>& frames,
torch::Tensor& audioFrame,
torch::Tensor& audioFramePts) {
if (frames.size() == 0) {
return;
}
float* audioFrameData =
audioFrame.numel() > 0 ? audioFrame.data_ptr<float>() : nullptr;
CHECK_EQ(audioFramePts.size(0), frames.size());
int64_t* audioFramePtsData = audioFramePts.data_ptr<int64_t>();
int bytesPerSample = av_get_bytes_per_sample(defaultAudioSampleFormat);
int64_t frameDataOffset = 0;
for (size_t i = 0; i < frames.size(); ++i) {
audioFramePtsData[i] = frames[i]->pts_;
if (audioFrameData) {
memcpy(
audioFrameData + frameDataOffset,
frames[i]->frame_.get(),
frames[i]->frameSize_);
frameDataOffset += (frames[i]->frameSize_ / bytesPerSample);
}
}
}
void getAudioMeta(
DecoderOutput& decoderOutput,
int64_t& numSamples,
int64_t& channels,
int64_t& numFrames) {
FormatUnion& audioFormat = decoderOutput.media_data_[TYPE_AUDIO].format_;
channels = audioFormat.audio.channels;
CHECK_EQ(audioFormat.audio.format, AV_SAMPLE_FMT_FLT);
int bytesPerSample = av_get_bytes_per_sample(
static_cast<AVSampleFormat>(audioFormat.audio.format));
// auto& audioFrames = decoderOutput.media_frames_[TYPE_AUDIO];
auto& audioFrames = decoderOutput.media_data_[TYPE_AUDIO].frames_;
numFrames = audioFrames.size();
int64_t frameSizeTotal = 0;
for (auto const& decodedFrame : audioFrames) {
frameSizeTotal += static_cast<int64_t>(decodedFrame->frameSize_);
}
VLOG(2) << "numFrames: " << numFrames;
VLOG(2) << "frameSizeTotal: " << frameSizeTotal;
VLOG(2) << "channels: " << channels;
VLOG(2) << "bytesPerSample: " << bytesPerSample;
CHECK_EQ(frameSizeTotal % (channels * bytesPerSample), 0);
numSamples = frameSizeTotal / (channels * bytesPerSample);
}
torch::List<torch::Tensor> readVideo(
bool isReadFile,
const torch::Tensor& input_video,
std::string videoPath,
double seekFrameMargin,
int64_t getPtsOnly,
int64_t readVideoStream,
int64_t width,
int64_t height,
int64_t minDimension,
int64_t videoStartPts,
int64_t videoEndPts,
int64_t videoTimeBaseNum,
int64_t videoTimeBaseDen,
int64_t readAudioStream,
int64_t audioSamples,
int64_t audioChannels,
int64_t audioStartPts,
int64_t audioEndPts,
int64_t audioTimeBaseNum,
int64_t audioTimeBaseDen) {
if (!glog_initialized) {
glog_initialized = true;
// google::InitGoogleLogging("VideoReader");
}
unique_ptr<DecoderParameters> params = util::getDecoderParams(
seekFrameMargin,
getPtsOnly,
readVideoStream,
width,
height,
minDimension,
videoStartPts,
videoEndPts,
videoTimeBaseNum,
videoTimeBaseDen,
readAudioStream,
audioSamples,
audioChannels,
audioStartPts,
audioEndPts,
audioTimeBaseNum,
audioTimeBaseDen);
FfmpegDecoder decoder;
DecoderOutput decoderOutput;
if (isReadFile) {
decoder.decodeFile(std::move(params), videoPath, decoderOutput);
} else {
decoder.decodeMemory(
std::move(params),
input_video.data_ptr<uint8_t>(),
input_video.size(0),
decoderOutput);
}
// video section
torch::Tensor videoFrame = torch::zeros({0}, torch::kByte);
torch::Tensor videoFramePts = torch::zeros({0}, torch::kLong);
torch::Tensor videoTimeBase = torch::zeros({0}, torch::kInt);
torch::Tensor videoFps = torch::zeros({0}, torch::kFloat);
if (readVideoStream == 1) {
auto it = decoderOutput.media_data_.find(TYPE_VIDEO);
if (it != decoderOutput.media_data_.end()) {
int numVideoFrames, outHeight, outWidth, numChannels;
getVideoMeta(
decoderOutput, numVideoFrames, outHeight, outWidth, numChannels);
if (getPtsOnly == 0) {
videoFrame = torch::zeros(
{numVideoFrames, outHeight, outWidth, numChannels}, torch::kByte);
}
videoFramePts = torch::zeros({numVideoFrames}, torch::kLong);
fillVideoTensor(
decoderOutput.media_data_[TYPE_VIDEO].frames_,
videoFrame,
videoFramePts);
videoTimeBase = torch::zeros({2}, torch::kInt);
int* videoTimeBaseData = videoTimeBase.data_ptr<int>();
videoTimeBaseData[0] = it->second.format_.video.timeBaseNum;
videoTimeBaseData[1] = it->second.format_.video.timeBaseDen;
videoFps = torch::zeros({1}, torch::kFloat);
float* videoFpsData = videoFps.data_ptr<float>();
videoFpsData[0] = it->second.format_.video.fps;
} else {
VLOG(1) << "Miss video stream";
}
}
// audio section
torch::Tensor audioFrame = torch::zeros({0}, torch::kFloat);
torch::Tensor audioFramePts = torch::zeros({0}, torch::kLong);
torch::Tensor audioTimeBase = torch::zeros({0}, torch::kInt);
torch::Tensor audioSampleRate = torch::zeros({0}, torch::kInt);
if (readAudioStream == 1) {
auto it = decoderOutput.media_data_.find(TYPE_AUDIO);
if (it != decoderOutput.media_data_.end()) {
VLOG(1) << "Find audio stream";
int64_t numAudioSamples = 0, outAudioChannels = 0, numAudioFrames = 0;
getAudioMeta(
decoderOutput, numAudioSamples, outAudioChannels, numAudioFrames);
VLOG(2) << "numAudioSamples: " << numAudioSamples;
VLOG(2) << "outAudioChannels: " << outAudioChannels;
VLOG(2) << "numAudioFrames: " << numAudioFrames;
if (getPtsOnly == 0) {
audioFrame =
torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat);
}
audioFramePts = torch::zeros({numAudioFrames}, torch::kLong);
fillAudioTensor(
decoderOutput.media_data_[TYPE_AUDIO].frames_,
audioFrame,
audioFramePts);
audioTimeBase = torch::zeros({2}, torch::kInt);
int* audioTimeBaseData = audioTimeBase.data_ptr<int>();
audioTimeBaseData[0] = it->second.format_.audio.timeBaseNum;
audioTimeBaseData[1] = it->second.format_.audio.timeBaseDen;
audioSampleRate = torch::zeros({1}, torch::kInt);
int* audioSampleRateData = audioSampleRate.data_ptr<int>();
audioSampleRateData[0] = it->second.format_.audio.samples;
} else {
VLOG(1) << "Miss audio stream";
}
}
torch::List<torch::Tensor> result;
result.push_back(std::move(videoFrame));
result.push_back(std::move(videoFramePts));
result.push_back(std::move(videoTimeBase));
result.push_back(std::move(videoFps));
result.push_back(std::move(audioFrame));
result.push_back(std::move(audioFramePts));
result.push_back(std::move(audioTimeBase));
result.push_back(std::move(audioSampleRate));
return result;
}
torch::List<torch::Tensor> readVideoFromMemory(
torch::Tensor input_video,
double seekFrameMargin,
int64_t getPtsOnly,
int64_t readVideoStream,
int64_t width,
int64_t height,
int64_t minDimension,
int64_t videoStartPts,
int64_t videoEndPts,
int64_t videoTimeBaseNum,
int64_t videoTimeBaseDen,
int64_t readAudioStream,
int64_t audioSamples,
int64_t audioChannels,
int64_t audioStartPts,
int64_t audioEndPts,
int64_t audioTimeBaseNum,
int64_t audioTimeBaseDen) {
return readVideo(
false,
input_video,
"", // videoPath
seekFrameMargin,
getPtsOnly,
readVideoStream,
width,
height,
minDimension,
videoStartPts,
videoEndPts,
videoTimeBaseNum,
videoTimeBaseDen,
readAudioStream,
audioSamples,
audioChannels,
audioStartPts,
audioEndPts,
audioTimeBaseNum,
audioTimeBaseDen);
}
torch::List<torch::Tensor> readVideoFromFile(
std::string videoPath,
double seekFrameMargin,
int64_t getPtsOnly,
int64_t readVideoStream,
int64_t width,
int64_t height,
int64_t minDimension,
int64_t videoStartPts,
int64_t videoEndPts,
int64_t videoTimeBaseNum,
int64_t videoTimeBaseDen,
int64_t readAudioStream,
int64_t audioSamples,
int64_t audioChannels,
int64_t audioStartPts,
int64_t audioEndPts,
int64_t audioTimeBaseNum,
int64_t audioTimeBaseDen) {
torch::Tensor dummy_input_video = torch::ones({0});
return readVideo(
true,
dummy_input_video,
videoPath,
seekFrameMargin,
getPtsOnly,
readVideoStream,
width,
height,
minDimension,
videoStartPts,
videoEndPts,
videoTimeBaseNum,
videoTimeBaseDen,
readAudioStream,
audioSamples,
audioChannels,
audioStartPts,
audioEndPts,
audioTimeBaseNum,
audioTimeBaseDen);
}
} // namespace video_reader
static auto registry = torch::RegisterOperators()
.op("video_reader::read_video_from_memory",
&video_reader::readVideoFromMemory)
.op("video_reader::read_video_from_file",
&video_reader::readVideoFromFile);
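Once the extension is built, the two operators registered above become callable from Python after the shared library is loaded with torch.ops.load_library (the wrapper code in torchvision.io._video_opt, shown further below, does exactly this). A minimal sketch; the library filename is platform dependent and given here only as a placeholder:

import torch

# Placeholder filename: the actual extension name/suffix depends on the platform
# and the build; _video_opt.py locates it via imp.find_module("video_reader", ...).
torch.ops.load_library("video_reader.so")
print(torch.ops.video_reader.read_video_from_file)    # bound operator handle
print(torch.ops.video_reader.read_video_from_memory)  # bound operator handle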
#pragma once
#include <torch/script.h>
// Interface for Python
/*
return:
videoFrame: tensor (N, H, W, C) kByte
videoFramePts: tensor (N) kLong
videoTimeBase: tensor (2) kInt
videoFps: tensor (1) kFloat
audioFrame: tensor (N, C) kFloat
audioFramePts: tensor (N) kLong
audioTimeBase: tensor (2) kInt
audioSampleRate: tensor (1) kInt
*/
torch::List<torch::Tensor> readVideoFromMemory(
// 1D tensor of data type uint8, storing the compressed video data
torch::Tensor input_video,
// seeking a frame in the video/audio stream is imprecise, so seek to a
// timestamp earlier by a margin. The unit of the margin is seconds
double seekFrameMargin,
// If only pts is needed and video/audio frames are not needed, set it
// to 1
int64_t getPtsOnly,
// bool variable. Set it to 1 if video stream should be read. Otherwise, set
// it to 0
int64_t readVideoStream,
/*
Valid parameters values for rescaling video frames
___________________________________________________
| width | height | min_dimension | algorithm |
|_________________________________________________|
| 0 | 0 | 0 | original |
|_________________________________________________|
| 0 | 0 | >0 |scale to min dimension|
|_________________________________________________|
| >0 | 0 | 0 | scale keeping W |
|_________________________________________________|
| 0 | >0 | 0 | scale keeping H |
|_________________________________________________|
| >0 | >0 | 0 | stretch/scale |
|_________________________________________________|
*/
int64_t width,
int64_t height,
int64_t minDimension,
// video frames with pts in [videoStartPts, videoEndPts] will be decoded
// For decoding all video frames, use [0, -1]
int64_t videoStartPts,
int64_t videoEndPts,
// numerator and denominator of the time base of the video stream.
// For decoding all video frames, supply the dummy values 0 (numerator) and 1
// (denominator). For decoding a localized range of video frames, the actual
// time base must be supplied and is checked during decoding
int64_t videoTimeBaseNum,
int64_t videoTimeBaseDen,
// bool variable. Set it to 1 if audio stream should be read. Otherwise, set
// it to 0
int64_t readAudioStream,
// audio stream sampling rate.
// Supply 0 to keep the original sampling rate (no resampling);
// otherwise, supply a positive integer
int64_t audioSamples,
// audio stream channels
// Supply 0 to use the same number of channels as in the original audio
// stream
int64_t audioChannels,
// audio frames with pts in [audioStartPts, audioEndPts] will be decoded
// For decoding all audio frames, use [0, -1]
int64_t audioStartPts,
int64_t audioEndPts,
// numerator and denominator of the time base of the audio stream.
// For decoding all audio frames, supply the dummy values 0 (numerator) and 1
// (denominator). For decoding a localized range of audio frames, the actual
// time base must be supplied and is checked during decoding
int64_t audioTimeBaseNum,
int64_t audioTimeBaseDen);
torch::List<torch::Tensor> readVideoFromFile(
std::string videoPath,
double seekFrameMargin,
int64_t getPtsOnly,
int64_t readVideoStream,
int64_t width,
int64_t height,
int64_t minDimension,
int64_t videoStartPts,
int64_t videoEndPts,
int64_t videoTimeBaseNum,
int64_t videoTimeBaseDen,
int64_t readAudioStream,
int64_t audioSamples,
int64_t audioChannels,
int64_t audioStartPts,
int64_t audioEndPts,
int64_t audioTimeBaseNum,
int64_t audioTimeBaseDen);
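For reference, a sketch of calling the operator declared above directly from Python, assuming the extension library has already been loaded; the file path is a placeholder, and the argument order and the eight returned tensors follow the interface documented here:

import torch

result = torch.ops.video_reader.read_video_from_file(
    "example.mp4",  # placeholder path
    0.25,           # seekFrameMargin, in seconds
    0,              # getPtsOnly: 0 means decode the frame data too
    1,              # readVideoStream
    0, 0, 0,        # width, height, minDimension: keep the original size
    0, -1,          # videoStartPts, videoEndPts: decode all video frames
    0, 1,           # videoTimeBaseNum, videoTimeBaseDen: dummy time base
    1,              # readAudioStream
    0, 0,           # audioSamples, audioChannels: keep the original stream
    0, -1,          # audioStartPts, audioEndPts: decode all audio frames
    0, 1,           # audioTimeBaseNum, audioTimeBaseDen: dummy time base
)
(vframes, vframe_pts, vtimebase, vfps,
 aframes, aframe_pts, atimebase, asample_rate) = result
print(vframes.shape)  # (N, H, W, C), torch.uint8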
#include "util.h"
using namespace std;
namespace util {
unique_ptr<DecoderParameters> getDecoderParams(
double seekFrameMargin,
int64_t getPtsOnly,
int64_t readVideoStream,
int videoWidth,
int videoHeight,
int videoMinDimension,
int64_t videoStartPts,
int64_t videoEndPts,
int videoTimeBaseNum,
int videoTimeBaseDen,
int64_t readAudioStream,
int audioSamples,
int audioChannels,
int64_t audioStartPts,
int64_t audioEndPts,
int audioTimeBaseNum,
int audioTimeBaseDen) {
unique_ptr<DecoderParameters> params = make_unique<DecoderParameters>();
if (readVideoStream == 1) {
params->formats.emplace(
MediaType::TYPE_VIDEO, MediaFormat(MediaType::TYPE_VIDEO));
MediaFormat& videoFormat = params->formats[MediaType::TYPE_VIDEO];
videoFormat.format.video.width = videoWidth;
videoFormat.format.video.height = videoHeight;
videoFormat.format.video.minDimension = videoMinDimension;
videoFormat.format.video.startPts = videoStartPts;
videoFormat.format.video.endPts = videoEndPts;
videoFormat.format.video.timeBaseNum = videoTimeBaseNum;
videoFormat.format.video.timeBaseDen = videoTimeBaseDen;
}
if (readAudioStream == 1) {
params->formats.emplace(
MediaType::TYPE_AUDIO, MediaFormat(MediaType::TYPE_AUDIO));
MediaFormat& audioFormat = params->formats[MediaType::TYPE_AUDIO];
audioFormat.format.audio.samples = audioSamples;
audioFormat.format.audio.channels = audioChannels;
audioFormat.format.audio.startPts = audioStartPts;
audioFormat.format.audio.endPts = audioEndPts;
audioFormat.format.audio.timeBaseNum = audioTimeBaseNum;
audioFormat.format.audio.timeBaseDen = audioTimeBaseDen;
}
params->seekFrameMargin = seekFrameMargin;
params->getPtsOnly = getPtsOnly;
return params;
}
} // namespace util
#pragma once
#include <memory>
#include "FfmpegDecoder.h"
namespace util {
std::unique_ptr<DecoderParameters> getDecoderParams(
double seekFrameMargin,
int64_t getPtsOnly,
int64_t readVideoStream,
int videoWidth,
int videoHeight,
int videoMinDimension,
int64_t videoStartPts,
int64_t videoEndPts,
int videoTimeBaseNum,
int videoTimeBaseDen,
int64_t readAudioStream,
int audioSamples,
int audioChannels,
int64_t audioStartPts,
int64_t audioEndPts,
int audioTimeBaseNum,
int audioTimeBaseDen);
} // namespace util
from .video import write_video, read_video, read_video_timestamps
from ._video_opt import _read_video_from_file, _read_video_timestamps_from_file
__all__ = [
'write_video', 'read_video', 'read_video_timestamps',
]
from fractions import Fraction
import numpy as np
import os
import torch
import imp
import warnings
_HAS_VIDEO_OPT = False
try:
lib_dir = os.path.join(os.path.dirname(__file__), '..')
_, path, description = imp.find_module("video_reader", [lib_dir])
torch.ops.load_library(path)
_HAS_VIDEO_OPT = True
except (ImportError, OSError):
warnings.warn("video reader based on ffmpeg c++ ops not available")
default_timebase = Fraction(0, 1)
def _validate_pts(pts_range):
if pts_range[1] > 0:
assert pts_range[0] <= pts_range[1], \
"""Start pts should not be smaller than end pts, got
start pts: %d and end pts: %d""" % (pts_range[0], pts_range[1])
def _fill_info(vtimebase, vfps, atimebase, asample_rate):
info = {}
if vtimebase.numel() > 0:
info["video_timebase"] = Fraction(vtimebase[0].item(), vtimebase[1].item())
if vfps.numel() > 0:
info["video_fps"] = vfps.item()
if atimebase.numel() > 0:
info["audio_timebase"] = Fraction(atimebase[0].item(), atimebase[1].item())
if asample_rate.numel() > 0:
info["audio_sample_rate"] = asample_rate.item()
return info
def _align_audio_frames(aframes, aframe_pts, audio_pts_range):
start, end = aframe_pts[0], aframe_pts[-1]
num_samples = aframes.size(0)
step_per_aframe = float(end - start + 1) / float(num_samples)
s_idx = 0
e_idx = num_samples
if start < audio_pts_range[0]:
s_idx = int((audio_pts_range[0] - start) / step_per_aframe)
if audio_pts_range[1] != -1 and end > audio_pts_range[1]:
    e_idx = num_samples - int((end - audio_pts_range[1]) / step_per_aframe)
return aframes[s_idx:e_idx, :]
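A small worked example of the trimming above, with made-up pts values: 100 decoded audio samples covering pts 0..990 are cut down to the requested range [200, 800].

import torch

aframes = torch.zeros(100, 2)        # hypothetical decoded samples
aframe_pts = torch.tensor([0, 990])  # pts of first and last audio frame
trimmed = _align_audio_frames(aframes, aframe_pts, (200, 800))
# step_per_aframe is (990 - 0 + 1) / 100 = 9.91 pts per sample, so roughly
# samples 20 through 80 are kept
print(trimmed.shape)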
def _read_video_from_file(
filename,
seek_frame_margin=0.25,
read_video_stream=True,
video_width=0,
video_height=0,
video_min_dimension=0,
video_pts_range=(0, -1),
video_timebase=default_timebase,
read_audio_stream=True,
audio_samples=0,
audio_channels=0,
audio_pts_range=(0, -1),
audio_timebase=default_timebase,
):
"""
Reads a video from a file, returning both the video frames as well as
the audio frames
Args
----------
filename : str
path to the video file
seek_frame_margin: double, optional
seeking frame in the stream is imprecise. Thus, when video_start_pts is specified,
we seek the pts earlier by seek_frame_margin seconds
read_video_stream: int, optional
whether read video stream. If yes, set to 1. Otherwise, 0
video_width/video_height/video_min_dimension: int
together decide the size of decoded frames
- when video_width = 0, video_height = 0, and video_min_dimension = 0, keep the original frame resolution
- when video_width = 0, video_height = 0, and video_min_dimension != 0, keep the aspect ratio and resize
the frame so that the shorter edge is video_min_dimension
- when video_width = 0, and video_height != 0, keep the aspect ratio and resize the frame
so that frame video_height is $video_height
- when video_width != 0, and video_height == 0, keep the aspect ratio and resize the frame
so that frame video_width is $video_width
- when video_width != 0, and video_height != 0, resize the frame so that frame video_width and video_height
are set to $video_width and $video_height, respectively
video_pts_range : list(int), optional
the start and end presentation timestamp of video stream
video_timebase: Fraction, optional
a Fraction rational number which denotes timebase in video stream
read_audio_stream: int, optional
whether read audio stream. If yes, set to 1. Otherwise, 0
audio_samples: int, optional
audio sampling rate
audio_channels: int, optional
number of audio channels
audio_pts_range : list(int), optional
the start and end presentation timestamp of audio stream
audio_timebase: Fraction, optional
a Fraction rational number which denotes time base in audio stream
Returns
-------
vframes : Tensor[T, H, W, C]
the `T` video frames
aframes : Tensor[L, K]
the audio frames, where `L` is the number of points and
`K` is the number of audio_channels
info : Dict
metadata for the video and audio. Can contain the fields video_fps (float)
and audio_sample_rate (int)
"""
_validate_pts(video_pts_range)
_validate_pts(audio_pts_range)
result = torch.ops.video_reader.read_video_from_file(
filename,
seek_frame_margin,
0, # getPtsOnly
read_video_stream,
video_width,
video_height,
video_min_dimension,
video_pts_range[0],
video_pts_range[1],
video_timebase.numerator,
video_timebase.denominator,
read_audio_stream,
audio_samples,
audio_channels,
audio_pts_range[0],
audio_pts_range[1],
audio_timebase.numerator,
audio_timebase.denominator,
)
vframes, _vframe_pts, vtimebase, vfps, aframes, aframe_pts, atimebase, asample_rate = result
info = _fill_info(vtimebase, vfps, atimebase, asample_rate)
if aframes.numel() > 0:
# when audio stream is found
aframes = _align_audio_frames(aframes, aframe_pts, audio_pts_range)
return vframes, aframes, info
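A usage sketch, assuming a local file at a placeholder path: decode the whole clip, keeping the aspect ratio and scaling the shorter edge of each frame to 128 pixels.

vframes, aframes, info = _read_video_from_file(
    "clip.mp4",               # placeholder path
    read_video_stream=True,
    video_min_dimension=128,  # shorter edge resized to 128, aspect ratio kept
    read_audio_stream=True,
)
print(vframes.shape)  # (T, H, W, C) uint8 video frames
print(aframes.shape)  # (L, K) float audio samples
print(info.get("video_fps"), info.get("audio_sample_rate"))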
def _read_video_timestamps_from_file(filename):
"""
Decode all video and audio frames in the video. Only pts
(presentation timestamp) is returned. The actual frame pixel data is not
copied. Thus, it is much faster than read_video(...)
"""
result = torch.ops.video_reader.read_video_from_file(
filename,
0, # seek_frame_margin
1, # getPtsOnly
1, # read_video_stream
0, # video_width
0, # video_height
0, # video_min_dimension
0, # video_start_pts
-1, # video_end_pts
0, # video_timebase_num
1, # video_timebase_den
1, # read_audio_stream
0, # audio_samples
0, # audio_channels
0, # audio_start_pts
-1, # audio_end_pts
0, # audio_timebase_num
1, # audio_timebase_den
)
_vframes, vframe_pts, vtimebase, vfps, _aframes, aframe_pts, atimebase, asample_rate = result
info = _fill_info(vtimebase, vfps, atimebase, asample_rate)
vframe_pts = vframe_pts.numpy().tolist()
aframe_pts = aframe_pts.numpy().tolist()
return vframe_pts, aframe_pts, info
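The returned pts lists can then be used to decode only a sub-clip; a sketch (placeholder path, and assuming the file has a video stream so that info contains its time base):

vframe_pts, _aframe_pts, info = _read_video_timestamps_from_file("clip.mp4")
mid = len(vframe_pts) // 2
# decode only the second half of the video frames
vframes, aframes, _ = _read_video_from_file(
    "clip.mp4",
    video_pts_range=(vframe_pts[mid], vframe_pts[-1]),
    video_timebase=info["video_timebase"],
)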
def _read_video_from_memory(
file_buffer,
seek_frame_margin=0.25,
read_video_stream=1,
video_width=0,
video_height=0,
video_min_dimension=0,
video_pts_range=(0, -1),
video_timebase=default_timebase,
read_audio_stream=1,
audio_samples=0,
audio_channels=0,
audio_pts_range=(0, -1),
audio_timebase=default_timebase,
):
"""
Reads a video from memory, returning both the video frames as well as
the audio frames
Args
----------
file_buffer : buffer
buffer of compressed video content
seek_frame_margin: double, optional
seeking frame in the stream is imprecise. Thus, when video_start_pts is specified,
we seek the pts earlier by seek_frame_margin seconds
read_video_stream: int, optional
whether read video stream. If yes, set to 1. Otherwise, 0
video_width/video_height/video_min_dimension: int
together decide the size of decoded frames
- when video_width = 0, video_height = 0, and video_min_dimension = 0, keep the original frame resolution
- when video_width = 0, video_height = 0, and video_min_dimension != 0, keep the aspect ratio and resize
the frame so that the shorter edge is video_min_dimension
- when video_width = 0, and video_height != 0, keep the aspect ratio and resize the frame
so that frame video_height is $video_height
- when video_width != 0, and video_height == 0, keep the aspect ratio and resize the frame
so that frame video_width is $video_width
- when video_width != 0, and video_height != 0, resize the frame so that frame video_width and video_height
are set to $video_width and $video_height, respectively
video_pts_range : list(int), optional
the start and end presentation timestamp of video stream
video_timebase: Fraction, optional
a Fraction rational number which denotes timebase in video stream
read_audio_stream: int, optional
whether read audio stream. If yes, set to 1. Otherwise, 0
audio_samples: int, optional
audio sampling rate
audio_channels: int, optional
number of audio channels
audio_pts_range : list(int), optional
the start and end presentation timestamp of audio stream
audio_timebase: Fraction, optional
a Fraction rational number which denotes time base in audio stream
Returns
-------
vframes : Tensor[T, H, W, C]
the `T` video frames
aframes : Tensor[L, K]
the audio frames, where `L` is the number of points and
`K` is the number of channels
info : Dict
metadata for the video and audio. Can contain the fields video_fps (float)
and audio_sample_rate (int)
"""
_validate_pts(video_pts_range)
_validate_pts(audio_pts_range)
video_tensor = torch.from_numpy(np.frombuffer(file_buffer, dtype=np.uint8))
result = torch.ops.video_reader.read_video_from_memory(
video_tensor,
seek_frame_margin,
0, # getPtsOnly
read_video_stream,
video_width,
video_height,
video_min_dimension,
video_pts_range[0],
video_pts_range[1],
video_timebase.numerator,
video_timebase.denominator,
read_audio_stream,
audio_samples,
audio_channels,
audio_pts_range[0],
audio_pts_range[1],
audio_timebase.numerator,
audio_timebase.denominator,
)
vframes, _vframe_pts, vtimebase, vfps, aframes, aframe_pts, atimebase, asample_rate = result
info = _fill_info(vtimebase, vfps, atimebase, asample_rate)
if aframes.numel() > 0:
# when audio stream is found
aframes = _align_audio_frames(aframes, aframe_pts, audio_pts_range)
return vframes, aframes, info
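The width/height/min_dimension rules listed in the docstrings can be restated as a small helper; this is an illustration of the sizing logic only (mirroring setFormatDimensions on the C++ side, minus its clamping to a minimum of 1 pixel), not code used by the decoder:

def _illustrate_output_size(user_w, user_h, src_w, src_h, min_dimension):
    """Illustrative only: the frame-sizing rules described in the docstrings above."""
    if user_w == 0 and user_h == 0:
        if min_dimension > 0:
            # keep the aspect ratio and scale the shorter edge to min_dimension
            if src_w > src_h:
                return round(src_w * min_dimension / src_h), min_dimension
            return min_dimension, round(src_h * min_dimension / src_w)
        return src_w, src_h  # keep the original resolution
    if user_w != 0 and user_h == 0:
        return user_w, round(src_h * user_w / src_w)  # scale keeping W
    if user_w == 0 and user_h != 0:
        return round(src_w * user_h / src_h), user_h  # scale keeping H
    return user_w, user_h  # stretch/scale to the requested size

# e.g. a 1920x1080 source with video_min_dimension=128 becomes 228x128
print(_illustrate_output_size(0, 0, 1920, 1080, 128))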
def _read_video_timestamps_from_memory(file_buffer):
"""
Decode all frames in the video. Only pts (presentation timestamp) is returned.
The actual frame pixel data is not copied. Thus, read_video_timestamps(...)
is much faster than read_video(...)
"""
video_tensor = torch.from_numpy(np.frombuffer(file_buffer, dtype=np.uint8))
result = torch.ops.video_reader.read_video_from_memory(
video_tensor,
0, # seek_frame_margin
1, # getPtsOnly
1, # read_video_stream
0, # video_width
0, # video_height
0, # video_min_dimension
0, # video_start_pts
-1, # video_end_pts
0, # video_timebase_num
1, # video_timebase_den
1, # read_audio_stream
0, # audio_samples
0, # audio_channels
0, # audio_start_pts
-1, # audio_end_pts
0, # audio_timebase_num
1, # audio_timebase_den
)
_vframes, vframe_pts, vtimebase, vfps, _aframes, aframe_pts, atimebase, asample_rate = result
info = _fill_info(vtimebase, vfps, atimebase, asample_rate)
vframe_pts = vframe_pts.numpy().tolist()
aframe_pts = aframe_pts.numpy().tolist()
return vframe_pts, aframe_pts, info
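And a sketch of the in-memory path: read the raw bytes of a (placeholder) file into a buffer and decode directly from it.

with open("clip.mp4", "rb") as f:  # placeholder path
    file_buffer = f.read()

vframes, aframes, info = _read_video_from_memory(file_buffer)
vframe_pts, aframe_pts, _ = _read_video_timestamps_from_memory(file_buffer)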