// Interface.h
#pragma once

#include <c10/util/Logging.h>
#include <sys/types.h>
#include <cstdint>
#include <memory>
#include <unordered_map>
#include <vector>

extern "C" {

#include <libavutil/pixfmt.h>
#include <libavutil/samplefmt.h>
void av_free(void* ptr);
}

// Deleter functor that releases buffers through FFmpeg's av_free, so that
// libav-allocated memory can be owned RAII-style by AvDataPtr below.
struct avDeleter {
  void operator()(uint8_t* buffer) const {
    av_free(buffer);
  }
};

// Defaults used when the caller does not request a specific output format.
// constexpr (not just const): these are compile-time constants in a header.
constexpr AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;
constexpr AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;

// Owning pointer to a libav-allocated byte buffer; freed via av_free.
using AvDataPtr = std::unique_ptr<uint8_t, avDeleter>;

// Kind of elementary stream a MediaFormat/MediaData describes.
// Fixed uint32_t underlying type so values hash cleanly via EnumClassHash.
// Kept as an unscoped enum: callers reference TYPE_VIDEO/TYPE_AUDIO directly.
enum MediaType : uint32_t {
  TYPE_VIDEO = 1,
  TYPE_AUDIO = 2,
};

// Hash functor allowing enum types (e.g. MediaType) to key an
// std::unordered_map by casting the enumerator to its numeric value.
struct EnumClassHash {
  template <typename Enum>
  uint32_t operator()(Enum value) const {
    return static_cast<uint32_t>(value);
  }
};

struct VideoFormat {
  // Fields are zero-initialized for auto detection; the caller can set
  // some/all of them when a specific output is desirable.
  // (Stray scraped line-number tokens that had leaked into this struct
  // body were removed — they were not valid C++.)

  int width{0}; // width in pixels
  int height{0}; // height in pixels
  int minDimension{0}; // choose min dimension and rescale accordingly
  AVPixelFormat format{defaultVideoPixelFormat}; // output image pixel format
  int64_t startPts{0}, endPts{0}; // start and end presentation timestamps
  int timeBaseNum{0}; // time base numerator
  int timeBaseDen{1}; // time base denominator (1 so the ratio is never 0/0)
  float fps{0.0}; // frames per second (0 = auto detect)
  int64_t duration{0}; // duration of the stream, in stream time base units
};

struct AudioFormat {
  // Fields are zero-initialized for auto detection; the caller can set
  // some/all of them when a specific output is desirable.
  // (Stray scraped line-number tokens that had leaked into this struct
  // body were removed — they were not valid C++.)

  int samples{0}; // number of samples per second (frequency)
  int channels{0}; // number of channels
  AVSampleFormat format{defaultAudioSampleFormat}; // output sample format
  int64_t startPts{0}, endPts{0}; // start and end presentation timestamps
  int timeBaseNum{0}; // time base numerator
  int timeBaseDen{1}; // time base denominator (1 so the ratio is never 0/0)
  int64_t duration{0}; // duration of the stream, in stream time base units
};

// Storage for either a video or an audio format description. The union does
// not track which member is active — MediaFormat::type is the tag that
// records that. The user-provided default constructor is required because
// both members have default member initializers (which would otherwise
// delete the union's implicit default ctor); note it leaves the storage
// uninitialized.
union FormatUnion {
  FormatUnion() {}
  VideoFormat video;
  AudioFormat audio;
};

// Tagged union describing one stream's format: `type` selects which member
// of `format` is active.
struct MediaFormat {
  // Delegate to the MediaType ctor so `type` and `format` are always
  // initialized. (The previous default ctor left `type` indeterminate,
  // making any later read of the tag undefined behavior.)
  MediaFormat() : MediaFormat(MediaType::TYPE_VIDEO) {}

  MediaFormat(const MediaFormat& mediaFormat) : type(mediaFormat.type) {
    // Copy only the union member selected by the tag.
    if (type == MediaType::TYPE_VIDEO) {
      format.video = mediaFormat.format.video;
    } else if (type == MediaType::TYPE_AUDIO) {
      format.audio = mediaFormat.format.audio;
    }
  }

  // Intentionally implicit: callers rely on MediaType -> MediaFormat
  // conversion. Activates the matching union member with its defaults.
  MediaFormat(MediaType mediaType) : type(mediaType) {
    if (mediaType == MediaType::TYPE_VIDEO) {
      format.video = VideoFormat();
    } else if (mediaType == MediaType::TYPE_AUDIO) {
      format.audio = AudioFormat();
    }
  }
  // media type (tag for `format`)
  MediaType type;
  // format data; valid member is selected by `type`
  FormatUnion format;
};

// One decoded frame: an owned libav buffer plus its byte size and
// presentation timestamp.
class DecodedFrame {
 public:
  // `explicit` was meaningless on the no-arg ctor and its init list merely
  // duplicated the in-class member initializers; a defaulted ctor suffices.
  DecodedFrame() = default;
  explicit DecodedFrame(AvDataPtr frame, int frameSize, int64_t pts)
      : frame_(std::move(frame)), frameSize_(frameSize), pts_(pts) {}
  AvDataPtr frame_{nullptr}; // owned frame bytes, released via av_free
  int frameSize_{0}; // size of the frame_ buffer
  int64_t pts_{0}; // presentation timestamp
};

// Decoded frames of a single stream together with that stream's format.
struct MediaData {
  MediaData() = default;
  // Intentionally implicit: allows constructing directly from a FormatUnion.
  MediaData(FormatUnion format) : format_(format) {}
  FormatUnion format_; // format of the stream these frames belong to
  std::vector<std::unique_ptr<DecodedFrame>> frames_; // accumulated frames
};

// Collects decoder output, bucketed per media type (video/audio).
// Method bodies are defined out of line (in the corresponding .cpp file);
// the comments below are inferred from the names — confirm against the
// implementation.
class DecoderOutput {
 public:
  // Defaulted instead of user-provided empty bodies; `explicit` on a
  // no-argument constructor had no effect.
  DecoderOutput() = default;
  ~DecoderOutput() = default;

  // Presumably creates/initializes the media_data_ entry for `mediaType`
  // with the given format.
  void initMediaType(MediaType mediaType, FormatUnion format);

  // Presumably appends `frame` to the media_data_ entry for `mediaType`,
  // taking ownership of it.
  void addMediaFrame(MediaType mediaType, std::unique_ptr<DecodedFrame> frame);

  // Presumably discards all accumulated media data.
  void clear();

  std::unordered_map<MediaType, MediaData, EnumClassHash> media_data_;
};