video.py 7.92 KB
Newer Older
1
import re
2
3
4
5
6
7
import gc
import torch
import numpy as np

try:
    import av
8
    av.logging.set_level(av.logging.ERROR)
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
except ImportError:
    av = None


def _check_av_available():
    if av is None:
        raise ImportError("""\
PyAV is not installed, and is necessary for the video operations in torchvision.
See https://github.com/mikeboers/PyAV#installation for instructions on how to
install PyAV on your system.
""")


# PyAV has some reference cycles
_CALLED_TIMES = 0
_GC_COLLECTION_INTERVAL = 20


27
def write_video(filename, video_array, fps, video_codec='libx264', options=None):
28
29
30
31
32
33
34
35
36
37
38
39
40
41
    """
    Writes a 4d tensor in [T, H, W, C] format in a video file

    Arguments:
        filename (str): path where the video will be saved
        video_array (Tensor[T, H, W, C]): tensor containing the individual frames,
            as a uint8 tensor in [T, H, W, C] format
        fps (Number): frames per second
    """
    _check_av_available()
    video_array = torch.as_tensor(video_array, dtype=torch.uint8).numpy()

    container = av.open(filename, mode='w')

42
    stream = container.add_stream(video_codec, rate=fps)
43
44
    stream.width = video_array.shape[2]
    stream.height = video_array.shape[1]
45
46
    stream.pix_fmt = 'yuv420p' if video_codec != 'libx264rgb' else 'rgb24'
    stream.options = options or {}
47
48
49

    for img in video_array:
        frame = av.VideoFrame.from_ndarray(img, format='rgb24')
50
        frame.pict_type = 'NONE'
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
        for packet in stream.encode(frame):
            container.mux(packet)

    # Flush stream
    for packet in stream.encode():
        container.mux(packet)

    # Close the file
    container.close()


def _read_from_stream(container, start_offset, end_offset, stream, stream_name):
    global _CALLED_TIMES, _GC_COLLECTION_INTERVAL
    _CALLED_TIMES += 1
    if _CALLED_TIMES % _GC_COLLECTION_INTERVAL == _GC_COLLECTION_INTERVAL - 1:
        gc.collect()

68
69
70
71
    frames = {}
    should_buffer = False
    max_buffer_size = 5
    if stream.type == "video":
72
        # DivX-style packed B-frames can have out-of-order pts (2 frames in a single pkt)
73
74
        # so need to buffer some extra frames to sort everything
        # properly
75
76
77
78
79
80
81
82
83
84
85
86
        extradata = stream.codec_context.extradata
        # overly complicated way of finding if `divx_packed` is set, following
        # https://github.com/FFmpeg/FFmpeg/commit/d5a21172283572af587b3d939eba0091484d3263
        if extradata and b"DivX" in extradata:
            # can't use regex directly because of some weird characters sometimes...
            pos = extradata.find(b"DivX")
            d = extradata[pos:]
            o = re.search(br"DivX(\d+)Build(\d+)(\w)", d)
            if o is None:
                o = re.search(br"DivX(\d+)b(\d+)(\w)", d)
            if o is not None:
                should_buffer = o.group(3) == b"p"
87
    seek_offset = start_offset
88
89
    # some files don't seek to the right location, so better be safe here
    seek_offset = max(seek_offset - 1, 0)
90
91
92
93
    if should_buffer:
        # FIXME this is kind of a hack, but we will jump to the previous keyframe
        # so this will be safe
        seek_offset = max(seek_offset - max_buffer_size, 0)
94
95
96
97
98
99
    try:
        # TODO check if stream needs to always be the video stream here or not
        container.seek(seek_offset, any_frame=False, backward=True, stream=stream)
    except av.AVError:
        print("Corrupted file?", container.name)
        return []
100
    buffer_count = 0
101
    for idx, frame in enumerate(container.decode(**stream_name)):
102
        frames[frame.pts] = frame
103
        if frame.pts >= end_offset:
104
105
106
            if should_buffer and buffer_count < max_buffer_size:
                buffer_count += 1
                continue
107
            break
108
109
110
111
112
113
114
115
116
    # ensure that the results are sorted wrt the pts
    result = [frames[i] for i in sorted(frames) if start_offset <= frames[i].pts <= end_offset]
    if start_offset > 0 and start_offset not in frames:
        # if there is no frame that exactly matches the pts of start_offset
        # add the last frame smaller than start_offset, to guarantee that
        # we will have all the necessary data. This is most useful for audio
        first_frame_pts = max(i for i in frames if i < start_offset)
        result.insert(0, frames[first_frame_pts])
    return result
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158


def _align_audio_frames(aframes, audio_frames, ref_start, ref_end):
    start, end = audio_frames[0].pts, audio_frames[-1].pts
    total_aframes = aframes.shape[1]
    step_per_aframe = (end - start + 1) / total_aframes
    s_idx = 0
    e_idx = total_aframes
    if start < ref_start:
        s_idx = int((ref_start - start) / step_per_aframe)
    if end > ref_end:
        e_idx = int((ref_end - end) / step_per_aframe)
    return aframes[:, s_idx:e_idx]


def read_video(filename, start_pts=0, end_pts=None):
    """
    Reads a video from a file, returning both the video frames as well as
    the audio frames

    Arguments:
        filename (str): path to the video file
        start_pts (int, optional): the start presentation time of the video
        end_pts (int, optional): the end presentation time

    Returns:
        vframes (Tensor[T, H, W, C]): the `T` video frames
        aframes (Tensor[K, L]): the audio frames, where `K` is the number of channels
            and `L` is the number of points
        info (Dict): metadata for the video and audio. Can contain the fields
            - video_fps (float)
            - audio_fps (int)
    """
    _check_av_available()

    if end_pts is None:
        end_pts = float("inf")

    if end_pts < start_pts:
        raise ValueError("end_pts should be larger than start_pts, got "
                         "start_pts={} and end_pts={}".format(start_pts, end_pts))

159
    container = av.open(filename, metadata_errors='ignore')
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
    info = {}

    video_frames = []
    if container.streams.video:
        video_frames = _read_from_stream(container, start_pts, end_pts,
                                         container.streams.video[0], {'video': 0})
        info["video_fps"] = float(container.streams.video[0].average_rate)
    audio_frames = []
    if container.streams.audio:
        audio_frames = _read_from_stream(container, start_pts, end_pts,
                                         container.streams.audio[0], {'audio': 0})
        info["audio_fps"] = container.streams.audio[0].rate

    container.close()

    vframes = [frame.to_rgb().to_ndarray() for frame in video_frames]
    aframes = [frame.to_ndarray() for frame in audio_frames]
    vframes = torch.as_tensor(np.stack(vframes))
    if aframes:
        aframes = np.concatenate(aframes, 1)
        aframes = torch.as_tensor(aframes)
        aframes = _align_audio_frames(aframes, audio_frames, start_pts, end_pts)
    else:
        aframes = torch.empty((1, 0), dtype=torch.float32)

    return vframes, aframes, info


188
189
190
191
192
193
194
195
196
def _can_read_timestamps_from_packets(container):
    extradata = container.streams[0].codec_context.extradata
    if extradata is None:
        return False
    if b"Lavc" in extradata:
        return True
    return False


197
198
199
200
201
202
203
204
205
206
207
208
def read_video_timestamps(filename):
    """
    List the video frames timestamps.

    Note that the function decodes the whole video frame-by-frame.

    Arguments:
        filename (str): path to the video file

    Returns:
        pts (List[int]): presentation timestamps for each one of the frames
            in the video.
209
        video_fps (int): the frame rate for the video
210
211
    """
    _check_av_available()
212
    container = av.open(filename, metadata_errors='ignore')
213
214

    video_frames = []
215
    video_fps = None
216
    if container.streams.video:
217
218
219
220
221
222
        if _can_read_timestamps_from_packets(container):
            # fast path
            video_frames = [x for x in container.demux(video=0) if x.pts is not None]
        else:
            video_frames = _read_from_stream(container, 0, float("inf"),
                                             container.streams.video[0], {'video': 0})
223
        video_fps = float(container.streams.video[0].average_rate)
224
    container.close()
225
    return [x.pts for x in video_frames], video_fps