av-asr: move video loading outside detector (#3498)

Summary: This PR moves video loading outside detector during pre-processing. Pull Request resolved: https://github.com/pytorch/audio/pull/3498 Reviewed By: mthrok Differential Revision: D47811044 Pulled By: mpc001 fbshipit-source-id: f17839b695b13d3cf2d9db343d7e9a0202eea7d5

av-asr: move video loading outside detector (#3498)
Summary: This PR moves video loading outside detector during pre-processing. Pull Request resolved: https://github.com/pytorch/audio/pull/3498 Reviewed By: mthrok Differential Revision: D47811044 Pulled By: mpc001 fbshipit-source-id: f17839b695b13d3cf2d9db343d7e9a0202eea7d5
c977afe0 · Pingchuan Ma · Facebook GitHub Bot · da212020 · c977afe0 · c977afe0
Commit c977afe0 authored Jul 26, 2023 by Pingchuan Ma Committed by Facebook GitHub Bot Jul 26, 2023
3 changed files
--- a/examples/avsr/data_prep/data/data_module.py
+++ b/examples/avsr/data_prep/data/data_module.py
@@ -32,8 +32,8 @@ class AVSRDataLoader:
            audio = self.audio_process(audio, sample_rate)
            return audio
        if self.modality == "video":
-            landmarks = self.landmarks_detector(data_filename)
            video = self.load_video(data_filename)
+            landmarks = self.landmarks_detector(video)
            video = self.video_process(video, landmarks)
            video = torch.tensor(video)
            return video

--- a/examples/avsr/data_prep/detectors/mediapipe/detector.py
+++ b/examples/avsr/data_prep/detectors/mediapipe/detector.py
@@ -9,7 +9,6 @@ import warnings
 import mediapipe as mp

 import numpy as np
-import torchvision

 warnings.filterwarnings("ignore")

@@ -29,8 +28,7 @@ class LandmarksDetector:
            assert any(l is not None for l in landmarks), "Cannot detect any frames in the video"
        return landmarks

-    def detect(self, filename, detector):
-        video_frames = torchvision.io.read_video(filename, pts_unit="sec")[0].numpy()
+    def detect(self, video_frames, detector):
        landmarks = []
        for frame in video_frames:
            results = detector.process(frame)

--- a/examples/avsr/data_prep/detectors/retinaface/detector.py
+++ b/examples/avsr/data_prep/detectors/retinaface/detector.py
@@ -7,7 +7,6 @@
 import warnings

 import numpy as np
-import torchvision
 from ibug.face_detection import RetinaFacePredictor

 warnings.filterwarnings("ignore")
@@ -19,8 +18,7 @@ class LandmarksDetector:
            device=device, threshold=0.8, model=RetinaFacePredictor.get_model(model_name)
        )

-    def __call__(self, filename):
-        video_frames = torchvision.io.read_video(filename, pts_unit="sec")[0].numpy()
+    def __call__(self, video_frames):
        landmarks = []
        for frame in video_frames:
            detected_faces = self.face_detector(frame, rgb=False)