Commit 78bae405 authored by mashun1

open_sora_inference

import base64
import csv
import os

import cv2
from PIL import Image

prompts = {
    "naive": "Describe the video",
    "three_frames": "A video is given by providing three frames in chronological order. Describe this video and its style to generate a description. Pay attention to all objects in the video. Do not describe each frame individually. Do not reply with words like 'first frame'. The description should be useful for AI to re-generate the video. The description should be less than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field. 3. Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.",
}
def get_filelist(file_path):
    Filelist = []
    VID_EXTENSIONS = ("mp4", "avi", "mov", "mkv")
    for home, dirs, files in os.walk(file_path):
        for filename in files:
            ext = filename.split(".")[-1].lower()
            if ext in VID_EXTENSIONS:
                # keep the full path so the video can be opened later
                Filelist.append(os.path.join(home, filename))
    return Filelist
def get_video_length(cap):
    return int(cap.get(cv2.CAP_PROP_FRAME_COUNT))


def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")
def extract_frames(video_path, points=(0.2, 0.5, 0.8), base_64=False):
    cap = cv2.VideoCapture(video_path)
    length = get_video_length(cap)
    frames = []
    if length < 3:
        cap.release()
        return frames, length
    points = [int(length * point) for point in points]
    for point in points:
        cap.set(cv2.CAP_PROP_POS_FRAMES, point)
        ret, frame = cap.read()
        if not ret:
            # skip frames that fail to decode
            continue
        if not base_64:
            # convert BGR to RGB and wrap in a PIL image
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = Image.fromarray(frame)
        else:
            # return the frame as a base64-encoded JPEG string
            _, buffer = cv2.imencode(".jpg", frame)
            frame = base64.b64encode(buffer).decode("utf-8")
        frames.append(frame)
    cap.release()
    return frames, length
def read_video_list(video_folder, output_file):
    processed_videos = []
    if os.path.exists(output_file):
        with open(output_file, "r") as f:
            reader = csv.reader(f)
            samples = list(reader)
            processed_videos = [sample[0] for sample in samples]
    # read video list, skipping videos that already have captions
    videos = get_filelist(video_folder)
    print(f"Dataset contains {len(videos)} videos.")
    videos = [video for video in videos if video not in processed_videos]
    print(f"Processing {len(videos)} new videos.")
    return videos
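

# Minimal usage sketch (illustrative only, not part of the original pipeline):
# sample three frames from each unprocessed video and pair them with the
# "three_frames" prompt for a vision-language captioning model.
# "./videos" and "captions.csv" are placeholder paths.
if __name__ == "__main__":
    for video_path in read_video_list("./videos", "captions.csv"):
        frames, length = extract_frames(video_path, base_64=True)
        if len(frames) < 3:
            continue
        # `frames` now holds three base64-encoded JPEGs in chronological order;
        # send them together with prompts["three_frames"] to a captioning model,
        # then append (video_path, caption, length) to the output CSV.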
# Dataset Download and Management
## Dataset Format
The training data should be provided in a CSV file with the following format:
```csv
/absolute/path/to/image1.jpg, caption1, num_of_frames
/absolute/path/to/image2.jpg, caption2, num_of_frames
```
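For reference, here is a minimal sketch of reading such a file back in Python (`dataset.csv` is a placeholder name; the `strip()` calls are only there to tolerate the optional spaces after the commas shown above):
```python
import csv

# Read (path, caption, num_of_frames) rows from a dataset CSV.
with open("dataset.csv", "r") as f:
    for row in csv.reader(f):
        path, caption, num_frames = (field.strip() for field in row)
        num_frames = int(num_frames)
```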
## HD-VG-130M
This dataset comprises 130M text-video pairs. You can download the dataset and prepare it for training according to [the dataset repository's instructions](https://github.com/daooshee/HD-VG-130M). There is a README.md file in the Google Drive link that provides instructions on how to download and cut the videos. For this version, we directly use the dataset provided by the authors.
## Demo Dataset
You can use ImageNet and UCF101 for a quick demo. After downloading the datasets, use the following commands to prepare the CSV files:
```bash
# ImageNet
python -m tools.datasets.convert_dataset imagenet IMAGENET_FOLDER --split train
# UCF101
python -m tools.datasets.convert_dataset ucf101 UCF101_FOLDER --split videos
```
## Manage datasets
We provide `csvutil.py` to manage the CSV files. You can use the following commands to process them:
```bash
# generate DATA_fmin_128_fmax_256.csv with frames between 128 and 256
python -m tools.datasets.csvutil DATA.csv --fmin 128 --fmax 256
# generate DATA_root.csv with absolute path
python -m tools.datasets.csvutil DATA.csv --root /absolute/path/to/dataset
# remove videos with no captions
python -m tools.datasets.csvutil DATA.csv --remove-empty-caption
# compute the number of frames for each video
python -m tools.datasets.csvutil DATA.csv --relength
# remove caption prefix
python -m tools.datasets.csvutil DATA.csv --remove-caption-prefix
```
To merge multiple CSV files, you can use the following command:
```bash
cat *.csv > combined.csv
```
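If the same clip may appear in more than one CSV file, a small Python script can merge them while dropping duplicate paths (a sketch; `combined.csv` is a placeholder name):
```python
import csv
import glob

# Merge all CSV files in the current directory, keeping the first
# occurrence of each video path (column 0).
seen, rows = set(), []
for name in sorted(glob.glob("*.csv")):
    if name == "combined.csv":  # skip the output of a previous run
        continue
    with open(name, "r") as f:
        for row in csv.reader(f):
            if row and row[0] not in seen:
                seen.add(row[0])
                rows.append(row)

with open("combined.csv", "w") as f:
    csv.writer(f).writerows(rows)
```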
import argparse
import csv
import os

from torchvision.datasets import ImageNet


def get_filelist(file_path):
    Filelist = []
    for home, dirs, files in os.walk(file_path):
        for filename in files:
            Filelist.append(os.path.join(home, filename))
    return Filelist
def split_by_capital(name):
    # BoxingPunchingBag -> Boxing Punching Bag
    new_name = ""
    for i in range(len(name)):
        if name[i].isupper() and i != 0:
            new_name += " "
        new_name += name[i]
    return new_name
def process_imagenet(root, split):
    root = os.path.expanduser(root)
    data = ImageNet(root, split=split)
    # pair each image path with the first class name of its label
    samples = [(path, data.classes[label][0]) for path, label in data.samples]
    output = f"imagenet_{split}.csv"
    with open(output, "w") as f:
        writer = csv.writer(f)
        writer.writerows(samples)
    print(f"Saved {len(samples)} samples to {output}.")


def process_ucf101(root, split):
    root = os.path.expanduser(root)
    video_lists = get_filelist(os.path.join(root, split))
    # the parent folder of each video is its class name, e.g. .../BoxingPunchingBag/v_xxx.avi
    classes = [x.split("/")[-2] for x in video_lists]
    classes = [split_by_capital(x) for x in classes]
    samples = list(zip(video_lists, classes))
    output = f"ucf101_{split}.csv"
    with open(output, "w") as f:
        writer = csv.writer(f)
        writer.writerows(samples)
    print(f"Saved {len(samples)} samples to {output}.")
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("dataset", type=str, choices=["imagenet", "ucf101"])
    parser.add_argument("root", type=str)
    parser.add_argument("--split", type=str, default="train")
    args = parser.parse_args()

    if args.dataset == "imagenet":
        process_imagenet(args.root, args.split)
    elif args.dataset == "ucf101":
        process_ucf101(args.root, args.split)
    else:
        raise ValueError("Invalid dataset")
import argparse
import csv
import os

from tqdm import tqdm

# each row in the CSV: path, caption, #frames
PREFIX = [
    "The video shows",
    "The video captures",
    "The video features",
    "The video depicts",
    "The video presents",
    "The video is ",
    "In the video,",
]


def get_video_length(path):
    import cv2  # imported lazily; only needed for --relength

    cap = cv2.VideoCapture(path)
    return int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
def main(args):
    input_path = args.input
    output_path = args.output
    if output_path is None:
        # derive the output name from the input name and the active options
        name = os.path.basename(input_path)
        name, ext = os.path.splitext(name)
        if args.fmin is not None:
            name += f"_fmin_{args.fmin}"
        if args.fmax is not None:
            name += f"_fmax_{args.fmax}"
        if args.remove_empty_caption:
            name += "_rec"
        if args.remove_caption_prefix:
            name += "_rcp"
        if args.root is not None:
            name += "_root"
        if args.relength:
            name += "_relength"
        output_path = os.path.join(os.path.dirname(input_path), name + ext)

    with open(input_path, "r") as f:
        reader = csv.reader(f)
        data = list(reader)
    print("Number of videos before filtering:", len(data))

    data_new = []
    for row in tqdm(data):
        path = row[0]
        caption = row[1]
        n_frames = int(row[2])
        if args.fmin is not None and n_frames < args.fmin:
            continue
        if args.fmax is not None and n_frames > args.fmax:
            continue
        if args.remove_empty_caption and len(caption) == 0:
            continue
        if args.remove_caption_prefix:
            for prefix in PREFIX:
                if caption.startswith(prefix):
                    caption = caption[len(prefix) :].strip()
                    if caption and caption[0].islower():
                        caption = caption[0].upper() + caption[1:]
                    row[1] = caption
                    break
        if args.root is not None:
            row[0] = os.path.join(args.root, path)
        if args.relength:
            n_frames = get_video_length(row[0])
            row[2] = n_frames
        data_new.append(row)

    print("Number of videos after filtering:", len(data_new))
    with open(output_path, "w") as f:
        writer = csv.writer(f)
        writer.writerows(data_new)
    print("Output saved to", output_path)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("input", type=str)
    parser.add_argument("--output", type=str, default=None)
    parser.add_argument("--fmin", type=int, default=None)
    parser.add_argument("--fmax", type=int, default=None)
    parser.add_argument("--root", type=str, default=None)
    parser.add_argument("--remove-empty-caption", action="store_true")
    parser.add_argument("--remove-caption-prefix", action="store_true")
    parser.add_argument("--relength", action="store_true")
    args = parser.parse_args()
    main(args)
# Scene Detection and Video Split
Raw videos collected from the Internet are often too long for training, so we detect scene changes and split each video into short clips along scene boundaries.
First, install the required video processing packages:
```bash
pip install scenedetect moviepy opencv-python
```
Then run `scene_detect.py`. We provide efficient processing using `multiprocessing`. Don't forget to specify your own dataset path.
import os
from multiprocessing import Pool

from mmengine.logging import MMLogger
from scenedetect import ContentDetector, detect
from tqdm import tqdm

from opensora.utils.misc import get_timestamp

from .utils import check_mp4_integrity, clone_folder_structure, iterate_files, split_video

# config
target_fps = 30  # int
shorter_size = 512  # int
min_seconds = 1  # float
max_seconds = 5  # float
assert max_seconds > min_seconds

cfg = dict(
    target_fps=target_fps,
    min_seconds=min_seconds,
    max_seconds=max_seconds,
    shorter_size=shorter_size,
)
def process_folder(root_src, root_dst):
    # create logger
    folder_path_log = os.path.dirname(root_dst)
    log_name = os.path.basename(root_dst)
    timestamp = get_timestamp()
    log_path = os.path.join(folder_path_log, f"{log_name}_{timestamp}.log")
    logger = MMLogger.get_instance(log_name, log_file=log_path)

    # clone folder structure
    clone_folder_structure(root_src, root_dst)

    # all source videos
    mp4_list = sorted(x for x in iterate_files(root_src) if x.endswith(".mp4"))

    for sample_path in tqdm(mp4_list):
        folder_src = os.path.dirname(sample_path)
        folder_dst = os.path.join(root_dst, os.path.relpath(folder_src, root_src))

        # check src video integrity
        if not check_mp4_integrity(sample_path, logger=logger):
            continue

        # detect scenes
        scene_list = detect(sample_path, ContentDetector(), start_in_scene=True)

        # split scenes
        save_path_list = split_video(sample_path, scene_list, save_dir=folder_dst, **cfg, logger=logger)

        # check integrity of generated clips
        for x in save_path_list:
            check_mp4_integrity(x, logger=logger)
def scene_detect():
    """Detect & cut scenes using a single process.

    Expected dataset structure:
        data/
            your_dataset/
                raw_videos/
                    xxx.mp4
                    yyy.mp4

    This function results in:
        data/
            your_dataset/
                raw_videos/
                    xxx.mp4
                    yyy.mp4
                clips/
                    xxx_scene-0.mp4
                    yyy_scene-0.mp4
                    yyy_scene-1.mp4
    """
    # TODO: specify your dataset root
    root_src = "./data/your_dataset/raw_videos"
    root_dst = "./data/your_dataset/clips"
    process_folder(root_src, root_dst)
def scene_detect_mp():
    """Detect & cut scenes using multiple processes, one per split.

    Expected dataset structure:
        data/
            your_dataset/
                raw_videos/
                    split_0/
                        xxx.mp4
                        yyy.mp4
                    split_1/
                        xxx.mp4
                        yyy.mp4

    This function results in:
        data/
            your_dataset/
                raw_videos/
                    split_0/
                        xxx.mp4
                        yyy.mp4
                    split_1/
                        xxx.mp4
                        yyy.mp4
                clips/
                    split_0/
                        xxx_scene-0.mp4
                        yyy_scene-0.mp4
                    split_1/
                        xxx_scene-0.mp4
                        yyy_scene-0.mp4
                        yyy_scene-1.mp4
    """
    # TODO: specify your dataset root
    root_src = "./data/your_dataset/raw_videos"
    root_dst = "./data/your_dataset/clips"
    # TODO: specify your splits
    splits = ["split_0", "split_1"]

    # process one folder per split in parallel
    root_src_list = [os.path.join(root_src, x) for x in splits]
    root_dst_list = [os.path.join(root_dst, x) for x in splits]
    with Pool(processes=len(splits)) as pool:
        pool.starmap(process_folder, list(zip(root_src_list, root_dst_list)))


if __name__ == "__main__":
    # TODO: choose single process or multiprocessing
    scene_detect()
    # scene_detect_mp()
import os
import subprocess

import cv2
from imageio_ffmpeg import get_ffmpeg_exe
from mmengine.logging import print_log
from moviepy.editor import VideoFileClip
from scenedetect import FrameTimecode


def iterate_files(folder_path):
    # os.walk already descends into subdirectories, so a single loop
    # over its output visits every file in the tree
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            yield os.path.join(root, file)


def iterate_folders(folder_path):
    # likewise, os.walk visits every subdirectory in the tree
    for root, dirs, files in os.walk(folder_path):
        for subdir in dirs:
            yield os.path.join(root, subdir)
def clone_folder_structure(root_src, root_dst, verbose=False):
    src_path_list = iterate_folders(root_src)
    src_relpath_list = [os.path.relpath(x, root_src) for x in src_path_list]
    os.makedirs(root_dst, exist_ok=True)
    dst_path_list = [os.path.join(root_dst, x) for x in src_relpath_list]
    for folder_path in dst_path_list:
        os.makedirs(folder_path, exist_ok=True)
        if verbose:
            print(f"Create folder: '{folder_path}'")


def count_files(root, suffix=".mp4"):
    files_list = iterate_files(root)
    cnt = len([x for x in files_list if x.endswith(suffix)])
    return cnt
def check_mp4_integrity(file_path, verbose=True, logger=None):
    try:
        clip = VideoFileClip(file_path)
        clip.close()
        if verbose:
            print_log(f"The MP4 file '{file_path}' is intact.", logger=logger)
        return True
    except Exception as e:
        if verbose:
            print_log(f"Error: {e}", logger=logger)
            print_log(f"The MP4 file '{file_path}' is not intact.", logger=logger)
        return False


def count_frames(video_path):
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Could not open video file '{video_path}'")
        return
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    print(f"Total frames in the video '{video_path}': {total_frames}")
    cap.release()
def split_video(
    sample_path,
    scene_list,
    save_dir,
    target_fps=30,
    min_seconds=1,
    max_seconds=10,
    shorter_size=512,
    verbose=False,
    logger=None,
):
    FFMPEG_PATH = get_ffmpeg_exe()

    save_path_list = []
    for idx, scene in enumerate(scene_list):
        s, t = scene  # FrameTimecode
        fps = s.framerate

        # cap each clip at max_seconds
        max_duration = FrameTimecode(timecode="00:00:00", fps=fps)
        max_duration.frame_num = round(fps * max_seconds)
        duration = min(max_duration, t - s)

        # skip clips shorter than min_seconds
        if duration.get_frames() < round(min_seconds * fps):
            continue

        # save path
        fname = os.path.basename(sample_path)
        fname_wo_ext = os.path.splitext(fname)[0]
        # TODO: fname pattern
        save_path = os.path.join(save_dir, f"{fname_wo_ext}_scene-{idx}.mp4")

        # build the ffmpeg command
        cmd = [FFMPEG_PATH]
        # input path
        cmd += ["-i", sample_path]
        # clip to cut: start time and duration in seconds
        cmd += ["-nostdin", "-y", "-ss", str(s.get_seconds()), "-t", str(duration.get_seconds())]
        # target fps
        cmd += ["-r", f"{target_fps}"]
        # scale so the shorter side equals shorter_size, keeping the aspect ratio
        cmd += ["-vf", f"scale='if(gt(iw,ih),-2,{shorter_size})':'if(gt(iw,ih),{shorter_size},-2)'"]
        cmd += ["-map", "0", save_path]

        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        stdout, stderr = proc.communicate()
        if verbose:
            stdout = stdout.decode("utf-8")
            print_log(stdout, logger=logger)

        save_path_list.append(save_path)
        print_log(f"Video clip saved to '{save_path}'", logger=logger)

    return save_path_list