Commit 78bae405 authored by mashun1

open_sora_inference

import base64
import csv
import os

import cv2
from PIL import Image

prompts = {
    "naive": "Describe the video",
    "three_frames": "A video is given by providing three frames in chronological order. Describe this video and its style to generate a description. Pay attention to all objects in the video. Do not describe each frame individually. Do not reply with words like 'first frame'. The description should be useful for AI to re-generate the video. The description should be less than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field. 3. Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.",
}
def get_filelist(file_path):
    Filelist = []
    VID_EXTENSIONS = ("mp4", "avi", "mov", "mkv")
    for home, dirs, files in os.walk(file_path):
        for filename in files:
            ext = filename.split(".")[-1].lower()
            if ext in VID_EXTENSIONS:
                # keep the full path so the video can be opened later
                Filelist.append(os.path.join(home, filename))
    return Filelist
def get_video_length(cap):
    return int(cap.get(cv2.CAP_PROP_FRAME_COUNT))


def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")
def extract_frames(video_path, points=(0.2, 0.5, 0.8), base_64=False):
    cap = cv2.VideoCapture(video_path)
    length = get_video_length(cap)
    frames = []
    if length < 3:
        cap.release()
        return frames, length
    points = [int(length * point) for point in points]
    for point in points:
        cap.set(cv2.CAP_PROP_POS_FRAMES, point)
        ret, frame = cap.read()
        if not ret:
            # skip frames that fail to decode
            continue
        if not base_64:
            # convert BGR to RGB and wrap in a PIL image
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = Image.fromarray(frame)
        else:
            # return the frame as a base64-encoded JPEG string
            _, buffer = cv2.imencode(".jpg", frame)
            frame = base64.b64encode(buffer).decode("utf-8")
        frames.append(frame)
    cap.release()
    return frames, length
def read_video_list(video_folder, output_file):
    processed_videos = []
    if os.path.exists(output_file):
        with open(output_file, "r") as f:
            reader = csv.reader(f)
            samples = list(reader)
            processed_videos = [sample[0] for sample in samples]
    # read video list, skipping videos that already have captions
    videos = get_filelist(video_folder)
    print(f"Dataset contains {len(videos)} videos.")
    videos = [video for video in videos if video not in processed_videos]
    print(f"Processing {len(videos)} new videos.")
    return videos
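

# Minimal usage sketch (illustrative only, not part of the original pipeline):
# sample three frames from each unprocessed video and pair them with the
# "three_frames" prompt for a vision-language captioning model.
# "./videos" and "captions.csv" are placeholder paths.
if __name__ == "__main__":
    for video_path in read_video_list("./videos", "captions.csv"):
        frames, length = extract_frames(video_path, base_64=True)
        if len(frames) < 3:
            continue
        # `frames` now holds three base64-encoded JPEGs in chronological order;
        # send them together with prompts["three_frames"] to a captioning model,
        # then append (video_path, caption, length) to the output CSV.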
# Dataset Download and Management
## Dataset Format
The training data should be provided in a CSV file with the following format:
```csv
/absolute/path/to/image1.jpg, caption1, num_of_frames
/absolute/path/to/image2.jpg, caption2, num_of_frames
```
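For reference, here is a minimal sketch of reading such a file back in Python (`dataset.csv` is a placeholder name; the `strip()` calls are only there to tolerate the optional spaces after the commas shown above):
```python
import csv

# Read (path, caption, num_of_frames) rows from a dataset CSV.
with open("dataset.csv", "r") as f:
    for row in csv.reader(f):
        path, caption, num_frames = (field.strip() for field in row)
        num_frames = int(num_frames)
```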
## HD-VG-130M
This dataset comprises 130M text-video pairs. You can download the dataset and prepare it for training according to [the dataset repository's instructions](https://github.com/daooshee/HD-VG-130M). There is a README.md file in the Google Drive link that provides instructions on how to download and cut the videos. For this version, we directly use the dataset provided by the authors.
## Demo Dataset
You can use ImageNet and UCF101 for a quick demo. After downloading the datasets, use the following commands to prepare the CSV files:
```bash
# ImageNet
python -m tools.datasets.convert_dataset imagenet IMAGENET_FOLDER --split train
# UCF101
python -m tools.datasets.convert_dataset ucf101 UCF101_FOLDER --split videos
```
## Manage datasets
We provide `csvutil.py` to manage the CSV files. You can use the following commands to process them:
```bash
# generate DATA_fmin_128_fmax_256.csv with frames between 128 and 256
python -m tools.datasets.csvutil DATA.csv --fmin 128 --fmax 256
# generate DATA_root.csv with absolute path
python -m tools.datasets.csvutil DATA.csv --root /absolute/path/to/dataset
# remove videos with no captions
python -m tools.datasets.csvutil DATA.csv --remove-empty-caption
# compute the number of frames for each video
python -m tools.datasets.csvutil DATA.csv --relength
# remove caption prefix
python -m tools.datasets.csvutil DATA.csv --remove-caption-prefix
```
To merge multiple CSV files, you can use the following command:
```bash
cat *.csv > combined.csv
```
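If the same clip may appear in more than one CSV file, a small Python script can merge them while dropping duplicate paths (a sketch; `combined.csv` is a placeholder name):
```python
import csv
import glob

# Merge all CSV files in the current directory, keeping the first
# occurrence of each video path (column 0).
seen, rows = set(), []
for name in sorted(glob.glob("*.csv")):
    if name == "combined.csv":  # skip the output of a previous run
        continue
    with open(name, "r") as f:
        for row in csv.reader(f):
            if row and row[0] not in seen:
                seen.add(row[0])
                rows.append(row)

with open("combined.csv", "w") as f:
    csv.writer(f).writerows(rows)
```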
import argparse
import csv
import os

from torchvision.datasets import ImageNet


def get_filelist(file_path):
    Filelist = []
    for home, dirs, files in os.walk(file_path):
        for filename in files:
            Filelist.append(os.path.join(home, filename))
    return Filelist
def split_by_capital(name):
    # BoxingPunchingBag -> Boxing Punching Bag
    new_name = ""
    for i in range(len(name)):
        if name[i].isupper() and i != 0:
            new_name += " "
        new_name += name[i]
    return new_name
def process_imagenet(root, split):
    root = os.path.expanduser(root)
    data = ImageNet(root, split=split)
    # pair each image path with the first class name of its label
    samples = [(path, data.classes[label][0]) for path, label in data.samples]
    output = f"imagenet_{split}.csv"
    with open(output, "w") as f:
        writer = csv.writer(f)
        writer.writerows(samples)
    print(f"Saved {len(samples)} samples to {output}.")


def process_ucf101(root, split):
    root = os.path.expanduser(root)
    video_lists = get_filelist(os.path.join(root, split))
    # the parent folder of each video is its class name, e.g. .../BoxingPunchingBag/v_xxx.avi
    classes = [x.split("/")[-2] for x in video_lists]
    classes = [split_by_capital(x) for x in classes]
    samples = list(zip(video_lists, classes))
    output = f"ucf101_{split}.csv"
    with open(output, "w") as f:
        writer = csv.writer(f)
        writer.writerows(samples)
    print(f"Saved {len(samples)} samples to {output}.")
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("dataset", type=str, choices=["imagenet", "ucf101"])
    parser.add_argument("root", type=str)
    parser.add_argument("--split", type=str, default="train")
    args = parser.parse_args()

    if args.dataset == "imagenet":
        process_imagenet(args.root, args.split)
    elif args.dataset == "ucf101":
        process_ucf101(args.root, args.split)
    else:
        raise ValueError("Invalid dataset")
import argparse
import csv
import os

from tqdm import tqdm

# each row in the CSV: path, caption, #frames
PREFIX = [
    "The video shows",
    "The video captures",
    "The video features",
    "The video depicts",
    "The video presents",
    "The video is ",
    "In the video,",
]


def get_video_length(path):
    import cv2  # imported lazily; only needed for --relength

    cap = cv2.VideoCapture(path)
    return int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
def main(args):
    input_path = args.input
    output_path = args.output
    if output_path is None:
        # derive the output name from the input name and the active options
        name = os.path.basename(input_path)
        name, ext = os.path.splitext(name)
        if args.fmin is not None:
            name += f"_fmin_{args.fmin}"
        if args.fmax is not None:
            name += f"_fmax_{args.fmax}"
        if args.remove_empty_caption:
            name += "_rec"
        if args.remove_caption_prefix:
            name += "_rcp"
        if args.root is not None:
            name += "_root"
        if args.relength:
            name += "_relength"
        output_path = os.path.join(os.path.dirname(input_path), name + ext)

    with open(input_path, "r") as f:
        reader = csv.reader(f)
        data = list(reader)
    print("Number of videos before filtering:", len(data))

    data_new = []
    for row in tqdm(data):
        path = row[0]
        caption = row[1]
        n_frames = int(row[2])
        if args.fmin is not None and n_frames < args.fmin:
            continue
        if args.fmax is not None and n_frames > args.fmax:
            continue
        if args.remove_empty_caption and len(caption) == 0:
            continue
        if args.remove_caption_prefix:
            for prefix in PREFIX:
                if caption.startswith(prefix):
                    caption = caption[len(prefix) :].strip()
                    if caption and caption[0].islower():
                        caption = caption[0].upper() + caption[1:]
                    row[1] = caption
                    break
        if args.root is not None:
            row[0] = os.path.join(args.root, path)
        if args.relength:
            n_frames = get_video_length(row[0])
            row[2] = n_frames
        data_new.append(row)

    print("Number of videos after filtering:", len(data_new))
    with open(output_path, "w") as f:
        writer = csv.writer(f)
        writer.writerows(data_new)
    print("Output saved to", output_path)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("input", type=str)
    parser.add_argument("--output", type=str, default=None)
    parser.add_argument("--fmin", type=int, default=None)
    parser.add_argument("--fmax", type=int, default=None)
    parser.add_argument("--root", type=str, default=None)
    parser.add_argument("--remove-empty-caption", action="store_true")
    parser.add_argument("--remove-caption-prefix", action="store_true")
    parser.add_argument("--relength", action="store_true")
    args = parser.parse_args()
    main(args)
# Scene Detection and Video Split
Raw videos collected from the Internet are often too long for training, so we detect scene changes and split each video into short clips along scene boundaries.
First, install the required video processing packages:
```bash
pip install scenedetect moviepy opencv-python
```
Then run `scene_detect.py`. We provide efficient processing using `multiprocessing`. Don't forget to specify your own dataset path.
import os
from multiprocessing import Pool

from mmengine.logging import MMLogger
from scenedetect import ContentDetector, detect
from tqdm import tqdm

from opensora.utils.misc import get_timestamp

from .utils import check_mp4_integrity, clone_folder_structure, iterate_files, split_video

# config
target_fps = 30  # int
shorter_size = 512  # int
min_seconds = 1  # float
max_seconds = 5  # float
assert max_seconds > min_seconds

cfg = dict(
    target_fps=target_fps,
    min_seconds=min_seconds,
    max_seconds=max_seconds,
    shorter_size=shorter_size,
)
def process_folder(root_src, root_dst):
    # create logger
    folder_path_log = os.path.dirname(root_dst)
    log_name = os.path.basename(root_dst)
    timestamp = get_timestamp()
    log_path = os.path.join(folder_path_log, f"{log_name}_{timestamp}.log")
    logger = MMLogger.get_instance(log_name, log_file=log_path)

    # clone folder structure
    clone_folder_structure(root_src, root_dst)

    # all source videos
    mp4_list = sorted(x for x in iterate_files(root_src) if x.endswith(".mp4"))

    for sample_path in tqdm(mp4_list):
        folder_src = os.path.dirname(sample_path)
        folder_dst = os.path.join(root_dst, os.path.relpath(folder_src, root_src))

        # check src video integrity
        if not check_mp4_integrity(sample_path, logger=logger):
            continue

        # detect scenes
        scene_list = detect(sample_path, ContentDetector(), start_in_scene=True)

        # split scenes
        save_path_list = split_video(sample_path, scene_list, save_dir=folder_dst, **cfg, logger=logger)

        # check integrity of generated clips
        for x in save_path_list:
            check_mp4_integrity(x, logger=logger)
def scene_detect():
    """Detect & cut scenes using a single process.

    Expected dataset structure:
        data/
            your_dataset/
                raw_videos/
                    xxx.mp4
                    yyy.mp4

    This function results in:
        data/
            your_dataset/
                raw_videos/
                    xxx.mp4
                    yyy.mp4
                clips/
                    xxx_scene-0.mp4
                    yyy_scene-0.mp4
                    yyy_scene-1.mp4
    """
    # TODO: specify your dataset root
    root_src = "./data/your_dataset/raw_videos"
    root_dst = "./data/your_dataset/clips"
    process_folder(root_src, root_dst)
def scene_detect_mp():
    """Detect & cut scenes using multiple processes, one per split.

    Expected dataset structure:
        data/
            your_dataset/
                raw_videos/
                    split_0/
                        xxx.mp4
                        yyy.mp4
                    split_1/
                        xxx.mp4
                        yyy.mp4

    This function results in:
        data/
            your_dataset/
                raw_videos/
                    split_0/
                        xxx.mp4
                        yyy.mp4
                    split_1/
                        xxx.mp4
                        yyy.mp4
                clips/
                    split_0/
                        xxx_scene-0.mp4
                        yyy_scene-0.mp4
                    split_1/
                        xxx_scene-0.mp4
                        yyy_scene-0.mp4
                        yyy_scene-1.mp4
    """
    # TODO: specify your dataset root
    root_src = "./data/your_dataset/raw_videos"
    root_dst = "./data/your_dataset/clips"
    # TODO: specify your splits
    splits = ["split_0", "split_1"]

    # process one folder per split in parallel
    root_src_list = [os.path.join(root_src, x) for x in splits]
    root_dst_list = [os.path.join(root_dst, x) for x in splits]
    with Pool(processes=len(splits)) as pool:
        pool.starmap(process_folder, list(zip(root_src_list, root_dst_list)))


if __name__ == "__main__":
    # TODO: choose single process or multiprocessing
    scene_detect()
    # scene_detect_mp()
import os
import subprocess

import cv2
from imageio_ffmpeg import get_ffmpeg_exe
from mmengine.logging import print_log
from moviepy.editor import VideoFileClip
from scenedetect import FrameTimecode


def iterate_files(folder_path):
    # os.walk already descends into subdirectories, so a single loop
    # over its output visits every file in the tree
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            yield os.path.join(root, file)


def iterate_folders(folder_path):
    # likewise, os.walk visits every subdirectory in the tree
    for root, dirs, files in os.walk(folder_path):
        for subdir in dirs:
            yield os.path.join(root, subdir)
def clone_folder_structure(root_src, root_dst, verbose=False):
    src_path_list = iterate_folders(root_src)
    src_relpath_list = [os.path.relpath(x, root_src) for x in src_path_list]
    os.makedirs(root_dst, exist_ok=True)
    dst_path_list = [os.path.join(root_dst, x) for x in src_relpath_list]
    for folder_path in dst_path_list:
        os.makedirs(folder_path, exist_ok=True)
        if verbose:
            print(f"Create folder: '{folder_path}'")


def count_files(root, suffix=".mp4"):
    files_list = iterate_files(root)
    cnt = len([x for x in files_list if x.endswith(suffix)])
    return cnt
def check_mp4_integrity(file_path, verbose=True, logger=None):
    try:
        clip = VideoFileClip(file_path)
        clip.close()
        if verbose:
            print_log(f"The MP4 file '{file_path}' is intact.", logger=logger)
        return True
    except Exception as e:
        if verbose:
            print_log(f"Error: {e}", logger=logger)
            print_log(f"The MP4 file '{file_path}' is not intact.", logger=logger)
        return False


def count_frames(video_path):
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Could not open video file '{video_path}'")
        return
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    print(f"Total frames in the video '{video_path}': {total_frames}")
    cap.release()
def split_video(
    sample_path,
    scene_list,
    save_dir,
    target_fps=30,
    min_seconds=1,
    max_seconds=10,
    shorter_size=512,
    verbose=False,
    logger=None,
):
    FFMPEG_PATH = get_ffmpeg_exe()

    save_path_list = []
    for idx, scene in enumerate(scene_list):
        s, t = scene  # FrameTimecode
        fps = s.framerate

        # cap each clip at max_seconds
        max_duration = FrameTimecode(timecode="00:00:00", fps=fps)
        max_duration.frame_num = round(fps * max_seconds)
        duration = min(max_duration, t - s)

        # skip clips shorter than min_seconds
        if duration.get_frames() < round(min_seconds * fps):
            continue

        # save path
        fname = os.path.basename(sample_path)
        fname_wo_ext = os.path.splitext(fname)[0]
        # TODO: fname pattern
        save_path = os.path.join(save_dir, f"{fname_wo_ext}_scene-{idx}.mp4")

        # build the ffmpeg command
        cmd = [FFMPEG_PATH]
        # input path
        cmd += ["-i", sample_path]
        # clip to cut: start time and duration in seconds
        cmd += ["-nostdin", "-y", "-ss", str(s.get_seconds()), "-t", str(duration.get_seconds())]
        # target fps
        cmd += ["-r", f"{target_fps}"]
        # scale so the shorter side equals shorter_size, keeping the aspect ratio
        cmd += ["-vf", f"scale='if(gt(iw,ih),-2,{shorter_size})':'if(gt(iw,ih),{shorter_size},-2)'"]
        cmd += ["-map", "0", save_path]

        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        stdout, stderr = proc.communicate()
        if verbose:
            stdout = stdout.decode("utf-8")
            print_log(stdout, logger=logger)

        save_path_list.append(save_path)
        print_log(f"Video clip saved to '{save_path}'", logger=logger)

    return save_path_list