Commit 1f5da520 authored by yangzhong

git init

A serene scene of a panda bear playing a guitar at sunset unfolds by a tranquil lake. The panda, with its black-and-white fur, strums the guitar while seated on a rock. Behind, a breathtaking mountain range glows under the orange and pink hues of the setting sun, contrasting beautifully with the lake's deep blue. The composition highlights the panda's relaxed interaction with the guitar, set against the stunning natural landscape, creating depth and peaceful harmony.
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
We provide two versions of the I2VGen-XL-based model: `heavy_deg.pt` for heavily degraded videos and `light_deg.pt` for lightly degraded videos (e.g., low-resolution videos downloaded from video websites).
You can put the weights into `pretrained_weight/`.
This repository provides a script to quickly download the model from Hugging Face; run the following command:
python /STAR/down_model.py
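If you prefer to fetch the weights manually, here is a minimal sketch using `huggingface_hub` (the repo ID below is a placeholder, not the actual location; check `down_model.py` for the real one):
```
# Minimal sketch of a Hugging Face download, assuming the two checkpoints
# named above; the repo ID is hypothetical and must be replaced.
from huggingface_hub import hf_hub_download

for filename in ["heavy_deg.pt", "light_deg.pt"]:
    path = hf_hub_download(
        repo_id="<org>/<model-repo>",   # hypothetical repo ID
        filename=filename,
        local_dir="pretrained_weight",  # target directory mentioned above
    )
    print(f"Downloaded {filename} to {path}")
```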
accelerate
av
torch==2.0.1
torchvision==0.15.2
torchaudio==2.0.2
opencv-python==4.10.0.84
easydict==1.13
einops==0.8.0
open-clip-torch==2.20.0
xformers==0.0.21
fairscale==0.4.13
torchsde==0.2.6
pytorch-lightning==2.0.1
diffusers==0.30.0
huggingface_hub==0.23.3
peft==0.5.0
gradio==4.41.0
numpy==1.24
Save the result files
## Generate Training Data
### Step 1: Create the environment
```
conda create -n make_data python=3.10
conda activate make_data
bash build.sh
```
### Step 2: Prepare CSV File
Create a CSV file listing the paths to ground-truth (GT) videos and their corresponding text descriptions. Use the following format:
```
path,text
/xxx/xxx/dog.mp4, A dog is sitting on the couch.
...
```
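If your captions already live in Python, here is a minimal sketch for writing this CSV with the standard `csv` module (the caption dict and output filename are illustrative):
```
# A minimal sketch for generating the CSV from a caption mapping;
# adapt the caption source to your own data.
import csv

captions = {"/xxx/xxx/dog.mp4": "A dog is sitting on the couch."}  # illustrative

with open("gt_videos.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["path", "text"])
    for path, text in captions.items():
        writer.writerow([path, text])
```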
### Step 3: Configure Paths
Open `make_paired_data.sh` and modify the following variables:
- `INPUT_CSV`: Path to your CSV file
- `SAVE_PATH`: Directory to save the generated paired data
### Step 4: Run the Script
```
bash make_paired_data.sh
```
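For reference, based on `make_paired_data.py`, the results land in three parallel subdirectories under `SAVE_PATH`, indexed per sample (clip extensions depend on `save_sample`; `.mp4` is assumed here):
```
SAVE_PATH/
├── gt/    # ground-truth clips: 0.mp4, 1.mp4, ...
├── lq/    # degraded counterparts, upsampled x4 back to GT size
└── text/  # matching captions: 0.txt, 1.txt, ...
```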
⚠️ **Notice:** The current version of `make_paired_data.sh` only supports `batch_size=1`.
To process data in parallel, you can split the CSV file into multiple parts and run the script separately on each part.
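A minimal splitting sketch, assuming the CSV format from Step 2 (round-robin split into `N` parts; note that `make_paired_data.sh` hardcodes `--master_port=29501`, so give each parallel run its own port):
```
# Split the input CSV into N parts for parallel runs; filenames are illustrative.
import csv

N = 4  # number of parts / parallel workers

with open("gt_videos.csv", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    header, rows = next(reader), list(reader)

for i in range(N):
    with open(f"gt_videos_part{i}.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows[i::N])  # round-robin assignment
```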
pip3 install --upgrade pip
pip3 install greenlet==1.1.3
pip3 install gevent==22.8.0
pip3 install torch
pip3 install torchvision
pip3 install ftfy
pip3 install numpy
pip3 install tqdm
pip3 install psutil
pip3 install pre-commit
pip3 install rich
pip3 install click
pip3 install fabric
pip3 install contexttimer
pip3 install safetensors
pip3 install einops
pip3 install pydantic
pip3 install ray
pip3 install protobuf
pip3 install gdown
pip3 install pyav
pip3 install tensorboard
pip3 install timm
pip3 install matplotlib
pip3 install accelerate
pip3 install diffusers
pip3 install transformers
pip3 install ipdb
pip3 install opencv-python
pip3 install webdataset
pip3 install gateloop_transformer
pip3 install kornia
pip3 install scipy
sudo apt-get install -y libgl1-mesa-dev
# install flash attention (optional)
# set enable_flashattn=False in config to avoid using flash attention
pip3 install packaging
pip3 install ninja
pip3 install flash-attn --no-build-isolation
# install apex (optional)
# set enable_layernorm_kernel=False in config to avoid using apex
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git
# install xformers
#pip install -U xformers --index-url https://download.pytorch.org/whl/cu121
# cp -r /mnt/bn/videodataset/VSR/data/compile/xformers-0.0.25.post1-cp39-cp39-manylinux2014_x86_64.whl .
# pip install xformers-0.0.25.post1-cp39-cp39-manylinux2014_x86_64.whl
# install this project
git clone https://github.com/hpcaitech/Open-Sora
cd Open-Sora
pip install -v .
pip uninstall colossalai -y
pip install colossalai==0.3.7
cd ..
# Define dataset
dataset = dict(
    type="VideoTextDataset",
    data_path=None,  # filled from --data-path at runtime
    num_frames=32,
    frame_interval=2,
    image_size=(720, 1280),
)

data_path = ''  # input CSV, set via --data-path in make_paired_data.sh
save_path = ''  # output directory, set via --save_path in make_paired_data.sh
dtype = "bf16"
num_workers = 2
batch_size = 1  # currently only batch_size=1 is supported
seed = 42
import os

import colossalai
import torch
import torch.distributed as dist
import torch.nn.functional as F
from colossalai.cluster import DistCoordinator
from einops import rearrange
from mmengine.runner import set_random_seed
from tqdm import tqdm

from opensora.acceleration.parallel_states import set_sequence_parallel_group
from opensora.datasets import save_sample, prepare_dataloader
from opensora.datasets.high_order.degrade_video import degradation_process
from opensora.registry import MODELS, SCHEDULERS, build_module, DATASETS
from opensora.utils.config_utils import parse_configs
from opensora.utils.misc import to_torch_dtype


def main():
    # ======================================================
    # 1. cfg and init distributed env
    # ======================================================
    cfg = parse_configs(training=False)
    print(cfg)

    # init distributed
    if os.environ.get("WORLD_SIZE", None):
        use_dist = True
        colossalai.launch_from_torch({})
        coordinator = DistCoordinator()
        if coordinator.world_size > 1:
            set_sequence_parallel_group(dist.group.WORLD)
            enable_sequence_parallelism = True
        else:
            enable_sequence_parallelism = False
    else:
        use_dist = False
        enable_sequence_parallelism = False

    # ======================================================
    # 2. runtime variables
    # ======================================================
    torch.set_grad_enabled(False)
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = to_torch_dtype(cfg.dtype)
    set_random_seed(seed=cfg.seed)

    # ======================================================
    # 3. build dataset & dataloader
    # ======================================================
    cfg.dataset['data_path'] = cfg.data_path
    dataset = build_module(cfg.dataset, DATASETS)
    dataloader_args = dict(
        dataset=dataset,
        batch_size=cfg.batch_size,
        num_workers=cfg.num_workers,
        seed=cfg.seed,
        shuffle=True,
        drop_last=True,
        pin_memory=False,
    )
    dataloader = prepare_dataloader(**dataloader_args)
    dataloader_iter = iter(dataloader)

    # ======================================================
    # 4. inference
    # ======================================================
    sample_idx = 0
    save_dir_gt = cfg.save_path + '/gt'
    save_dir_lq = cfg.save_path + '/lq'
    save_dir_txt = cfg.save_path + '/text'
    os.makedirs(save_dir_gt, exist_ok=True)
    os.makedirs(save_dir_lq, exist_ok=True)
    os.makedirs(save_dir_txt, exist_ok=True)

    # 4.1. batch generation with progress bar
    for _, batch in tqdm(enumerate(dataloader_iter), total=len(dataloader), desc="Processing 10K Batches"):
        x = batch.pop("video").to(device, dtype)  # [B, C, T, H, W], HR video
        fps = batch.pop('fps')

        # generate the LR video via the degradation pipeline
        lr, x = degradation_process(x)
        _, _, t, _, _ = lr.shape
        # upsample the LR clip back to the GT resolution (x4, bicubic) so the pair shares one size
        lr = rearrange(
            F.interpolate(rearrange(lr, "B C T H W -> (B T) C H W"), scale_factor=4, mode='bicubic'),
            "(B T) C H W -> B C T H W",
            T=t,
        )
        y = batch.pop("text")

        # 4.2. save samples
        if not use_dist or coordinator.is_master():
            for i in range(0, lr.shape[0]):
                save_dir_gt_ = os.path.join(save_dir_gt, f"{sample_idx}")
                save_dir_lq_ = os.path.join(save_dir_lq, f"{sample_idx}")
                save_dir_txt_ = os.path.join(save_dir_txt, f"{sample_idx}.txt")
                save_sample(x[i], fps=fps / cfg.dataset['frame_interval'], save_path=save_dir_gt_)
                save_sample(lr[i], fps=fps / cfg.dataset['frame_interval'], save_path=save_dir_lq_)
                with open(save_dir_txt_, 'w', encoding='utf-8') as file:
                    file.write(y[i])
                sample_idx += 1


if __name__ == "__main__":
    main()
#!/bin/bash

# Define environment variables
export CUDA_VISIBLE_DEVICES="0"
export TOKENIZERS_PARALLELISM=false

# Define paths
INPUT_CSV=""
SAVE_PATH=""

# Run the script on the full CSV file
torchrun --nnodes=1 --nproc_per_node=1 --master_port=29501 \
    make_paired_data.py \
    --config "./make_data_config.py" \
    --data-path "$INPUT_CSV" \
    --save_path "$SAVE_PATH"
from .acceleration import *
from .datasets import *
# from .models import *
from .registry import *