Commit 78bae405 authored by mashun1

open_sora_inference

num_frames = 64
frame_interval = 2
image_size = (512, 512)
# Define dataset
root = None
data_path = "CSV_PATH"
use_image_transform = False
num_workers = 4
# Define acceleration
dtype = "bf16"
grad_checkpoint = True
plugin = "zero2-seq"
sp_size = 2
# Define model
model = dict(
    type="STDiT-XL/2",
    space_scale=1.0,
    time_scale=2 / 3,
    from_pretrained=None,
    enable_flashattn=True,
    enable_layernorm_kernel=True,
    enable_sequence_parallelism=True,  # enable sequence parallelism here
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
    shardformer=True,
)
scheduler = dict(
    type="iddpm",
    timestep_respacing="",
)
# Others
seed = 42
outputs = "outputs"
wandb = False
epochs = 1000
log_every = 10
ckpt_every = 1000
load = None
batch_size = 1
lr = 2e-5
grad_clip = 1.0
num_frames = 64
frame_interval = 2
image_size = (512, 512)
# Define dataset
root = None
data_path = "CSV_PATH"
use_image_transform = False
num_workers = 4
# Define acceleration
dtype = "bf16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1
# Define model
model = dict(
    type="STDiT-XL/2",
    space_scale=1.0,
    time_scale=2 / 3,
    from_pretrained=None,
    enable_flashattn=True,
    enable_layernorm_kernel=True,
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
    micro_batch_size=64,
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
    shardformer=True,
)
scheduler = dict(
    type="iddpm",
    timestep_respacing="",
)
# Others
seed = 42
outputs = "outputs"
wandb = False
epochs = 1000
log_every = 10
ckpt_every = 250
load = None
batch_size = 4
lr = 2e-5
grad_clip = 1.0
num_frames = 16
fps = 8
image_size = (256, 256)
# Define model
model = dict(
    type="PixArt-XL/2",
    space_scale=0.5,
    time_scale=1.0,
    from_pretrained="outputs/098-F16S3-PixArt-XL-2/epoch7-global_step30000/model_ckpt.pt",
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
)
scheduler = dict(
    type="dpm-solver",
    num_sampling_steps=20,
    cfg_scale=7.0,
)
dtype = "fp16"
# Others
batch_size = 2
seed = 42
prompt_path = "./assets/texts/t2v_samples.txt"
save_dir = "./outputs/samples/"
num_frames = 1
fps = 1
image_size = (1920, 512)
multi_resolution = True
# Define model
model = dict(
    type="PixArtMS-XL/2",
    space_scale=2.0,
    time_scale=1.0,
    no_temporal_pos_emb=True,
    from_pretrained="PixArt-XL-2-1024-MS.pth",
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
)
scheduler = dict(
    type="dpm-solver",
    num_sampling_steps=20,
    cfg_scale=7.0,
)
dtype = "fp16"
# Others
batch_size = 2
seed = 42
prompt_path = "./assets/texts/t2i_samples.txt"
save_dir = "./outputs/samples/"
num_frames = 1
fps = 1
image_size = (256, 256)
# Define model
model = dict(
    type="PixArt-XL/2",
    space_scale=1.0,
    time_scale=1.0,
    no_temporal_pos_emb=True,
    from_pretrained="PixArt-XL-2-256x256.pth",
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
)
scheduler = dict(
    type="dpm-solver",
    num_sampling_steps=20,
    cfg_scale=7.0,
)
dtype = "fp16"
# Others
batch_size = 2
seed = 42
prompt_path = "./assets/texts/t2i_samples.txt"
save_dir = "./outputs/samples/"
num_frames = 1
fps = 1
image_size = (512, 512)
# Define model
model = dict(
    type="PixArt-XL/2",
    space_scale=1.0,
    time_scale=1.0,
    no_temporal_pos_emb=True,
    from_pretrained="PixArt-XL-2-512x512.pth",
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
)
scheduler = dict(
    type="dpm-solver",
    num_sampling_steps=20,
    cfg_scale=7.0,
)
dtype = "fp16"
# Others
batch_size = 2
seed = 42
prompt_path = "./assets/texts/t2i_samples.txt"
save_dir = "./outputs/samples/"
num_frames = 16
frame_interval = 3
image_size = (256, 256)
# Define dataset
root = None
data_path = "CSV_PATH"
use_image_transform = False
num_workers = 4
# Define acceleration
dtype = "bf16"
grad_checkpoint = False
plugin = "zero2"
sp_size = 1
# Define model
model = dict(
    type="PixArt-XL/2",
    space_scale=0.5,
    time_scale=1.0,
    from_pretrained="PixArt-XL-2-512x512.pth",
    enable_flashattn=True,
    enable_layernorm_kernel=True,
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
    shardformer=True,
)
scheduler = dict(
    type="iddpm",
    timestep_respacing="",
)
# Others
seed = 42
outputs = "outputs"
wandb = False
epochs = 1000
log_every = 10
ckpt_every = 1000
load = None
batch_size = 8
lr = 2e-5
grad_clip = 1.0
num_frames = 1
frame_interval = 1
image_size = (512, 512)
# Define dataset
root = None
data_path = "CSV_PATH"
use_image_transform = True
num_workers = 4
# Define acceleration
dtype = "bf16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1
# Define model
model = dict(
    type="PixArt-XL/2",
    space_scale=1.0,
    time_scale=1.0,
    no_temporal_pos_emb=True,
    from_pretrained="PixArt-XL-2-512x512.pth",
    enable_flashattn=True,
    enable_layernorm_kernel=True,
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
    shardformer=True,
)
scheduler = dict(
    type="iddpm",
    timestep_respacing="",
)
# Others
seed = 42
outputs = "outputs"
wandb = False
epochs = 1000
log_every = 10
ckpt_every = 1000
load = None
batch_size = 32
lr = 2e-5
grad_clip = 1.0
num_frames = 64
frame_interval = 2
image_size = (512, 512)
# Define dataset
root = None
data_path = "CSV_PATH"
use_image_transform = False
num_workers = 4
# Define acceleration
dtype = "bf16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1
# Define model
model = dict(
    type="PixArt-XL/2",
    space_scale=1.0,
    time_scale=2 / 3,
    from_pretrained=None,
    enable_flashattn=True,
    enable_layernorm_kernel=True,
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
    micro_batch_size=128,
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
    shardformer=True,
)
scheduler = dict(
    type="iddpm",
    timestep_respacing="",
)
# Others
seed = 42
outputs = "outputs"
wandb = False
epochs = 1000
log_every = 10
ckpt_every = 250
load = None
batch_size = 4
lr = 2e-5
grad_clip = 1.0
<p align="center">
<img src="../assets/readme/icon.png" width="250"/>
<p>
<div align="center">
<a href="https://github.com/hpcaitech/Open-Sora/stargazers"><img src="https://img.shields.io/github/stars/hpcaitech/Open-Sora?style=social"></a>
<a href="https://hpcaitech.github.io/Open-Sora/"><img src="https://img.shields.io/badge/Gallery-View-orange?logo=&amp"></a>
<a href="https://discord.gg/shpbperhGs"><img src="https://img.shields.io/badge/Discord-join-blueviolet?logo=discord&amp"></a>
<a href="https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-247ipg9fk-KRRYmUl~u2ll2637WRURVA"><img src="https://img.shields.io/badge/Slack-ColossalAI-blueviolet?logo=slack&amp"></a>
<a href="https://twitter.com/yangyou1991/status/1769411544083996787?s=61&t=jT0Dsx2d-MS5vS9rNM5e5g"><img src="https://img.shields.io/badge/Twitter-Discuss-blue?logo=twitter&amp"></a>
<a href="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png"><img src="https://img.shields.io/badge/微信-小助手加群-green?logo=wechat&amp"></a>
</div>
## Open-Sora: A Fully Open-Source, Efficient Reproduction of Sora-like Video Generation
The **Open-Sora** project is an initiative dedicated to **efficiently** producing high-quality videos and making its models, tools, and content accessible to everyone.
By embracing **open-source** principles, Open-Sora not only democratizes access to advanced video generation techniques at low cost, but also offers a streamlined, user-friendly pipeline that simplifies the complexity of video production.
With Open-Sora, we hope to invite more developers to explore innovation, creativity, and inclusivity in content creation together.
[[English]](/README.md)
## 📰 News
* **[2024.03.18]** 🔥 We released **Open-Sora 1.0**, a fully open-source video generation project.
  * Open-Sora 1.0 supports the full pipeline of video data preprocessing, training with <a href="https://github.com/hpcaitech/ColossalAI"><img src="../assets/readme/colossal_ai.png" width="8%" ></a> acceleration, inference, and more.
  * Our [model weights](#model-weights) can produce 2~5-second 512x512 videos with only 3 days of training.
* **[2024.03.04]** Open-Sora: an open-source Sora reproduction plan that cuts training cost by 46% and extends the sequence length to nearly 1 million tokens.
## 🎥 Latest Demos
| **2s 512×512** | **2s 512×512** | **2s 512×512** |
| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- |
| [<img src="/assets/readme/sample_0.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/de1963d3-b43b-4e68-a670-bb821ebb6f80) | [<img src="/assets/readme/sample_1.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/13f8338f-3d42-4b71-8142-d234fbd746cc) | [<img src="/assets/readme/sample_2.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/fa6a65a6-e32a-4d64-9a9e-eabb0ebb8c16) |
| A serene night scene in a forested area. [...] The video is a time-lapse, capturing the transition from day to night, with the lake and forest serving as a constant backdrop. | A soaring drone footage captures the majestic beauty of a coastal cliff, [...] The water gently laps at the rock base and the greenery that clings to the top of the cliff. | The majestic beauty of a waterfall cascading down a cliff into a serene lake. [...] The camera angle provides a bird's eye view of the waterfall. |
| [<img src="/assets/readme/sample_3.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/64232f84-1b36-4750-a6c0-3e610fa9aa94) | [<img src="/assets/readme/sample_4.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/983a1965-a374-41a7-a76b-c07941a6c1e9) | [<img src="/assets/readme/sample_5.gif" width="">](https://github.com/hpcaitech/Open-Sora/assets/99191637/ec10c879-9767-4c31-865f-2e8d6cf11e65) |
| A bustling city street at night, filled with the glow of car headlights and the ambient light of streetlights. [...] | The vibrant beauty of a sunflower field. The sunflowers are arranged in neat rows, creating a sense of order and symmetry. [...] | A serene underwater scene featuring a sea turtle swimming through a coral reef. The turtle, with its greenish-brown shell [...] |
Videos are downsampled to `.gif` for display. Click on a video to view the original. Prompts are trimmed for display; see the full prompts [here](/assets/texts/t2v_samples.txt). See more samples in our [gallery](https://hpcaitech.github.io/Open-Sora/).
## 🔆 New Features
* 📍 **Open-Sora-v1** released. Model weights are available [here](#model-weights). With only 400K video clips and 200 H800 GPU days of training (compared with the 152M samples used by Stable Video Diffusion), we can generate 2-second 512×512 videos.
* ✅ Three-stage training from an image diffusion model to a video diffusion model. We provide the weights for each stage.
* ✅ Support for training acceleration, including accelerated transformers, faster T5 and VAE, and sequence parallelism. Open-Sora improves training speed by **55%** when training on 64x512x512 videos. See [acceleration](docs/acceleration.md) for details.
* ✅ We provide video cutting and captioning tools for data preprocessing. Instructions are available [here](tools/data/README.md), and our data collection plan is described in [datasets](docs/datasets.md).
* ✅ We find the VQ-VAE from [VideoGPT](https://wilson1yan.github.io/videogpt/index.html) to be of low quality, so we adopt the high-quality VAE from [Stability-AI](https://huggingface.co/stabilityai/sd-vae-ft-mse-original). We also find that sampling with an added temporal dimension degrades generation quality. See our **[report](docs/report_v1.md)** for more discussion.
* ✅ We study different architectures, including DiT, Latte, and our proposed **STDiT**. Our STDiT achieves a better trade-off between quality and speed. See our **[report](docs/report_v1.md)** for more discussion.
* ✅ Support for CLIP and T5 text conditioning.
* ✅ By treating images as single-frame videos, our project supports training DiT on both images and videos (e.g. ImageNet and UCF101). See [commands](docs/command.md) for more instructions.
* ✅ Support inference with the official weights of [DiT](https://github.com/facebookresearch/DiT), [Latte](https://github.com/Vchitect/Latte), and [PixArt](https://pixart-alpha.github.io/).
<details>
<summary>View more</summary>
* ✅ Refactored codebase. See [structure](docs/structure.md) for the project structure and how to use the config files.
</details>
### Next Steps (Ordered by Priority)
* [ ] Complete the data processing pipeline (including dense optical flow, aesthetic scores, text-image similarity, deduplication, etc.). See [datasets](/docs/datasets.md) for more information. **[WIP]**
* [ ] Train a Video-VAE. **[WIP]**
<details>
<summary>View more</summary>
* [ ] Support image and video conditioning.
* [ ] Evaluation pipeline.
* [ ] Incorporate a better scheduler, such as the rectified flow scheduler from SD3.
* [ ] Support variable aspect ratios, resolutions, and durations.
* [ ] Support SD3 after its release.
</details>
## Table of Contents
* [Installation](#installation)
* [Model Weights](/#model-weights)
* [Inference](/#inference)
* [Data Processing](/#data-processing)
* [Training](/#training)
* [Contribution](/#contribution)
* [Acknowledgement](/#acknowledgement)
* [Citation](/#citation)
## Installation
```bash
# create a virtual env
conda create -n opensora python=3.10
# install torch
# the command below is for CUDA 12.1, choose install commands from
# https://pytorch.org/get-started/locally/ based on your own CUDA version
pip3 install torch torchvision
# install flash attention (optional)
pip install packaging ninja
pip install flash-attn --no-build-isolation
# install apex (optional)
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git
# install xformers
pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu121
# install this project
git clone https://github.com/hpcaitech/Open-Sora
cd Open-Sora
pip install -v .
```
After installation, we recommend reading [structure](docs/structure.md) to learn the project structure and how to use the config files.
## Model Weights
| Resolution | Data | #Iterations | Batch Size | GPU Days (H800) | URL |
| ---------- | ------ | ----------- | ---------- | --------------- | ---------- |
| 16×256×256 | 366K | 80k | 8×64 | 117 | [:link:]() |
| 16×256×256 | 20K HQ | 24k | 8×64 | 45 | [:link:]() |
| 16×512×512 | 20K HQ | 20k | 2×64 | 35 | [:link:]() |
| 64×512×512 | 50K HQ | | | | TBD |
Our model weights are partially initialized from [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha). The model has 724M parameters. See our **[report](/docs/report_v1.md)** for more information about training, and [datasets](/docs/dataset.md) for more about the data. HQ means high quality.
:warning: **Limitations**: Our model was trained on a limited budget. The quality and text alignment are relatively poor. In particular, the model performs badly on human generation and cannot follow detailed instructions. We are working on improving both quality and text alignment.
## Inference
To run inference with our provided weights, first download the [T5](https://huggingface.co/DeepFloyd/t5-v1_1-xxl/tree/main) weights into `pretrained_models/t5_ckpts/t5-v1_1-xxl`, then download the model weights. Run the following commands to generate samples. See [here](docs/structure.md#inference-config-demos) to customize the configuration.
```bash
# Sample 16x256x256 (5s/sample)
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path ./path/to/your/ckpt.pth
# Sample 16x512x512 (20s/sample, 100 time steps)
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x512x512.py --ckpt-path ./path/to/your/ckpt.pth
# Sample 64x512x512 (40s/sample, 100 time steps)
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/64x512x512.py --ckpt-path ./path/to/your/ckpt.pth
# Sample 64x512x512 with sequence parallelism (30s/sample, 100 time steps)
# sequence parallelism is enabled automatically when nproc_per_node is larger than 1
torchrun --standalone --nproc_per_node 2 scripts/inference.py configs/opensora/inference/64x512x512.py --ckpt-path ./path/to/your/ckpt.pth
```
We conducted the speed tests on H800 GPUs. For inference with other models, see [here](docs/commands.md) for more instructions.
## Data Processing
High-quality data is the key to high-quality models. [Here](/docs/datasets.md) are the datasets we used and our data collection plan. We provide tools for processing video data. Currently, our data processing pipeline includes the following steps:
1. Downloading the dataset. [[docs](/tools/datasets/README.md)]
2. Splitting videos into clips. [[docs](/tools/scenedetect/README.md)]
3. Generating video captions. [[docs](/tools/caption/README.md)]
## Training
To launch training, first download the [T5](https://huggingface.co/DeepFloyd/t5-v1_1-xxl/tree/main) weights into `pretrained_models/t5_ckpts/t5-v1_1-xxl`, then run the following commands to launch training on a single node.
```bash
# 1 GPU, 16x256x256
torchrun --nnodes=1 --nproc_per_node=1 scripts/train.py configs/opensora/train/16x256x256.py --data-path YOUR_CSV_PATH
# 8 GPUs, 64x512x512
torchrun --nnodes=1 --nproc_per_node=8 scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT
```
To launch training on multiple nodes, prepare a hostfile following [ColossalAI](https://colossalai.org/docs/basics/launch_colossalai/#launch-with-colossal-ai-cli) and run the following command.
```bash
colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT
```
For training other models and advanced usage, see [here](docs/commands.md) for more instructions.
## Contribution
If you wish to contribute to this project, please refer to the [Contribution Guideline](./CONTRIBUTING.md).
## Acknowledgement
* [DiT](https://github.com/facebookresearch/DiT): Scalable Diffusion Models with Transformers.
* [OpenDiT](https://github.com/NUS-HPC-AI-Lab/OpenDiT): An acceleration for DiT training. We adopt valuable acceleration strategies for training progress from OpenDiT.
* [PixArt](https://github.com/PixArt-alpha/PixArt-alpha): An open-source DiT-based text-to-image model.
* [Latte](https://github.com/Vchitect/Latte): An attempt to efficiently train DiT for video.
* [StabilityAI VAE](https://huggingface.co/stabilityai/sd-vae-ft-mse-original): A powerful image VAE model.
* [CLIP](https://github.com/openai/CLIP): A powerful text-image embedding model.
* [T5](https://github.com/google-research/text-to-text-transfer-transformer): A powerful text encoder.
* [LLaVA](https://github.com/haotian-liu/LLaVA): A powerful image captioning model based on [Yi-34B](https://huggingface.co/01-ai/Yi-34B).
We are grateful for their excellent work and generous contributions to open source.
## Citation
```bibtex
@software{opensora,
author = {Zangwei Zheng and Xiangyu Peng and Yang You},
title = {Open-Sora: Democratizing Efficient Video Production for All},
month = {March},
year = {2024},
url = {https://github.com/hpcaitech/Open-Sora}
}
```
[Zangwei Zheng](https://github.com/zhengzangw) and [Xiangyu Peng](https://github.com/xyupeng) equally contributed to this work during their internship at [HPC-AI Tech](https://hpc-ai.com/).
## Star History
[![Star History Chart](https://api.star-history.com/svg?repos=hpcaitech/Open-Sora&type=Date)](https://star-history.com/#hpcaitech/Open-Sora&Date)
# Acceleration
Open-Sora aims to provide a high-speed training framework for diffusion models. We achieve a **55%** training speed-up when training on **64-frame 512x512 videos**. Our framework also supports training on **1-minute 1080p videos**.
## Accelerated Transformer
Open-Sora boosts the training speed by:
- Kernel optimization, including [flash attention](https://github.com/Dao-AILab/flash-attention), a fused layernorm kernel, and the kernels compiled by ColossalAI.
- Hybrid parallelism, including ZeRO.
- Gradient checkpointing for larger batch sizes.
Our training speed on images is comparable to [OpenDiT](https://github.com/NUS-HPC-AI-Lab/OpenDiT), a project that accelerates DiT training. The training speed is measured on 8 H800 GPUs with batch size 128 and image size 256x256.
| Model | Throughput (img/s/GPU) | Throughput (tokens/s/GPU) |
| -------- | ---------------------- | ------------------------- |
| DiT | 100 | 26k |
| OpenDiT | 175 | 45k |
| OpenSora | 175 | 45k |
## Efficient STDiT
Our STDiT adopts spatial-temporal attention to model video data. Compared with directly applying full attention as in DiT, our STDiT is more efficient as the number of frames increases. Our current framework only supports sequence parallelism for very long sequences.
The training speed is measured on 8 H800 GPUs with the acceleration techniques applied; GC means gradient checkpointing. Both models use T5 conditioning as in PixArt.
| Model | Setting | Throughput (sample/s/GPU) | Throughput (tokens/s/GPU) |
| ---------------- | -------------- | ------------------------- | ------------------------- |
| DiT | 16x256 (4k) | 7.20 | 29k |
| STDiT | 16x256 (4k) | 7.00 | 28k |
| DiT | 16x512 (16k) | 0.85 | 14k |
| STDiT | 16x512 (16k) | 1.45 | 23k |
| DiT (GC) | 64x512 (65k) | 0.08 | 5k |
| STDiT (GC) | 64x512 (65k) | 0.40 | 25k |
| STDiT (GC, sp=2) | 360x512 (370k) | 0.10 | 18k |
With 4x downsampling in the temporal dimension by a Video-VAE, a 24fps video has 450 frames. The gap between the speed of STDiT (28k tokens/s) and DiT on images (up to 45k tokens/s) mainly comes from the T5 and VAE encoding, and temporal attention.
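For reference, the token counts in the `Setting` column follow from the 2D VAE's 8x spatial downsampling and the 2x2 patch embedding (factors inferred from the setup used in this repo rather than restated here); the same per-frame token count also links the img/s and tokens/s columns of the image benchmark above:
```python
def num_tokens(frames: int, height: int, width: int, vae_down: int = 8, patch: int = 2) -> int:
    """Tokens seen by the transformer for one sample (assumes 8x VAE, 2x2 patches)."""
    tokens_per_frame = (height // vae_down // patch) * (width // vae_down // patch)
    return frames * tokens_per_frame

print(num_tokens(16, 256, 256))   # 4096   ~ "16x256 (4k)"
print(num_tokens(16, 512, 512))   # 16384  ~ "16x512 (16k)"
print(num_tokens(64, 512, 512))   # 65536  ~ "64x512 (65k)"
print(num_tokens(360, 512, 512))  # 368640 ~ "360x512 (370k)"
# Image benchmark: 175 img/s x 256 tokens per 256x256 image ~ 45k tokens/s, matching the table above.
```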
## Accelerated Encoder (T5, VAE)
During training, texts are encoded by T5, and videos are encoded by VAE. Typically there are two ways to accelerate the training:
1. Preprocess text and video data in advance and save them to disk.
2. Encode text and video data during training, and accelerate the encoding process.
For option 1, the 120 text tokens of one sample require about 1 MB of disk space, and a 64x64x64 latent requires about 4 MB. For a training dataset with 10M video clips, the total disk space required is about 50 TB. Our storage system is not ready for data at this scale.
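A rough sanity check of these per-sample figures (assuming fp16 T5-XXL features with hidden size 4096 and a 4-channel fp32 VAE latent; the storage formats are our assumption, not stated in the docs):
```python
text_bytes = 120 * 4096 * 2          # 120 tokens x 4096-dim T5-XXL features in fp16  -> ~1 MB
latent_bytes = 4 * 64 * 64 * 64 * 4  # 4-channel 64x64x64 latent in fp32              -> ~4 MB
per_sample_mb = (text_bytes + latent_bytes) / 2**20
total_tb = per_sample_mb * 10_000_000 / 2**20
print(f"{per_sample_mb:.1f} MB per sample, {total_tb:.0f} TB for 10M clips")  # ~4.9 MB, ~47 TB
```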
For option 2, we boost T5's speed and reduce its memory requirement. Following [OpenDiT](https://github.com/NUS-HPC-AI-Lab/OpenDiT), we find that the VAE consumes a large amount of GPU memory, so we split the batch into smaller micro-batches for VAE encoding. With both techniques, we greatly accelerate training.
The training speed is measured on 8 H800 GPUs with STDiT.
| Acceleration | Setting | Throughput (img/s/GPU) | Throughput (tokens/s/GPU) |
| ------------ | ------------- | ---------------------- | ------------------------- |
| Baseline | 16x256 (4k) | 6.16 | 25k |
| w. faster T5 | 16x256 (4k) | 7.00 | 29k |
| Baseline | 64x512 (65k) | 0.94 | 15k |
| w. both | 64x512 (65k) | 1.45 | 23k |
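The VAE micro-batching described above corresponds to the `micro_batch_size` field in the training configs; conceptually it is just chunked encoding, as in the sketch below (`encode_fn` stands in for the real VAE encoder):
```python
import torch

def encode_in_micro_batches(encode_fn, frames: torch.Tensor, micro_batch_size: int = 64) -> torch.Tensor:
    """Encode frames in chunks of micro_batch_size to cap the VAE's peak memory."""
    latents = [encode_fn(chunk) for chunk in frames.split(micro_batch_size, dim=0)]
    return torch.cat(latents, dim=0)

# e.g. 256 frames encoded 64 at a time (dummy encoder used only for illustration)
frames = torch.randn(256, 3, 64, 64)
latents = encode_in_micro_batches(lambda f: f.mean(dim=1, keepdim=True), frames)
print(latents.shape)  # torch.Size([256, 1, 64, 64])
```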
# Commands
## Inference
You can modify corresponding config files to change the inference settings. See more details [here](/docs/structure.md#inference-config-demos).
### Inference with DiT pretrained on ImageNet
The following command automatically downloads the pretrained weights on ImageNet and runs inference.
```bash
python scripts/inference.py configs/dit/inference/1x256x256-class.py --ckpt-path DiT-XL-2-256x256.pt
```
### Inference with Latte pretrained on UCF101
The following command automatically downloads the pretrained weights on UCF101 and runs inference.
```bash
python scripts/inference.py configs/latte/inference/16x256x256-class.py --ckpt-path Latte-XL-2-256x256-ucf101.pt
```
### Inference with PixArt-α pretrained weights
Download T5 into `./pretrained_models` and run the following command.
```bash
# 256x256
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/pixart/inference/1x256x256.py --ckpt-path PixArt-XL-2-256x256.pth
# 512x512
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/pixart/inference/1x512x512.py --ckpt-path PixArt-XL-2-512x512.pth
# 1024 multi-scale
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/pixart/inference/1x1024MS.py --ckpt-path PixArt-XL-2-1024MS.pth
```
### Inference with checkpoints saved during training
During training, an experiment logging folder is created in the `outputs` directory. Under each checkpoint folder, e.g. `epoch12-global_step2000`, there are an `ema.pt` file and the shared `model` folder. Run the following commands to perform inference.
```bash
# inference with ema model
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path outputs/001-STDiT-XL-2/epoch12-global_step2000/ema.pt
# inference with model
torchrun --standalone --nproc_per_node 1 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path outputs/001-STDiT-XL-2/epoch12-global_step2000
# inference with sequence parallelism
# sequence parallelism is enabled automatically when nproc_per_node is larger than 1
torchrun --standalone --nproc_per_node 2 scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path outputs/001-STDiT-XL-2/epoch12-global_step2000
```
The second command will automatically generate a `model_ckpt.pt` file in the checkpoint folder.
### Inference Hyperparameters
1. DPM-Solver is good at fast inference for images. However, its video results are not satisfactory. You can use it for fast demos.
```python
type="dmp-solver"
num_sampling_steps=20
```
1. You can use [SVD](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt)'s VAE decoder finetuned on videos for inference (it consumes more memory). However, we do not see a significant improvement in the video results. To use it, download [the pretrained weights](https://huggingface.co/maxin-cn/Latte/tree/main/t2v_required_models/vae_temporal_decoder) into `./pretrained_models/vae_temporal_decoder` and modify the config file as follows.
```python
vae = dict(
    type="VideoAutoencoderKLTemporalDecoder",
    from_pretrained="pretrained_models/vae_temporal_decoder",
)
```
## Training
To resume training, run the following command. `--load` is different from `--ckpt-path`, as it also loads the optimizer and dataloader states.
```bash
torchrun --nnodes=1 --nproc_per_node=8 scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --load YOUR_PRETRAINED_CKPT
```
To enable wandb logging, add `--wandb` to the command.
```bash
WANDB_API_KEY=YOUR_WANDB_API_KEY torchrun --nnodes=1 --nproc_per_node=8 scripts/train.py configs/opensora/train/64x512x512.py --data-path YOUR_CSV_PATH --wandb True
```
You can modify corresponding config files to change the training settings. See more details [here](/docs/structure.md#training-config-demos).
### Training Hyperparameters
1. `dtype` is the data type used for training. Only `fp16` and `bf16` are supported. ColossalAI automatically enables mixed-precision training for `fp16` and `bf16`. During training, we find `bf16` more stable.
# Datasets
## Datasets used for now
### HD-VG-130M
[HD-VG-130M](https://github.com/daooshee/HD-VG-130M?tab=readme-ov-file) comprises 130M text-video pairs. The captions are generated by BLIP-2. We find the scene cuts and caption quality to be relatively poor. The dataset contains 20 splits; for Open-Sora 1.0, we use the first split. We plan to use the whole dataset and re-process it.
### Inter4k
[Inter4k](https://github.com/alexandrosstergiou/Inter4K) is a dataset containing 1k video clips with 4K resolution. The dataset is proposed for super-resolution tasks. We use the dataset for HQ training. The videos are processed as mentioned [here](/README.md#data-processing).
### Pexels.com
[Pexels.com](https://www.pexels.com/) is a website that provides free stock photos and videos. We collect 19K video clips from this website for HQ training. The videos are processed as mentioned [here](/README.md#data-processing).
## Datasets watching list
We are also watching the following datasets and considering using them in the future, depending on our disk space and their quality.
| Name | Size | Description |
| ----------------- | ------------ | ----------------------------- |
| Panda-70M | 70M videos | High quality video-text pairs |
| WebVid-10M | 10M videos | Low quality |
| InternVid-10M-FLT | 10M videos | |
| EGO4D | 3670 hours | |
| OpenDV-YouTube | 1700 hours | |
| VidProM | 6.69M videos | |
# Open-Sora v1 Report
OpenAI's Sora is amazing at generating one-minute, high-quality videos, yet it reveals almost no details about its method. To make AI more "open", we are dedicated to building an open-source version of Sora. This report describes our first attempt at training a transformer-based video diffusion model.
## Efficiency in choosing the architecture
To lower the computational cost, we want to use existing VAE models. Sora uses a spatial-temporal VAE to reduce the temporal dimension. However, we found no open-source, high-quality spatial-temporal VAE model: [MAGVIT](https://github.com/google-research/magvit)'s 4x4x4 VAE is not open-sourced, while [VideoGPT](https://wilson1yan.github.io/videogpt/index.html)'s 2x4x4 VAE showed low quality in our experiments. Thus, we decided to use a 2D VAE (from [Stability-AI](https://huggingface.co/stabilityai/sd-vae-ft-mse-original)) in our first version.
Video training involves a large number of tokens. For a 1-minute 24fps video, we have 1440 frames. With VAE downsampling of 4x and patch-size downsampling of 2x, this gives 1440x1024≈1.5M tokens. Full attention over 1.5M tokens incurs a huge computational cost, so we use spatial-temporal attention to reduce the cost, following [Latte](https://github.com/Vchitect/Latte).
As shown in the figure, we insert a temporal attention right after each spatial attention in STDiT (ST stands for spatial-temporal). This is similar to variant 3 in Latte's paper, although we do not control for a similar number of parameters across these variants. While Latte's paper claims its variant is better than variant 3, our experiments on 16x256x256 videos show that, with the same number of iterations, the performance ranks as: DiT (full) > STDiT (Sequential) > STDiT (Parallel) ≈ Latte. Thus, we choose STDiT (Sequential) for its efficiency. A speed benchmark is provided [here](/docs/acceleration.md#efficient-stdit).
![Architecture Comparison](https://i0.imgs.ovh/2024/03/15/eLk9D.png)
To focus on video generation, we want to train the model on top of a powerful image generation model. [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha) is an efficiently trained, high-quality image generation model with a T5-conditioned DiT structure. We initialize our model with PixArt-α and initialize the projection layer of each inserted temporal attention to zero. This initialization preserves the model's image generation ability at the start of training, whereas Latte's architecture cannot. The inserted attention increases the number of parameters from 580M to 724M.
![Architecture](https://i0.imgs.ovh/2024/03/16/erC1d.png)
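For intuition, below is a minimal sketch of such a sequential spatial-temporal block with a zero-initialized temporal projection. It is an illustrative stand-in written for this report, not the repository's STDiT implementation:
```python
import torch
import torch.nn as nn

class STBlockSketch(nn.Module):
    """Illustrative sequential spatial-temporal block (not the actual STDiT code)."""

    def __init__(self, dim: int, num_heads: int):
        super().__init__()
        self.spatial_attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.temporal_attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        # Zero-init the temporal output projection so that, at initialization,
        # the block reduces to the pretrained (PixArt-α style) spatial block.
        nn.init.zeros_(self.temporal_attn.out_proj.weight)
        nn.init.zeros_(self.temporal_attn.out_proj.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: [B, T, S, C] -- T frames, S spatial tokens per frame
        B, T, S, C = x.shape
        xs = x.reshape(B * T, S, C)                      # spatial attention within each frame
        x = x + self.spatial_attn(xs, xs, xs)[0].reshape(B, T, S, C)
        xt = x.permute(0, 2, 1, 3).reshape(B * S, T, C)  # temporal attention across frames
        t = self.temporal_attn(xt, xt, xt)[0].reshape(B, S, T, C).permute(0, 2, 1, 3)
        return x + t

block = STBlockSketch(dim=128, num_heads=8)
out = block(torch.randn(2, 16, 64, 128))  # 16 frames, 8x8 spatial tokens per frame
```
Because the zeroed projection makes the temporal branch a no-op at initialization, the network starts from PixArt-α's image behavior and learns temporal attention during video training.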
Drawing on the success of PixArt-α and Stable Video Diffusion, we also adopt a progressive training strategy: 16x256x256 on the 366K-clip pretraining dataset, and then 16x256x256, 16x512x512, and 64x512x512 on the 20K-clip HQ dataset. With scaled position embeddings, this strategy greatly reduces the computational cost.
We also tried a 3D patch embedder in DiT. However, with 2x downsampling in the temporal dimension, the generated videos had low quality, so we leave temporal downsampling to a temporal VAE in our next version. For now, we sample every 3 frames for 16-frame training and every 2 frames for 64-frame training.
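As a rough check on the clip durations implied by these intervals (assuming roughly 24 fps source footage, which is our assumption):
```python
for frames, interval in [(16, 3), (64, 2)]:
    seconds = frames * interval / 24  # assumed 24 fps source video
    print(f"{frames} frames at interval {interval} span ~{seconds:.1f} s of source video")
# 16x3 ≈ 2.0 s and 64x2 ≈ 5.3 s, consistent with the 2~5 s clips mentioned in the README
```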
## Data is the key to high quality
We find that the quantity and quality of data have a great impact on the quality of the generated videos, even more than the model architecture and training strategy. At this time, we only prepared the first split (366K video clips) of [HD-VG-130M](https://github.com/daooshee/HD-VG-130M). The quality of these videos varies greatly, and the captions are not very accurate, so we further collected 20k relatively high-quality videos from [Pexels](https://www.pexels.com/), which provides freely licensed videos. We caption the videos with LLaVA, an image captioning model, using three frames and a carefully designed prompt; with this prompt, LLaVA generates good-quality captions.
![Caption](https://i0.imgs.ovh/2024/03/16/eXdvC.png)
As we place more emphasis on data quality, we plan to collect more data and build a video preprocessing pipeline in our next version.
## Training Details
With a limited training budget, we made only a few explorations. We found a learning rate of 1e-4 too large and scaled it down to 2e-5. When training with a large batch size, we found `fp16` less stable than `bf16`, sometimes leading to generation failure, so we switched to `bf16` for training on 64x512x512. For other hyperparameters, we follow previous works.
## Loss curves
16x256x256 Pretraining Loss Curve
![16x256x256 Pretraining Loss Curve](https://i0.imgs.ovh/2024/03/16/erXQj.png)
16x256x256 HQ Training Loss Curve
![16x256x256 HQ Training Loss Curve](https://i0.imgs.ovh/2024/03/16/ernXv.png)
16x512x512 HQ Training Loss Curve
![16x512x512 HQ Training Loss Curve](https://i0.imgs.ovh/2024/03/16/erHBe.png)
# Repo & Config Structure
## Repo Structure
```plaintext
Open-Sora
├── README.md
├── docs
│ ├── acceleration.md -> Acceleration & Speed benchmark
│ ├── command.md -> Commands for training & inference
│ ├── datasets.md -> Datasets used in this project
│ ├── structure.md -> This file
│ └── report_v1.md -> Report for Open-Sora v1
├── scripts
│ ├── train.py -> diffusion training script
│ └── inference.py -> diffusion inference script
├── configs -> Configs for training & inference
├── opensora
│ ├── __init__.py
│ ├── registry.py -> Registry helper
│   ├── acceleration -> Acceleration related code
│   ├── dataset -> Dataset related code
│   ├── models
│   │   ├── layers -> Common layers
│   │   ├── vae -> VAE as image encoder
│   │   ├── text_encoder -> Text encoder
│   │   │   ├── classes.py -> Class id encoder (inference only)
│   │   │   ├── clip.py -> CLIP encoder
│   │   │   └── t5.py -> T5 encoder
│   │   ├── dit
│   │   ├── latte
│   │   ├── pixart
│   │   └── stdit -> Our STDiT related code
│   ├── schedulers -> Diffusion schedulers
│   │   ├── iddpm -> IDDPM for training and inference
│   │ └── dpms -> DPM-Solver for fast inference
│ └── utils
└── tools -> Tools for data processing and more
```
## Configs
Our config files follow [MMEngine](https://github.com/open-mmlab/mmengine). MMEngine reads a config file (a `.py` file) and parses it into a dictionary-like object.
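As a quick illustration of that mechanism, here is a minimal sketch using MMEngine directly (Open-Sora wraps this in its own `config_utils`; the config path below is simply one of the files listed in the tree that follows):
```python
from mmengine.config import Config

# Parse a .py config file into a dict-like Config object
cfg = Config.fromfile("configs/opensora/train/16x256x256.py")
print(cfg.model["type"], cfg.num_frames, cfg.image_size)

# Values can be overridden programmatically before being passed to the trainer
cfg.batch_size = 16
```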
```plaintext
Open-Sora
└── configs -> Configs for training & inference
├── opensora -> STDiT related configs
│ ├── inference
│ │ ├── 16x256x256.py -> Sample videos 16 frames 256x256
│ │ ├── 16x512x512.py -> Sample videos 16 frames 512x512
│ │ └── 64x512x512.py -> Sample videos 64 frames 512x512
│ └── train
│ ├── 16x256x256.py -> Train on videos 16 frames 256x256
│ ├── 16x512x512.py -> Train on videos 16 frames 512x512
│ └── 64x512x512.py -> Train on videos 64 frames 512x512
├── dit -> DiT related configs
│   ├── inference
│   │   ├── 1x256x256-class.py -> Sample images with ckpts from DiT
│   │   ├── 1x256x256.py -> Sample images with clip condition
│   │   └── 16x256x256.py -> Sample videos
│   └── train
│       ├── 1x256x256.py -> Train on images with clip condition
│       └── 16x256x256.py -> Train on videos
├── latte -> Latte related configs
└── pixart -> PixArt related configs
```
## Inference config demos
To change the inference settings, you can directly modify the corresponding config file. Or you can pass arguments to overwrite the config file ([config_utils.py](/opensora/utils/config_utils.py)). To change sampling prompts, you should modify the `.txt` file passed to the `--prompt_path` argument.
```plaintext
--prompt_path ./assets/texts/t2v_samples.txt -> prompt_path
--ckpt-path ./path/to/your/ckpt.pth -> model["from_pretrained"]
```
The explanation of each field is provided below.
```python
# Define sampling size
num_frames = 64 # number of frames
fps = 24 // 2 # frames per second (divided by 2 for frame_interval=2)
image_size = (512, 512) # image size (height, width)
# Define model
model = dict(
    type="STDiT-XL/2",  # Select model type (STDiT-XL/2, DiT-XL/2, etc.)
    space_scale=1.0,  # (Optional) Space positional encoding scale (new height / old height)
    time_scale=2 / 3,  # (Optional) Time positional encoding scale (new frame_interval / old frame_interval)
    enable_flashattn=True,  # (Optional) Speed up training and inference with flash attention
    enable_layernorm_kernel=True,  # (Optional) Speed up training and inference with fused kernel
    from_pretrained="PRETRAINED_MODEL",  # (Optional) Load from pretrained model
    no_temporal_pos_emb=True,  # (Optional) Disable temporal positional encoding (for image)
)
vae = dict(
    type="VideoAutoencoderKL",  # Select VAE type
    from_pretrained="stabilityai/sd-vae-ft-ema",  # Load from pretrained VAE
    micro_batch_size=128,  # VAE with micro batch size to save memory
)
text_encoder = dict(
    type="t5",  # Select text encoder type (t5, clip)
    from_pretrained="./pretrained_models/t5_ckpts",  # Load from pretrained text encoder
    model_max_length=120,  # Maximum length of input text
)
scheduler = dict(
    type="iddpm",  # Select scheduler type (iddpm, dpm-solver)
    num_sampling_steps=100,  # Number of sampling steps
    cfg_scale=7.0,  # hyper-parameter for classifier-free diffusion
)
dtype = "fp16" # Computation type (fp16, fp32, bf16)
# Other settings
batch_size = 1 # batch size
seed = 42 # random seed
prompt_path = "./assets/texts/t2v_samples.txt" # path to prompt file
save_dir = "./samples" # path to save samples
```
## Training config demos
```python
# Define sampling size
num_frames = 64
frame_interval = 2 # sample every 2 frames
image_size = (512, 512)
# Define dataset
root = None # root path to the dataset
data_path = "CSV_PATH" # path to the csv file
use_image_transform = False # True if training on images
num_workers = 4 # number of workers for dataloader
# Define acceleration
dtype = "bf16" # Computation type (fp16, bf16)
grad_checkpoint = True # Use gradient checkpointing
plugin = "zero2" # Plugin for distributed training (zero2, zero2-seq)
sp_size = 1 # Sequence parallelism size (1 for no sequence parallelism)
# Define model
model = dict(
    type="STDiT-XL/2",
    space_scale=1.0,
    time_scale=2 / 3,
    from_pretrained="YOUR_PRETRAINED_MODEL",
    enable_flashattn=True,  # Enable flash attention
    enable_layernorm_kernel=True,  # Enable layernorm kernel
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
    micro_batch_size=128,
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
    shardformer=True,  # Enable shardformer for T5 acceleration
)
scheduler = dict(
    type="iddpm",
    timestep_respacing="",  # Default 1000 timesteps
)
# Others
seed = 42
outputs = "outputs" # path to save checkpoints
wandb = False # Use wandb for logging
epochs = 1000 # number of epochs (just large enough, kill when satisfied)
log_every = 10
ckpt_every = 250
load = None # path to resume training
batch_size = 4
lr = 2e-5
grad_clip = 1.0 # gradient clipping
```
# Unique model identifier
modelCode=574
# Model name
modelName=open-sora_pytorch
# Model description
modelDescription=open-sora_pytorch is a Transformer-based text-to-video generation model
# Application scenarios
appScenario=inference,video generation,media,research,education
# Framework type
frameType=pytorch
from .acceleration import *
from .datasets import *
from .models import *
from .registry import *
from collections.abc import Iterable
import torch.nn as nn
from torch.utils.checkpoint import checkpoint, checkpoint_sequential
def set_grad_checkpoint(model, use_fp32_attention=False, gc_step=1):
    assert isinstance(model, nn.Module)

    def set_attr(module):
        module.grad_checkpointing = True
        module.fp32_attention = use_fp32_attention
        module.grad_checkpointing_step = gc_step

    model.apply(set_attr)


def auto_grad_checkpoint(module, *args, **kwargs):
    if getattr(module, "grad_checkpointing", False):
        if not isinstance(module, Iterable):
            return checkpoint(module, *args, **kwargs)
        gc_step = module[0].grad_checkpointing_step
        return checkpoint_sequential(module, gc_step, *args, **kwargs)
    return module(*args, **kwargs)
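# Illustrative usage sketch (added for clarity; not part of the original file).
# `_ToyBlock` / `_ToyModel` are made-up names standing in for the real STDiT blocks,
# which call auto_grad_checkpoint in the same way after set_grad_checkpoint(model).
if __name__ == "__main__":
    import torch

    class _ToyBlock(nn.Module):
        def __init__(self, dim=64):
            super().__init__()
            self.net = nn.Sequential(nn.Linear(dim, dim), nn.GELU(), nn.Linear(dim, dim))

        def forward(self, x):
            return x + self.net(x)

    class _ToyModel(nn.Module):
        def __init__(self, dim=64, depth=4):
            super().__init__()
            self.blocks = nn.ModuleList(_ToyBlock(dim) for _ in range(depth))

        def forward(self, x):
            for block in self.blocks:
                # activations are re-computed in the backward pass once grad_checkpointing is set
                x = auto_grad_checkpoint(block, x)
            return x

    model = _ToyModel()
    set_grad_checkpoint(model)  # marks every submodule for checkpointing
    x = torch.randn(2, 16, 64, requires_grad=True)
    model(x).sum().backward()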
import torch
import torch.distributed as dist
# ====================
# All-To-All
# ====================
def _all_to_all(
    input_: torch.Tensor,
    world_size: int,
    group: dist.ProcessGroup,
    scatter_dim: int,
    gather_dim: int,
):
    input_list = [t.contiguous() for t in torch.tensor_split(input_, world_size, scatter_dim)]
    output_list = [torch.empty_like(input_list[0]) for _ in range(world_size)]
    dist.all_to_all(output_list, input_list, group=group)
    return torch.cat(output_list, dim=gather_dim).contiguous()
class _AllToAll(torch.autograd.Function):
    """All-to-all communication.

    Args:
        input_: input matrix
        process_group: communication group
        scatter_dim: scatter dimension
        gather_dim: gather dimension
    """

    @staticmethod
    def forward(ctx, input_, process_group, scatter_dim, gather_dim):
        ctx.process_group = process_group
        ctx.scatter_dim = scatter_dim
        ctx.gather_dim = gather_dim
        ctx.world_size = dist.get_world_size(process_group)
        output = _all_to_all(input_, ctx.world_size, process_group, scatter_dim, gather_dim)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        grad_output = _all_to_all(
            grad_output,
            ctx.world_size,
            ctx.process_group,
            ctx.gather_dim,
            ctx.scatter_dim,
        )
        return (
            grad_output,
            None,
            None,
            None,
        )
def all_to_all(
    input_: torch.Tensor,
    process_group: dist.ProcessGroup,
    scatter_dim: int = 2,
    gather_dim: int = 1,
):
    return _AllToAll.apply(input_, process_group, scatter_dim, gather_dim)
def _gather(
    input_: torch.Tensor,
    world_size: int,
    group: dist.ProcessGroup,
    gather_dim: int,
    gather_list=None,
):
    # NOTE: this world_size-based variant is shadowed by the process-group based
    # _gather defined below; gather_list is made an explicit argument so the
    # function is at least runnable as written.
    if gather_list is None:
        gather_list = [torch.empty_like(input_) for _ in range(world_size)]
    dist.all_gather(gather_list, input_, group=group)
    return torch.cat(gather_list, dim=gather_dim).contiguous()
# ====================
# Gather-Split
# ====================
def _split(input_, pg: dist.ProcessGroup, dim=-1):
    # skip if only one rank involved
    world_size = dist.get_world_size(pg)
    rank = dist.get_rank(pg)
    if world_size == 1:
        return input_

    # Split along last dimension.
    dim_size = input_.size(dim)
    assert dim_size % world_size == 0, (
        f"The dimension to split ({dim_size}) is not a multiple of world size ({world_size}), "
        f"cannot split tensor evenly"
    )

    tensor_list = torch.split(input_, dim_size // world_size, dim=dim)
    output = tensor_list[rank].contiguous()
    return output
def _gather(input_, pg: dist.ProcessGroup, dim=-1):
    # skip if only one rank involved
    input_ = input_.contiguous()
    world_size = dist.get_world_size(pg)
    dist.get_rank(pg)
    if world_size == 1:
        return input_

    # all gather
    tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
    assert input_.device.type == "cuda"
    torch.distributed.all_gather(tensor_list, input_, group=pg)

    # concat
    output = torch.cat(tensor_list, dim=dim).contiguous()
    return output
class _GatherForwardSplitBackward(torch.autograd.Function):
    """Gather the input from the model parallel region and concatenate.

    Args:
        input_: input matrix.
        process_group: parallel mode.
        dim: dimension
    """

    @staticmethod
    def symbolic(graph, input_, process_group, dim, grad_scale):
        return _gather(input_, process_group, dim)

    @staticmethod
    def forward(ctx, input_, process_group, dim, grad_scale):
        ctx.mode = process_group
        ctx.dim = dim
        ctx.grad_scale = grad_scale
        return _gather(input_, process_group, dim)

    @staticmethod
    def backward(ctx, grad_output):
        if ctx.grad_scale == "up":
            grad_output = grad_output * dist.get_world_size(ctx.mode)
        elif ctx.grad_scale == "down":
            grad_output = grad_output / dist.get_world_size(ctx.mode)
        return _split(grad_output, ctx.mode, ctx.dim), None, None, None
class _SplitForwardGatherBackward(torch.autograd.Function):
    """Split the input and keep only the chunk corresponding to the rank.

    Args:
        input_: input matrix.
        process_group: parallel mode.
        dim: dimension
    """

    @staticmethod
    def symbolic(graph, input_, process_group, dim, grad_scale):
        return _split(input_, process_group, dim)

    @staticmethod
    def forward(ctx, input_, process_group, dim, grad_scale):
        ctx.mode = process_group
        ctx.dim = dim
        ctx.grad_scale = grad_scale
        return _split(input_, process_group, dim)

    @staticmethod
    def backward(ctx, grad_output):
        if ctx.grad_scale == "up":
            grad_output = grad_output * dist.get_world_size(ctx.mode)
        elif ctx.grad_scale == "down":
            grad_output = grad_output / dist.get_world_size(ctx.mode)
        return _gather(grad_output, ctx.mode, ctx.dim), None, None, None


def split_forward_gather_backward(input_, process_group, dim, grad_scale=1.0):
    return _SplitForwardGatherBackward.apply(input_, process_group, dim, grad_scale)


def gather_forward_split_backward(input_, process_group, dim, grad_scale=None):
    return _GatherForwardSplitBackward.apply(input_, process_group, dim, grad_scale)
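# Illustrative usage sketch (added for clarity; not part of the original file).
# Under sequence parallelism, hidden states are sharded along the sequence dim:
#   split_forward_gather_backward(x, group, dim=1)     -> each rank keeps S/sp of the sequence
#   all_to_all(x, group, scatter_dim=2, gather_dim=1)  -> full sequence, H/sp heads (for attention)
#   gather_forward_split_backward(x, group, dim=1)     -> reassemble the full sequence at the end
# Minimal multi-process shape check; launch with
#   torchrun --nproc_per_node 2 this_file.py
# on GPUs (the gloo backend does not implement all_to_all, and _gather asserts CUDA tensors).
if __name__ == "__main__":
    dist.init_process_group("nccl")
    rank, sp = dist.get_rank(), dist.get_world_size()
    torch.cuda.set_device(rank)  # assumes one GPU per rank on a single node
    B, S, H, D = 2, 8, 4, 16
    x = torch.randn(B, S, H, D, device="cuda", requires_grad=True)
    xs = split_forward_gather_backward(x, dist.group.WORLD, dim=1)      # [B, S/sp, H, D]
    xa = all_to_all(xs, dist.group.WORLD, scatter_dim=2, gather_dim=1)  # [B, S, H/sp, D]
    xb = all_to_all(xa, dist.group.WORLD, scatter_dim=1, gather_dim=2)  # back to [B, S/sp, H, D]
    y = gather_forward_split_backward(xb, dist.group.WORLD, dim=1)      # [B, S, H, D]
    assert y.shape == x.shape
    y.sum().backward()  # gradients flow back through the collectives
    dist.destroy_process_group()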