// Update URL search params when tab is clicked
document.addEventListener("DOMContentLoaded", function () {
  const tabs = document.querySelectorAll(".sd-tab-label");

  function updateURL(tab) {
    const syncGroup = tab.getAttribute("data-sync-group");
    const syncId = tab.getAttribute("data-sync-id");
    if (syncGroup && syncId) {
      const url = new URL(window.location);
      url.searchParams.set(syncGroup, syncId);
      window.history.replaceState(null, "", url);
    }
  }

  tabs.forEach(tab => {
    tab.addEventListener("click", () => updateURL(tab));
  });
});
<style>
  .notification-bar {
    width: 100vw;
    display: flex;
    justify-content: center;
    align-items: center;
    font-size: 16px;
    padding: 0 6px 0 6px;
  }

  .notification-bar p {
    margin: 0;
  }

  .notification-bar a {
    font-weight: bold;
    text-decoration: none;
  }

  /* Light mode styles (default) */
  .notification-bar {
    background-color: #fff3cd;
    color: #856404;
  }

  .notification-bar a {
    color: #d97706;
  }

  /* Dark mode styles */
  html[data-theme=dark] .notification-bar {
    background-color: #333;
    color: #ddd;
  }

  html[data-theme=dark] .notification-bar a {
    color: #ffa500; /* Brighter color for visibility */
  }
</style>
<!-- <div class="notification-bar">
<p>You are viewing the latest developer preview docs. <a href="https://docs.vllm.ai/en/stable/">Click here</a> to view docs for the latest stable release.</p>
</div> -->
# SPDX-License-Identifier: Apache-2.0
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
import datetime
import inspect
import logging
import os
import sys
from typing import Optional
import requests
from sphinx.ext import autodoc
logger = logging.getLogger(__name__)
sys.path.append(os.path.abspath("../.."))
# -- Project information -----------------------------------------------------
project = 'FastVideo'
copyright = f'{datetime.datetime.now().year}, FastVideo Team'
author = 'the FastVideo Team'
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    "sphinx.ext.napoleon",
    "sphinx.ext.linkcode",
    "sphinx.ext.intersphinx",
    "sphinx_copybutton",
    "sphinx.ext.autodoc",
    "sphinx.ext.autosummary",
    "myst_parser",
    "sphinxarg.ext",
    "sphinx_design",
    "sphinx_togglebutton",
]
myst_enable_extensions = [
    "colon_fence",
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns: list[str] = ["**/*.template.md", "**/*.inc.md"]
# Exclude the prompt "$" when copying code
copybutton_prompt_text = r"\$ "
copybutton_prompt_is_regexp = True
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_title = project
html_theme = 'sphinx_book_theme'
html_logo = '../../assets/logo.jpg'
#html_favicon = 'assets/logos/vllm-logo-only-light.ico'
html_theme_options = {
    'path_to_docs': 'docs/source',
    'repository_url': 'https://github.com/hao-ai-lab/FastVideo/',
    'use_repository_button': True,
    'use_edit_page_button': True,
}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]
html_js_files = ["custom.js"]
html_css_files = ["custom.css"]
myst_url_schemes = {
    'http': None,
    'https': None,
    'mailto': None,
    'ftp': None,
    "gh-issue": {
        "url":
        "https://github.com/hao-ai-lab/FastVideo/issues/{{path}}#{{fragment}}",
        "title": "Issue #{{path}}",
        "classes": ["github"],
    },
    "gh-pr": {
        "url":
        "https://github.com/hao-ai-lab/FastVideo/pull/{{path}}#{{fragment}}",
        "title": "Pull Request #{{path}}",
        "classes": ["github"],
    },
    "gh-dir": {
        "url": "https://github.com/hao-ai-lab/FastVideo/tree/main/{{path}}",
        "title": "{{path}}",
        "classes": ["github"],
    },
    "gh-file": {
        "url": "https://github.com/hao-ai-lab/FastVideo/blob/main/{{path}}",
        "title": "{{path}}",
        "classes": ["github"],
    },
}
# see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa
READTHEDOCS_VERSION_TYPE = os.environ.get('READTHEDOCS_VERSION_TYPE')
if READTHEDOCS_VERSION_TYPE == "tag":
    # remove the warning banner if the version is a tagged release
    header_file = os.path.join(os.path.dirname(__file__),
                               "_templates/sections/header.html")
    # The file might be removed already if the build is triggered multiple times
    # (readthedocs builds the HTML and PDF versions separately)
    if os.path.exists(header_file):
        os.remove(header_file)
# Generate additional rst documentation here.
def setup(app):
    from docs.source.generate_examples import generate_examples
    generate_examples()
_cached_base: str = ""
_cached_branch: str = ""
def get_repo_base_and_branch(
        pr_number: str) -> tuple[Optional[str], Optional[str]]:
    global _cached_base, _cached_branch
    if _cached_base and _cached_branch:
        return _cached_base, _cached_branch
    url = f"https://api.github.com/repos/hao-ai-lab/FastVideo/pulls/{pr_number}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        _cached_base = data['head']['repo']['full_name']
        _cached_branch = data['head']['ref']
        return _cached_base, _cached_branch
    else:
        logger.error("Failed to fetch PR details: %s", response)
        return None, None
def linkcode_resolve(domain, info):
    if domain != 'py':
        return None
    if not info['module']:
        return None
    module = info['module']

    # try to determine the correct file and line number to link to
    obj = sys.modules[module]

    # get as specific as we can
    lineno: int = 0
    filename: str = ""
    try:
        for part in info['fullname'].split('.'):
            obj = getattr(obj, part)

        if not (inspect.isclass(obj) or inspect.isfunction(obj)
                or inspect.ismethod(obj)):
            obj = obj.__class__  # type: ignore[assignment]
        lineno = inspect.getsourcelines(obj)[1]
        filename = (inspect.getsourcefile(obj)
                    or f"{filename}.py").split("FastVideo/", 1)[1]
    except Exception:
        # For some things, like a class member, this won't work, so
        # we'll use the line number of the parent (the class)
        pass

    if filename.startswith("checkouts/"):
        # a PR build on readthedocs
        pr_number = filename.split("/")[1]
        filename = filename.split("/", 2)[2]
        base, branch = get_repo_base_and_branch(pr_number)
        if base and branch:
            return f"https://github.com/{base}/blob/{branch}/{filename}#L{lineno}"

    # Otherwise, link to the source file on the main branch
    return f"https://github.com/hao-ai-lab/FastVideo/blob/main/{filename}#L{lineno}"
# Mock out external dependencies here, otherwise the autodoc pages may be blank.
autodoc_mock_imports = [
    "blake3",
    "compressed_tensors",
    "cpuinfo",
    "cv2",
    "torch",
    "transformers",
    "psutil",
    "prometheus_client",
    "sentencepiece",
    "vllm._C",
    "PIL",
    "numpy",
    "triton",
    "tqdm",
    "tensorizer",
    "pynvml",
    "outlines",
    "xgrammar",
    "librosa",
    "soundfile",
    "gguf",
    "lark",
    "decord",
]
for mock_target in autodoc_mock_imports:
    if mock_target in sys.modules:
        logger.info(
            "Potentially problematic mock target (%s) found; "
            "autodoc_mock_imports cannot mock modules that have already "
            "been loaded into sys.modules when the sphinx build starts.",
            mock_target)
class MockedClassDocumenter(autodoc.ClassDocumenter):
    """Remove note about base class when a class is derived from object."""

    def add_line(self, line: str, source: str, *lineno: int) -> None:
        if line == " Bases: :py:class:`object`":
            return
        super().add_line(line, source, *lineno)
autodoc.ClassDocumenter = MockedClassDocumenter
intersphinx_mapping = {
    "python": ("https://docs.python.org/3", None),
    "typing_extensions":
    ("https://typing-extensions.readthedocs.io/en/latest", None),
    "aiohttp": ("https://docs.aiohttp.org/en/stable", None),
    "pillow": ("https://pillow.readthedocs.io/en/stable", None),
    "numpy": ("https://numpy.org/doc/stable", None),
    "torch": ("https://pytorch.org/docs/stable", None),
    "psutil": ("https://psutil.readthedocs.io/en/stable", None),
}
autodoc_preserve_defaults = True
autodoc_warningiserror = True
navigation_with_keys = False
(add-pipeline)=
# 🏗️ Adding a New Diffusion Pipeline
This guide explains how to implement a custom diffusion pipeline in FastVideo, leveraging the framework's modular architecture for high-performance video generation.
## Implementation Process Overview
1. **Port Required Modules** - Identify and implement necessary model components
2. **Create Directory Structure** - Set up pipeline files and folders
3. **Implement Pipeline Class** - Build the pipeline using existing or custom stages
4. **Register Your Pipeline** - Make it discoverable by the framework
5. **Configure Your Pipeline** - (Coming soon)
Need help? Join our [Slack community](https://join.slack.com/t/fastvideo/shared_invite/zt-2zf6ru791-sRwI9lPIUJQq1mIeB_yjJg).
## Step 1: Pipeline Modules
### Identifying Required Modules
FastVideo uses the Hugging Face Diffusers format for model organization:
1. Examine the `model_index.json` in the HF model repository:
```json
{
"_class_name": "WanImageToVideoPipeline",
"_diffusers_version": "0.33.0.dev0",
"image_encoder": ["transformers", "CLIPVisionModelWithProjection"],
"image_processor": ["transformers", "CLIPImageProcessor"],
"scheduler": ["diffusers", "UniPCMultistepScheduler"],
"text_encoder": ["transformers", "UMT5EncoderModel"],
"tokenizer": ["transformers", "T5TokenizerFast"],
"transformer": ["diffusers", "WanTransformer3DModel"],
"vae": ["diffusers", "AutoencoderKLWan"]
}
```
2. For each component:
- Note the originating library (`transformers` or `diffusers`)
- Identify the class name
- Check if it's already available in FastVideo
3. Review config files in each component's directory for architecture details
### Implementing Modules
Place new modules in the appropriate directories:
- Encoders: `fastvideo/v1/models/encoders/`
- VAEs: `fastvideo/v1/models/vaes/`
- Transformer models: `fastvideo/v1/models/dits/`
- Schedulers: `fastvideo/v1/models/schedulers/`
### Adapting Model Layers
#### Layer Replacements
Replace standard PyTorch layers with FastVideo optimized versions:
- `nn.LayerNorm` → `fastvideo.v1.layers.layernorm.RMSNorm`
- Embedding layers → `fastvideo.v1.layers.vocab_parallel_embedding` modules
- Activation functions → versions from `fastvideo.v1.layers.activation`
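As a rough sketch of what such a swap looks like (the `RMSNorm` constructor arguments below are an assumption; check `fastvideo/v1/layers/layernorm.py` for the real signature):
```python
import torch.nn as nn

from fastvideo.v1.layers.layernorm import RMSNorm


class MyBlock(nn.Module):

    def __init__(self, hidden_size: int):
        super().__init__()
        # Before: self.norm = nn.LayerNorm(hidden_size)
        # After: the FastVideo-optimized normalization layer
        self.norm = RMSNorm(hidden_size, eps=1e-6)  # eps kwarg is assumed

    def forward(self, x):
        return self.norm(x)
```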
#### Distributed Linear Layers
Use appropriate parallel layers for distribution:
```python
# Output dimension parallelism
from fastvideo.v1.layers.linear import ColumnParallelLinear

self.q_proj = ColumnParallelLinear(
    input_size=hidden_size,
    output_size=head_size * num_heads,
    bias=bias,
    gather_output=False
)

# Fused QKV projection
from fastvideo.v1.layers.linear import QKVParallelLinear

self.qkv_proj = QKVParallelLinear(
    hidden_size=hidden_size,
    head_size=attention_head_dim,
    total_num_heads=num_attention_heads,
    bias=True
)

# Input dimension parallelism
from fastvideo.v1.layers.linear import RowParallelLinear

self.out_proj = RowParallelLinear(
    input_size=head_size * num_heads,
    output_size=hidden_size,
    bias=bias,
    input_is_parallel=True
)
```
### Attention Layers
Replace standard attention with FastVideo's optimized attention:
```python
# Local attention patterns
from fastvideo.v1.attention import LocalAttention
from fastvideo.v1.attention.backends.abstract import _Backend

self.attn = LocalAttention(
    num_heads=num_heads,
    head_size=head_dim,
    dropout_rate=0.0,
    softmax_scale=None,
    causal=False,
    supported_attention_backends=(_Backend.FLASH_ATTN, _Backend.TORCH_SDPA)
)

# Distributed attention for long sequences
from fastvideo.v1.attention import DistributedAttention

self.attn = DistributedAttention(
    num_heads=num_heads,
    head_size=head_dim,
    dropout_rate=0.0,
    softmax_scale=None,
    causal=False,
    supported_attention_backends=(_Backend.SLIDING_TILE_ATTN, _Backend.FLASH_ATTN,
                                  _Backend.TORCH_SDPA)
)
```
#### Define supported backend selection
```python
_supported_attention_backends = (_Backend.FLASH_ATTN, _Backend.TORCH_SDPA)
```
### Registering Models
Register implemented modules in the model registry:
```python
# In fastvideo/v1/models/registry.py
_TEXT_TO_VIDEO_DIT_MODELS = {
    "YourTransformerModel": ("dits", "yourmodule", "YourTransformerClass"),
}

_VAE_MODELS = {
    "YourVAEModel": ("vaes", "yourvae", "YourVAEClass"),
}
```
## Step 2: Directory Structure
Create a new directory for your pipeline:
```
fastvideo/v1/pipelines/
├── your_pipeline/
│   ├── __init__.py
│   └── your_pipeline.py
```
## Step 3: Implement Pipeline Class
Pipelines are composed of stages, each handling a specific part of the diffusion process:
- **InputValidationStage**: Validates input parameters
- **Text Encoding Stages**: Handle text encoding (CLIP/Llama/T5)
- **CLIPImageEncodingStage**: Processes image inputs
- **TimestepPreparationStage**: Prepares diffusion timesteps
- **LatentPreparationStage**: Manages latent representations
- **ConditioningStage**: Processes conditioning inputs
- **DenoisingStage**: Performs denoising diffusion
- **DecodingStage**: Converts latents to pixels
### Creating Your Pipeline
```python
from typing import List

from fastvideo.v1.pipelines.composed_pipeline_base import ComposedPipelineBase
from fastvideo.v1.pipelines.stages import (
    InputValidationStage, CLIPTextEncodingStage, TimestepPreparationStage,
    LatentPreparationStage, DenoisingStage, DecodingStage
)
from fastvideo.v1.fastvideo_args import FastVideoArgs
from fastvideo.v1.pipelines.pipeline_batch_info import ForwardBatch


class MyCustomPipeline(ComposedPipelineBase):
    """Custom diffusion pipeline implementation."""

    # Define required model components from model_index.json
    _required_config_modules = [
        "text_encoder", "tokenizer", "vae", "transformer", "scheduler"
    ]

    @property
    def required_config_modules(self) -> List[str]:
        return self._required_config_modules

    def initialize_pipeline(self, fastvideo_args: FastVideoArgs):
        """Initialize pipeline-specific components."""
        pass

    def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
        """Set up pipeline stages with proper dependency injection."""
        self.add_stage(
            stage_name="input_validation_stage",
            stage=InputValidationStage()
        )
        self.add_stage(
            stage_name="prompt_encoding_stage",
            stage=CLIPTextEncodingStage(
                text_encoder=self.get_module("text_encoder"),
                tokenizer=self.get_module("tokenizer")
            )
        )
        self.add_stage(
            stage_name="timestep_preparation_stage",
            stage=TimestepPreparationStage(
                scheduler=self.get_module("scheduler")
            )
        )
        self.add_stage(
            stage_name="latent_preparation_stage",
            stage=LatentPreparationStage(
                scheduler=self.get_module("scheduler"),
                vae=self.get_module("vae")
            )
        )
        self.add_stage(
            stage_name="denoising_stage",
            stage=DenoisingStage(
                transformer=self.get_module("transformer"),
                scheduler=self.get_module("scheduler")
            )
        )
        self.add_stage(
            stage_name="decoding_stage",
            stage=DecodingStage(
                vae=self.get_module("vae")
            )
        )


# Register the pipeline class
EntryClass = MyCustomPipeline
```
### Creating Custom Stages (Optional)
If existing stages don't meet your needs, create custom ones:
```python
from fastvideo.v1.fastvideo_args import FastVideoArgs
from fastvideo.v1.pipelines.pipeline_batch_info import ForwardBatch
from fastvideo.v1.pipelines.stages.base import PipelineStage


class MyCustomStage(PipelineStage):
    """Custom processing stage for the pipeline."""

    def __init__(self, custom_module, other_param=None):
        super().__init__()
        self.custom_module = custom_module
        self.other_param = other_param

    def forward(self, batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch:
        # Access input data
        input_data = batch.some_attribute

        # Validate inputs
        if input_data is None:
            raise ValueError("Required input is missing")

        # Process with your module
        result = self.custom_module(input_data)

        # Update batch with results
        batch.some_output = result
        return batch
```
Add your custom stage to the pipeline:
```python
self.add_stage(
    stage_name="my_custom_stage",
    stage=MyCustomStage(
        custom_module=self.get_module("custom_module"),
        other_param="some_value"
    )
)
```
#### Stage Design Principles
1. **Single Responsibility**: Focus on one specific task
2. **Functional Pattern**: Receive and return a `ForwardBatch` object
3. **Dependency Injection**: Pass dependencies through constructor
4. **Input Validation**: Validate inputs for clear error messages
## Step 4: Register Your Pipeline
Define `EntryClass` at the end of your pipeline file:
```python
# Single pipeline class
EntryClass = MyCustomPipeline
# Or multiple pipeline classes
EntryClass = [MyCustomPipeline, MyOtherPipeline]
```
The registry will automatically:
1. Scan all packages under `fastvideo/v1/pipelines/`
2. Look for `EntryClass` variables
3. Register pipelines using their class names as identifiers
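A minimal sketch of how this kind of discovery can be done (illustrative only, not FastVideo's actual registry code):
```python
import importlib
import pkgutil

import fastvideo.v1.pipelines as pipelines_pkg

# Walk the subpackages, import each one, and collect any EntryClass it defines.
pipeline_registry = {}
for module_info in pkgutil.iter_modules(pipelines_pkg.__path__):
    module = importlib.import_module(
        f"{pipelines_pkg.__name__}.{module_info.name}")
    entry = getattr(module, "EntryClass", None)
    if entry is None:
        continue
    # EntryClass may be a single class or a list of classes.
    entries = entry if isinstance(entry, (list, tuple)) else [entry]
    for cls in entries:
        pipeline_registry[cls.__name__] = cls
```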
## Best Practices
- **Reuse Existing Components**: Leverage built-in stages and modules
- **Follow Module Organization**: Place new modules in appropriate directories
- **Match Model Patterns**: Follow existing code patterns and conventions
# 🐳 Using the FastVideo Docker Image
If you prefer a containerized development environment or want to avoid managing dependencies manually, you can use our prebuilt Docker image:
**Image:** [`ghcr.io/hao-ai-lab/fastvideo/fastvideo-dev:latest`](https://ghcr.io/hao-ai-lab/fastvideo/fastvideo-dev)
## Starting the container
```bash
docker run --gpus all -it ghcr.io/hao-ai-lab/fastvideo/fastvideo-dev:latest
```
This will:
- Start the container with GPU access
- Drop you into a shell with the `fastvideo-dev` Conda environment preconfigured
## Using the container
```bash
# Conda environment should already be active
# FastVideo package installed in editable mode
# Pull the latest changes from remote
cd /FastVideo
git pull
# Run linters and tests
pre-commit run --all-files
pytest tests/
```
(developer-env)=
# 🧰 Developer Environment
Accelerate your FastVideo development workflow by leveraging Docker images and cloud GPUs for efficient experimentation and reproducible environments.
:::{toctree}
:caption: Contents
:maxdepth: 1
docker
runpod
:::
(runpod)=
# 📦 Developing FastVideo on RunPod
You can easily use the FastVideo Docker image as a custom container on [RunPod](https://www.runpod.io) for development or experimentation.
## Creating a new pod
Choose a GPU that supports CUDA 12.4+.
![RunPod CUDA selection](../../_static/images/runpod_cuda.png)
When creating your pod template, use this image:
```
ghcr.io/hao-ai-lab/fastvideo/fastvideo-dev:latest
```
Paste the following into the Container Start Command field to enable SSH ([RunPod Docs](https://docs.runpod.io/pods/configuration/use-ssh)):
```bash
bash -c "apt update;DEBIAN_FRONTEND=noninteractive apt-get install openssh-server -y;mkdir -p ~/.ssh;cd $_;chmod 700 ~/.ssh;echo \"$PUBLIC_KEY\" >> authorized_keys;chmod 700 authorized_keys;service ssh start;sleep infinity"
```
![RunPod template configuration](../../_static/images/runpod_template.png)
After deploying, the pod will take a few minutes to pull the image and start the SSH service.
![RunPod ssh](../../_static/images/runpod_ssh.png)
## Working with the pod
After SSH'ing into your pod, you'll find the `fastvideo-dev` Conda environment already activated.
To pull in the latest changes from the GitHub repo:
```bash
cd /FastVideo
git pull
```
If you have a persistent volume and want to keep your code changes, you can move `/FastVideo` to `/workspace/FastVideo`, or simply clone the repository there.
Run your development workflows as usual:
```bash
# Run linters
pre-commit run --all-files
# Run tests
pytest tests/
```
(developer-overview)=
# 🛠️ Contributing to FastVideo
Thank you for your interest in contributing to FastVideo. We want to make the process as smooth as possible, and this guide will help you get started!
Our community is open to everyone and welcomes any contributions no matter how large or small.
## Developer Environment
Make sure you have CUDA 12.4 installed and supported. FastVideo currently only supports Linux and CUDA GPUs, but we hope to support other platforms in the future.
We recommend using a fresh Python 3.10 Conda environment to develop FastVideo:
Install Miniconda:
```bash
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
bash Miniconda3-latest-Linux-x86_64.sh
source ~/.bashrc
```
Create and activate a Conda environment for FastVideo:
```bash
conda create -n fastvideo python=3.10 -y
conda activate fastvideo
```
Clone the FastVideo repository and go to the FastVideo directory:
```bash
git clone https://github.com/hao-ai-lab/FastVideo.git && cd FastVideo
```
Now you can install FastVideo and set up the git hooks for linting. With `pre-commit` installed, the linters run automatically and must pass before you can make a commit.
```bash
pip install -e .[dev]
# Can also install flash-attn (optional)
pip install flash-attn==2.7.4.post1 --no-build-isolation
# Linting, formatting and static type checking
pre-commit install --hook-type pre-commit --hook-type commit-msg
# You can manually run pre-commit with
pre-commit run --all-files
# Unit tests
pytest tests/
```
# 🔍 FastVideo Overview
This document outlines FastVideo's architecture for developers interested in framework internals or contributions. It serves as an onboarding guide for new contributors by providing an overview of the most important directories and files within the `fastvideo/v1/` codebase.
## Table of Contents - V1 Directory Structure and Files
- [`fastvideo/v1/pipelines/`](#design-pipeline-system) - Core diffusion pipeline components
- [`fastvideo/v1/models/`](#design-model-components) - Model implementations
- [`dits/`](#design-transformer-models) - Transformer-based diffusion models
- [`vaes/`](#design-vae-variational-auto-encoder) - Variational autoencoders
- [`encoders/`](#design-text-and-image-encoders) - Text and image encoders
- [`schedulers/`](#design-schedulers) - Diffusion schedulers
- [`fastvideo/v1/attention/`](#design-optimized-attention) - Optimized attention implementations
- [`fastvideo/v1/distributed/`](#design-distributed-processing) - Distributed computing utilities
- [`fastvideo/v1/layers/`](#design-tensor-parallelism) - Custom neural network layers
- [`fastvideo/v1/platforms/`](#design-platforms) - Hardware platform abstractions
- [`fastvideo/v1/worker/`](#design-executor-and-worker-abstractions) - Multi-GPU process management
- [`fastvideo/v1/fastvideo_args.py`](#design-fastvideo-args) - Argument handling
- [`fastvideo/v1/forward_context.py`](#design-forwardcontext) - Forward pass context management
- `fastvideo/v1/utils.py` - Utility functions
- [`fastvideo/v1/logger.py`](#design-logger) - Logging infrastructure
## Core Architecture
FastVideo separates model components from execution logic with these principles:
- **Component Isolation**: Models (encoders, VAEs, transformers) are isolated from execution (pipelines, stages, distributed processing)
- **Modular Design**: Components can be independently replaced
- **Distributed Execution**: Supports various parallelism strategies (Tensor, Sequence)
- **Custom Attention Backends**: Components can support and use different Attention implementations
- **Pipeline Abstraction**: Consistent interface across diffusion models
(design-fastvideo-args)=
## FastVideoArgs
The `FastVideoArgs` class in `fastvideo/v1/fastvideo_args.py` serves as the central configuration system for FastVideo. It contains all parameters needed to control model loading, inference configuration, performance optimization settings, and more.
Key features include:
- **Command-line Interface**: Automatic conversion between CLI arguments and dataclass fields
- **Configuration Groups**: Organized by functional areas (model loading, video params, optimization settings)
- **Context Management**: Global access to current settings via `get_current_fastvideo_args()`
- **Parameter Validation**: Ensures valid combinations of settings
Common configuration areas:
- **Model paths and loading options**: `model_path`, `trust_remote_code`, `revision`
- **Distributed execution settings**: `num_gpus`, `tp_size`, `sp_size`
- **Video generation parameters**: `height`, `width`, `num_frames`, `num_inference_steps`
- **Precision settings**: Control computation precision for different components
Example usage:
```python
# Load arguments from command line
fastvideo_args = prepare_fastvideo_args(sys.argv[1:])

# Access parameters
model = load_model(fastvideo_args.model_path)

# Set as global context
with set_current_fastvideo_args(fastvideo_args):
    # Code that requires access to these arguments
    result = generate_video()
```
(design-pipeline-system)=
## Pipeline System
### `ComposedPipelineBase`
This foundational class provides:
- **Model Loading**: Automatically loads components from HuggingFace-Diffusers-compatible model directories
- **Stage Management**: Creates and orchestrates processing stages
- **Data Flow Coordination**: Ensures proper state flow between stages
```python
class MyCustomPipeline(ComposedPipelineBase):
    _required_config_modules = [
        "text_encoder", "tokenizer", "vae", "transformer", "scheduler"
    ]

    def initialize_pipeline(self, fastvideo_args: FastVideoArgs):
        # Pipeline-specific initialization
        pass

    def create_pipeline_stages(self, fastvideo_args: FastVideoArgs):
        self.add_stage("input_validation_stage", InputValidationStage())
        self.add_stage("text_encoding_stage", CLIPTextEncodingStage(
            text_encoder=self.get_module("text_encoder"),
            tokenizer=self.get_module("tokenizer")
        ))
        # Additional stages...
```
### Pipeline Stages
Each stage handles a specific diffusion process component:
- **Input Validation**: Parameter verification
- **Text Encoding**: CLIP, LLaMA, or T5-based encoding
- **Image Encoding**: Image input processing
- **Timestep & Latent Preparation**: Setup for diffusion
- **Denoising**: Core diffusion loop
- **Decoding**: Latent-to-pixel conversion
Each stage implements a standard interface:
```python
def forward(self, batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> ForwardBatch:
    # Process batch and update state
    return batch
```
(design-forwardbatch)=
### ForwardBatch
Defined in `fastvideo/v1/pipelines/pipeline_batch_info.py`, `ForwardBatch` encapsulates the data payload passed between pipeline stages. It typically holds:
- **Input Data**: Prompts, images, generation parameters
- **Intermediate State**: Embeddings, latents, timesteps, accumulated during stage execution
- **Output Storage**: Generated results and metadata
- **Configuration**: Sampling parameters, precision settings
This structure facilitates clear state transitions between stages.
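A rough sketch of how a stage typically interacts with the batch (the attribute names `prompt_embeds` and `latents` are illustrative; see `pipeline_batch_info.py` for the actual fields):
```python
def forward(self, batch: ForwardBatch,
            fastvideo_args: FastVideoArgs) -> ForwardBatch:
    # Read state accumulated by earlier stages (names are illustrative).
    prompt_embeds = batch.prompt_embeds
    latents = batch.latents

    # ... run the model owned by this stage on the intermediate state ...
    latents = self.transformer(latents, prompt_embeds)

    # Write the updated state back so later stages can consume it.
    batch.latents = latents
    return batch
```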
(design-model-components)=
## Model Components
The `fastvideo/v1/models/` directory contains implementations of the core neural network models used in video diffusion:
(design-transformer-models)=
### Transformer Models
Transformer networks perform the actual denoising during diffusion:
- **Location**: `fastvideo/v1/models/dits/`
- **Examples**:
- `WanTransformer3DModel`
- `HunyuanVideoTransformer3DModel`
Features include:
- Text/image conditioning
- Standardized interface for model-specific optimizations
```python
def forward(
    self,
    latents,                           # [B, T, C, H, W]
    encoder_hidden_states,             # Text embeddings
    timestep,                          # Current diffusion timestep
    encoder_hidden_states_image=None,  # Optional image embeddings
    **kwargs
):
    # Perform denoising computation
    return noise_pred  # Predicted noise residual
```
(design-vae-variational-auto-encoder)=
### VAE (Variational Auto-Encoder)
VAEs handle conversion between pixel space and latent space:
- **Location**: `fastvideo/v1/models/vaes/`
- **Examples**:
- `AutoencoderKLWan`
- `AutoencoderKLHunyuanVideo`
These models compress image/video data to a more efficient latent representation (typically 4x-8x smaller in each dimension).
FastVideo's VAE implementations include:
- Efficient video batch processing
- Memory optimization
- Optional tiling for large frames
- Distributed weight support
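For intuition, a quick sketch of the latent-shape arithmetic (the compression factors below are illustrative; the real values come from each VAE's config):
```python
# Rough latent-shape arithmetic for a video of shape [T, C, H, W].
spatial_factor = 8      # H and W each reduced ~8x (illustrative)
temporal_factor = 4     # many video VAEs also compress the time axis
latent_channels = 16    # model-specific

frames, height, width = 81, 480, 832
latent_shape = (
    (frames - 1) // temporal_factor + 1,  # 21 latent frames
    latent_channels,
    height // spatial_factor,             # 60
    width // spatial_factor,              # 104
)
print(latent_shape)  # (21, 16, 60, 104)
```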
(design-text-and-image-encoders)=
### Text and Image Encoders
Encoders process conditioning inputs into embeddings:
- **Location**: `fastvideo/v1/models/encoders/`
- **Text Encoders**:
- `CLIPTextModel`
- `LlamaModel`
- `UMT5EncoderModel`
- **Image Encoders**:
- `CLIPVisionModel`
FastVideo implements optimizations such as:
- Vocab parallelism for distributed processing
- Caching for common prompts
- Precision-tuned computation
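A rough sketch of the encoding flow inside a text-encoding stage (module handles follow the `get_module` pattern shown elsewhere; the tokenizer/encoder calls use the usual Hugging Face conventions, so treat the exact signatures as assumptions):
```python
import torch


def encode_prompt(self, batch, max_length: int = 512):
    # Tokenize the prompt (padding/truncation settings are illustrative).
    tokens = self.tokenizer(
        batch.prompt,
        padding="max_length",
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    )
    # Run the text encoder and stash the embeddings for later stages.
    with torch.no_grad():
        batch.prompt_embeds = self.text_encoder(tokens.input_ids)[0]
    return batch
```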
(design-schedulers)=
### Schedulers
Schedulers manage the diffusion sampling process:
- **Location**: `fastvideo/v1/models/schedulers/`
- **Examples**:
- `UniPCMultistepScheduler`
- `FlowMatchEulerDiscreteScheduler`
These components control:
- Diffusion timestep sequences
- Noise prediction to latent update conversions
- Quality/speed trade-offs
```python
def step(
    self,
    model_output: torch.Tensor,
    timestep: torch.LongTensor,
    sample: torch.Tensor,
    **kwargs
) -> torch.Tensor:
    # Process model output and update latents
    # Return updated latents
    return prev_sample
```
(design-optimized-attention)=
## Optimized Attention
The `fastvideo/v1/attention/` directory contains optimized attention implementations crucial for efficient video diffusion:
### Attention Backends
Multiple implementations with automatic selection:
- **FLASH_ATTN**: Optimized for supporting hardware
- **TORCH_SDPA**: Built-in PyTorch scaled dot-product attention
- **SLIDING_TILE_ATTN**: For very long sequences
```python
# Configure available attention backends for this layer
self.attn = LocalAttention(
    num_heads=num_heads,
    head_size=head_dim,
    causal=False,
    supported_attention_backends=(_Backend.FLASH_ATTN, _Backend.TORCH_SDPA)
)

# Override via environment variable
# export FASTVIDEO_ATTENTION_BACKEND=FLASH_ATTN
```
### Attention Patterns
Supports various patterns with memory optimization techniques:
- **Cross/Self/Temporal/Global-Local Attention**
- Chunking, progressive computation, optimized masking
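To illustrate the chunking idea in plain PyTorch (FastVideo's backends use fused kernels instead): process the query in slices so the full attention-score matrix is never materialized at once.
```python
import torch
import torch.nn.functional as F


def chunked_attention(q, k, v, chunk_size: int = 1024):
    """Memory-friendly non-causal attention computed over query chunks.

    q, k, v: [batch, heads, seq_len, head_dim]
    """
    outputs = []
    for start in range(0, q.shape[2], chunk_size):
        q_chunk = q[:, :, start:start + chunk_size]
        # Each query chunk attends over the full key/value sequence.
        outputs.append(F.scaled_dot_product_attention(q_chunk, k, v))
    return torch.cat(outputs, dim=2)
```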
(design-distributed-processing)=
## Distributed Processing
The `fastvideo/v1/distributed/` directory contains implementations for distributed model execution:
(design-tensor-parallelism)=
### Tensor Parallelism
Tensor parallelism splits model weights across devices:
- **Implementation**: Through `RowParallelLinear` and `ColumnParallelLinear` layers
- **Use cases**: Used by encoder models, whose shorter sequence lengths make sharding the weights efficient
```python
# Tensor-parallel layers in a transformer block
from fastvideo.v1.layers.linear import ColumnParallelLinear, RowParallelLinear

# Split along output dimension
self.qkv_proj = ColumnParallelLinear(
    input_size=hidden_size,
    output_size=3 * hidden_size,
    bias=True,
    gather_output=False
)

# Split along input dimension
self.out_proj = RowParallelLinear(
    input_size=hidden_size,
    output_size=hidden_size,
    bias=True,
    input_is_parallel=True
)
```
### Sequence Parallelism
Sequence parallelism splits sequences across devices:
- **Implementation**: Through `DistributedAttention` and sequence splitting
- **Use cases**: Long video sequences or high-resolution processing. Used by DiT models.
```python
# Distributed attention for long sequences
from fastvideo.v1.attention import DistributedAttention

self.attn = DistributedAttention(
    num_heads=num_heads,
    head_size=head_dim,
    causal=False,
    supported_attention_backends=(_Backend.SLIDING_TILE_ATTN, _Backend.FLASH_ATTN)
)
```
### Communication Primitives
Efficient communication primitives (AllGather, AllReduce, and synchronization mechanisms) minimize distributed overhead:
- **Sequence-Parallel AllGather**: Collects sequence chunks
- **Tensor-Parallel AllReduce**: Combines partial results
- **Distributed Synchronization**: Coordinates execution
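In terms of raw `torch.distributed` collectives, the two main patterns look roughly like this (FastVideo wraps them in its own process-group utilities; the sequence dimension below is an assumption):
```python
import torch
import torch.distributed as dist


def sequence_parallel_all_gather(local_chunk: torch.Tensor,
                                 world_size: int) -> torch.Tensor:
    # Each rank holds one chunk of the sequence; gather and re-concatenate.
    chunks = [torch.empty_like(local_chunk) for _ in range(world_size)]
    dist.all_gather(chunks, local_chunk)
    return torch.cat(chunks, dim=1)  # assumes dim 1 is the sequence dimension


def tensor_parallel_all_reduce(partial_output: torch.Tensor) -> torch.Tensor:
    # Each rank holds a partial matmul result; sum them across ranks.
    dist.all_reduce(partial_output, op=dist.ReduceOp.SUM)
    return partial_output
```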
(design-forwardcontext)=
## Forward Context Management
### ForwardContext
Defined in `fastvideo/v1/forward_context.py`, `ForwardContext` manages execution-specific state *within* a forward pass, particularly for low-level optimizations. It is accessed via `get_forward_context()`.
- **Attention Metadata**: Configuration for optimized attention kernels (`attn_metadata`)
- **Profiling Data**: Potential hooks for performance metrics collection
This context-based approach enables:
- Dynamic optimization based on execution state (e.g., attention backend selection)
- Step-specific customizations within model components
Usage example:
```python
with set_forward_context(current_timestep, attn_metadata, fastvideo_args):
    # During this forward pass, components can access context
    # through get_forward_context()
    output = model(inputs)
```
(design-executor-and-worker-abstractions)=
## Executor and Worker System
The `fastvideo/v1/worker/` directory contains the distributed execution framework:
### Executor Abstraction
FastVideo implements a flexible execution model for distributed processing:
- **Executor Base Class**: An abstract base class defining the interface for all executors
- **MultiProcExecutor**: Primary implementation that spawns and manages worker processes
- **GPU Workers**: Handle actual model execution on individual GPUs
The MultiProcExecutor implementation:
1. Spawns worker processes for each GPU
2. Establishes communication channels via pipes
3. Coordinates distributed operations across workers
4. Handles graceful startup and shutdown of the process group
Each GPU worker:
1. Initializes the distributed environment
2. Builds the pipeline for the specified model
3. Executes requested operations on its assigned GPU
4. Manages local resources and communicates results back to the executor
This design allows FastVideo to efficiently utilize multiple GPUs while providing a simple, unified interface for model execution.
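A toy version of the spawn-and-pipe pattern, using only the standard library (the real `MultiProcExecutor` adds distributed initialization, per-GPU device setup, and error handling):
```python
import multiprocessing as mp


def worker_loop(rank: int, conn) -> None:
    # A real worker would initialize torch.distributed and build the
    # pipeline for its assigned GPU here.
    while True:
        request = conn.recv()
        if request is None:        # shutdown signal
            break
        conn.send((rank, f"done: {request}"))


if __name__ == "__main__":
    world_size = 2
    pipes, procs = [], []
    for rank in range(world_size):
        parent_conn, child_conn = mp.Pipe()
        proc = mp.Process(target=worker_loop, args=(rank, child_conn))
        proc.start()
        pipes.append(parent_conn)
        procs.append(proc)

    for conn in pipes:             # broadcast a request to every worker
        conn.send("generate_video")
    print([conn.recv() for conn in pipes])

    for conn, proc in zip(pipes, procs):
        conn.send(None)            # graceful shutdown
        proc.join()
```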
(design-platforms)=
## Platforms
The `fastvideo/v1/platforms/` directory provides hardware platform abstractions that enable FastVideo to run efficiently on different hardware configurations:
### Platform Abstraction
FastVideo's platform abstraction layer enables:
- **Hardware Detection**: Automatic detection of available hardware
- **Backend Selection**: Appropriate selection of compute kernels
- **Memory Management**: Efficient utilization of hardware-specific memory features
The primary components include:
- **Platform Interface**: Defines the common API for all platform implementations
- **CUDA Platform**: Optimized implementation for NVIDIA GPUs
- **Backend Enum**: Used throughout the codebase for feature selection
Usage example:
```python
from fastvideo.v1.platforms import current_platform, _Backend

# Check hardware capabilities
if current_platform.supports_backend(_Backend.FLASH_ATTN):
    ...  # Use the FlashAttention implementation
else:
    ...  # Fall back to a standard implementation
```
The platform system is designed to be extensible for future hardware targets.
(design-logger)=
## Logger
See [PR](https://github.com/hao-ai-lab/FastVideo/pull/356)
*TODO*: (help wanted) Add an environment variable that disables process-aware logging.
## Contributing to FastVideo
If you're a new contributor, here are some common areas to explore:
1. **Adding a new model**: Implement new model types in the appropriate subdirectory of `fastvideo/v1/models/`
2. **Optimizing performance**: Look at attention implementations or memory management
3. **Adding a new pipeline**: Create a new pipeline subclass in `fastvideo/v1/pipelines/`
4. **Hardware support**: Extend the `platforms` module for new hardware targets
When adding code, follow these practices:
- Use type hints for better code readability
- Add appropriate docstrings
- Maintain the separation between model components and execution logic
- Follow existing patterns for distributed processing
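For example, a small utility written in the expected style (purely illustrative, including the scaling value):
```python
import torch


def normalize_latents(latents: torch.Tensor, scale: float = 0.18215) -> torch.Tensor:
    """Scale raw VAE latents before feeding them to the diffusion model.

    Args:
        latents: Latent tensor of shape [B, T, C, H, W].
        scale: Model-specific scaling factor (the default here is illustrative).

    Returns:
        The scaled latent tensor.
    """
    return latents * scale
```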
(fastvideo-installation)=
# 🔧 Installation
FastVideo currently only supports Linux and NVIDIA CUDA GPUs.
FastVideo has been tested on the following GPUs, but it should work on any GPU that supports CUDA 12.4+. Please create an issue if you run into problems:
- RTX 4090
- A40
- L40S
- A100
- H100
## Requirements
- OS: Linux
- Python: 3.10-3.12
- CUDA 12.4+
## Installation Options
### Option 1: Quick Install
```bash
pip install fastvideo
```
### Option 2: Installation from Source
We recommend using a Python environment such as Conda.
#### 1. [Optional] Install Miniconda (if not already installed)
```bash
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
bash Miniconda3-latest-Linux-x86_64.sh
source ~/.bashrc
```
#### 2. [Optional] Create and activate a Conda environment for FastVideo
```bash
conda create -n fastvideo python=3.10 -y
conda activate fastvideo
```
#### 3. Clone the FastVideo repository
```bash
git clone https://github.com/hao-ai-lab/FastVideo.git && cd FastVideo
```
#### 4. Install FastVideo
Basic installation:
```bash
pip install -e .
```
## Optional Dependencies
### Flash Attention
```bash
pip install flash-attn==2.7.4.post1 --no-build-isolation
```
### Sliding Tile Attention (STA) (Requires CUDA 12.4+ and H100)
To try Sliding Tile Attention (optional), please follow the instructions in [csrc/sliding_tile_attention/README.md](#sta-installation) to install STA.
## Development Environment Setup
If you're planning to contribute to FastVideo please see the following page:
[Contributor Guide](#developer-overview)
## Hardware Requirements
### For Basic Inference
- NVIDIA GPU with CUDA support
- Minimum 20GB VRAM for quantized models (e.g., single RTX 4090)
### For LoRA Finetuning
- 40GB GPU memory each for 2 GPUs with LoRA
- 30GB GPU memory each for 2 GPUs with CPU offload and LoRA
### For Full Finetuning/Distillation
- Multiple high-memory GPUs recommended (e.g., H100)
## Troubleshooting
If you encounter any issues during installation, please open an issue on our [GitHub repository](https://github.com/hao-ai-lab/FastVideo).
You can also join our [Slack community](https://join.slack.com/t/fastvideo/shared_invite/zt-2zf6ru791-sRwI9lPIUJQq1mIeB_yjJg) for additional support.
(fastmochi)=
# FastMochi
```bash
# Download the model weight
python scripts/huggingface/download_hf.py --repo_id=FastVideo/FastMochi-diffusers --local_dir=data/FastMochi-diffusers --repo_type=model
# CLI inference
bash scripts/inference/inference_mochi_sp.sh
```